# Stocks with spaCy

In [1]:
import spacy
import pandas as pd



In [3]:
df = pd.read_csv("stocks.tsv", sep='\t')
df.head()

Unnamed: 0,Symbol,CompanyName,Industry,MarketCap
0,A,Agilent Technologies,Life Sciences Tools & Services,53.65B
1,AA,Alcoa,Metals & Mining,9.25B
2,AAC,Ares Acquisition,Shell Companies,1.22B
3,AACG,ATA Creativity Global,Diversified Consumer Services,90.35M
4,AADI,Aadi Bioscience,Pharmaceuticals,104.85M


In [6]:
# convert to list
symbols = df.Symbol.tolist()
companies = df.CompanyName.tolist()

print(symbols[:5])
print(companies[:5])

['A', 'AA', 'AAC', 'AACG', 'AADI']
['Agilent Technologies', 'Alcoa', 'Ares Acquisition', 'ATA Creativity Global', 'Aadi Bioscience']


In [7]:
#source: https://www.reuters.com/business/futures-rise-after-biden-xi-call-oil-bounce-2021-09-10/
text = '''
Sept 10 (Reuters) - Wall Street's main indexes were subdued on Friday as signs of higher inflation and a drop in Apple shares following an unfavorable court ruling offset expectations of an easing in U.S.-China tensions.

Data earlier in the day showed U.S. producer prices rose solidly in August, leading to the biggest annual gain in nearly 11 years and indicating that high inflation was likely to persist as the pandemic pressures supply chains. read more .

"Today's data on wholesale prices should be eye-opening for the Federal Reserve, as inflation pressures still don't appear to be easing and will likely continue to be felt by the consumer in the coming months," said Charlie Ripley, senior investment strategist for Allianz Investment Management.

Apple Inc (AAPL.O) fell 2.7% following a U.S. court ruling in "Fortnite" creator Epic Games' antitrust lawsuit that stroke down some of the iPhone maker's restrictions on how developers can collect payments in apps.


Sponsored by Advertising Partner
Sponsored Video
Watch to learn more
Report ad
Apple shares were set for their worst single-day fall since May this year, weighing on the Nasdaq (.IXIC) and the S&P 500 technology sub-index (.SPLRCT), which fell 0.1%.

Sentiment also took a hit from Cleveland Federal Reserve Bank President Loretta Mester's comments that she would still like the central bank to begin tapering asset purchases this year despite the weak August jobs report. read more

Investors have paid keen attention to the labor market and data hinting towards higher inflation recently for hints on a timeline for the Federal Reserve to begin tapering its massive bond-buying program.

The S&P 500 has risen around 19% so far this year on support from dovish central bank policies and re-opening optimism, but concerns over rising coronavirus infections and accelerating inflation have lately stalled its advance.


Report ad
The three main U.S. indexes got some support on Friday from news of a phone call between U.S. President Joe Biden and Chinese leader Xi Jinping that was taken as a positive sign which could bring a thaw in ties between the world's two most important trading partners.

At 1:01 p.m. ET, the Dow Jones Industrial Average (.DJI) was up 12.24 points, or 0.04%, at 34,891.62, the S&P 500 (.SPX) was up 2.83 points, or 0.06%, at 4,496.11, and the Nasdaq Composite (.IXIC) was up 12.85 points, or 0.08%, at 15,261.11.

Six of the eleven S&P 500 sub-indexes gained, with energy (.SPNY), materials (.SPLRCM) and consumer discretionary stocks (.SPLRCD) rising the most.

U.S.-listed Chinese e-commerce companies Alibaba and JD.com , music streaming company Tencent Music (TME.N) and electric car maker Nio Inc (NIO.N) all gained between 0.7% and 1.4%


Report ad
Grocer Kroger Co (KR.N) dropped 7.1% after it said global supply chain disruptions, freight costs, discounts and wastage would hit its profit margins.

Advancing issues outnumbered decliners by a 1.12-to-1 ratio on the NYSE and by a 1.02-to-1 ratio on the Nasdaq.

The S&P index recorded 14 new 52-week highs and three new lows, while the Nasdaq recorded 49 new highs and 38 new lows.
'''

## Add Companies

In [13]:
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

patterns = []
#List of Entities and Patterns
for company in companies:
    patterns.append({"label": "COMPANY", "pattern": company})
ruler.add_patterns(patterns)

print(len(patterns))

5879


In [14]:
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple COMPANY
Apple COMPANY
Apple COMPANY
Nasdaq COMPANY
two COMPANY
Nasdaq COMPANY
JD.com COMPANY
Kroger COMPANY
Nasdaq COMPANY
Nasdaq COMPANY


### remove company "two"

In [19]:
stop_words = ['two']
nlp = spacy.blank('en')
ruler = nlp.add_pipe("entity_ruler")

patterns = []
#List of Entities and Patterns
for company in companies:
    if company not in stop_words:
        patterns.append({'label':'COMPANY', 'pattern':company})
ruler.add_patterns(patterns)

print('# patterns:',len(patterns), '\n')
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

# patterns: 5878 

Apple COMPANY
Apple COMPANY
Apple COMPANY
Nasdaq COMPANY
Nasdaq COMPANY
JD.com COMPANY
Kroger COMPANY
Nasdaq COMPANY
Nasdaq COMPANY


## Add Sympols

In [23]:
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
stop_words = ['two']
nlp = spacy.blank('en')
ruler = nlp.add_pipe("entity_ruler")

patterns = []
#List of Entities and Patterns
for company in companies:
    if company not in stop_words:
        patterns.append({'label':'COMPANY', 'pattern':company})

for symbol in symbols:
    patterns.append({"label": "STOCK", "pattern": symbol})
    for l in letters:
        patterns.append({"label": "STOCK", "pattern": symbol+f".{l}"})

ruler.add_patterns(patterns)

print('# patterns:',len(patterns), '\n')
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

# patterns: 164611 

Apple COMPANY
Apple COMPANY
AAPL.O STOCK
Apple COMPANY
Nasdaq COMPANY
ET STOCK
Nasdaq COMPANY
JD.com COMPANY
TME.N STOCK
NIO.N STOCK
Kroger COMPANY
KR.N STOCK
Nasdaq COMPANY
Nasdaq COMPANY


## Add Stock Indexes

In [28]:
df2 = pd.read_csv("indexes.tsv", sep="\t")
df2

Unnamed: 0,IndexName,IndexSymbol
0,Dow Jones Industrial Average,DJIA
1,Dow Jones Transportation Average,DJT
2,Dow Jones Utility Average Index,DJU
3,NASDAQ 100 Index (NASDAQ Calculation),NDX
4,NASDAQ Composite Index,COMP
5,NYSE Composite Index,NYA
6,S&P 500 Index,SPX
7,S&P 400 Mid Cap Index,MID
8,S&P 100 Index,OEX
9,NASDAQ Computer Index,IXCO


In [31]:
# convert to list
indexes = df2.IndexName.tolist()
index_symbols = df2.IndexSymbol.tolist()

print(indexes[:3])
print(index_symbols[:5])

['Dow Jones Industrial Average', 'Dow Jones Transportation Average', 'Dow Jones Utility Average Index']
['DJIA', 'DJT', 'DJU', 'NDX', 'COMP']


### Add Stock Symbol indexes

In [32]:
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
stop_words = ['two']
nlp = spacy.blank('en')
ruler = nlp.add_pipe("entity_ruler")

patterns = []
#List of Entities and Patterns
for company in companies:
    if company not in stop_words:
        patterns.append({'label':'COMPANY', 'pattern':company})

for symbol in symbols:
    patterns.append({"label": "STOCK", "pattern": symbol})
    for l in letters:
        patterns.append({"label": "STOCK", "pattern": symbol+f".{l}"})

for index in index_symbols:
    patterns.append({"label": "INDEX", "pattern": symbol})

ruler.add_patterns(patterns)

print('# patterns:',len(patterns), '\n')
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

# patterns: 164625 

Apple COMPANY
Apple COMPANY
AAPL.O STOCK
Apple COMPANY
Nasdaq COMPANY
ET STOCK
Nasdaq COMPANY
JD.com COMPANY
TME.N STOCK
NIO.N STOCK
Kroger COMPANY
KR.N STOCK
Nasdaq COMPANY
Nasdaq COMPANY


### Add Stock indexes

In [33]:
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
stop_words = ['two']
nlp = spacy.blank('en')
ruler = nlp.add_pipe("entity_ruler")

patterns = []
#List of Entities and Patterns
for company in companies:
    if company not in stop_words:
        patterns.append({'label':'COMPANY', 'pattern':company})

for symbol in symbols:
    patterns.append({"label": "STOCK", "pattern": symbol})
    for l in letters:
        patterns.append({"label": "STOCK", "pattern": symbol+f".{l}"})

for index in index_symbols:
    patterns.append({"label": "INDEX", "pattern": symbol})

for index in indexes:
    patterns.append({"label": "INDEX", "pattern": index})

ruler.add_patterns(patterns)

print('# patterns:',len(patterns), '\n')
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

# patterns: 164639 

Apple COMPANY
Apple COMPANY
AAPL.O STOCK
Apple COMPANY
Nasdaq COMPANY
ET STOCK
Dow Jones Industrial Average INDEX
Nasdaq COMPANY
JD.com COMPANY
TME.N STOCK
NIO.N STOCK
Kroger COMPANY
KR.N STOCK
Nasdaq COMPANY
Nasdaq COMPANY


#### Add Stock indexes versions

In [34]:
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
stop_words = ['two']
nlp = spacy.blank('en')
ruler = nlp.add_pipe("entity_ruler")

patterns = []
#List of Entities and Patterns
for company in companies:
    if company not in stop_words:
        patterns.append({'label':'COMPANY', 'pattern':company})

for symbol in symbols:
    patterns.append({"label": "STOCK", "pattern": symbol})
    for l in letters:
        patterns.append({"label": "STOCK", "pattern": symbol+f".{l}"})

for index in index_symbols:
    patterns.append({"label": "INDEX", "pattern": symbol})

for index in indexes:
    patterns.append({"label": "INDEX", "pattern": index})
    versions = []
    words = index.split()
    caps = []
    for word in words:
        word = word.lower().capitalize()
        caps.append(word)
    versions.append(" ".join(caps))
    versions.append(words[0])
    versions.append(caps[0])
    versions.append(" ".join(caps[:2]))
    versions.append(" ".join(words[:2]))
    for version in versions:
        if version != "NYSE":
            patterns.append({"label": "INDEX", "pattern": version})

ruler.add_patterns(patterns)

print('# patterns:',len(patterns), '\n')
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

# patterns: 164707 

Apple COMPANY
Apple COMPANY
AAPL.O STOCK
Apple COMPANY
Nasdaq COMPANY
S&P 500 INDEX
S&P 500 INDEX
ET STOCK
Dow Jones Industrial Average INDEX
S&P 500 INDEX
Nasdaq Composite INDEX
S&P 500 INDEX
JD.com COMPANY
TME.N STOCK
NIO.N STOCK
Kroger COMPANY
KR.N STOCK
Nasdaq INDEX
S&P INDEX
Nasdaq INDEX


## Descriptions

In [36]:
df3 = pd.read_csv("stock_exchanges.tsv", sep="\t")
df3

Unnamed: 0,BloombergExchangeCode,BloombergCompositeCode,Country,Description,ISOMIC,Google Prefix,EODcode,NumStocks
0,AF,AR,Argentina,Bolsa de Comercio de Buenos Aires,XBUE,,BA,12
1,AO,AU,Australia,National Stock Exchange of Australia,XNEC,,,1
2,AT,AU,Australia,Asx - All Markets,XASX,ASX,AU,875
3,AV,,Austria,Wiener Boerse Ag,XWBO,VIE,VI,38
4,BI,,Bahrain,Bahrain Bourse,XBAH,,,4
...,...,...,...,...,...,...,...,...
97,UR,US,USA,NASDAQ Capital Market,XNCM,NASDAQ,US,2209
98,UV,US,USA,OTC markets,OOTC,OTCMKTS,US,2433
99,UW,US,USA,NASDAQ Global Select,XNGS,NASDAQ,US,1768
100,VH,VN,Vietnam,Hanoi Stock Exchange,HSTC,,,4


In [38]:
# convert to list
exchanges = df3.ISOMIC.tolist()+df3["Google Prefix"].tolist()
descriptions = df3.Description.tolist()

print(exchanges[:5])
print(descriptions[:3])

['XBUE', 'XNEC', 'XASX', 'XWBO', 'XBAH']
['Bolsa de Comercio de Buenos Aires', 'National Stock Exchange of Australia', 'Asx - All Markets']


### Add Stock Exchange

In [40]:
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
stop_words = ['two']
nlp = spacy.blank('en')
ruler = nlp.add_pipe("entity_ruler")

patterns = []
#List of Entities and Patterns
for company in companies:
    if company not in stop_words:
        patterns.append({'label':'COMPANY', 'pattern':company})

for symbol in symbols:
    patterns.append({"label": "STOCK", "pattern": symbol})
    for l in letters:
        patterns.append({"label": "STOCK", "pattern": symbol+f".{l}"})

for index in index_symbols:
    patterns.append({"label": "INDEX", "pattern": symbol})

for index in indexes:
    patterns.append({"label": "INDEX", "pattern": index})
    versions = []
    words = index.split()
    caps = []
    for word in words:
        word = word.lower().capitalize()
        caps.append(word)
    versions.append(" ".join(caps))
    versions.append(words[0])
    versions.append(caps[0])
    versions.append(" ".join(caps[:2]))
    versions.append(" ".join(words[:2]))
    for version in versions:
        if version != "NYSE":
            patterns.append({"label": "INDEX", "pattern": version})

for d in descriptions:
    patterns.append({"label": "STOCK_EXCHANGE", "pattern": d})
for e in exchanges:
    patterns.append({"label": "STOCK_EXCHANGE", "pattern": e})

ruler.add_patterns(patterns)

print('# patterns:',len(patterns), '\n')
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

# patterns: 165013 

Apple COMPANY
Apple COMPANY
AAPL.O STOCK
Apple COMPANY
Nasdaq COMPANY
S&P 500 INDEX
S&P 500 INDEX
ET STOCK
Dow Jones Industrial Average INDEX
S&P 500 INDEX
Nasdaq Composite INDEX
S&P 500 INDEX
JD.com COMPANY
TME.N STOCK
NIO.N STOCK
Kroger COMPANY
KR.N STOCK
NYSE STOCK_EXCHANGE
Nasdaq INDEX
S&P INDEX
Nasdaq INDEX


In [41]:
#source: https://www.reuters.com/companies/AAPL.O
text2 = '''
Apple Inc. designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessories, and sells a variety of related services. The Company’s products include iPhone, Mac, iPad, and Wearables, Home and Accessories. iPhone is the Company’s line of smartphones based on its iOS operating system. Mac is the Company’s line of personal computers based on its macOS operating system. iPad is the Company’s line of multi-purpose tablets based on its iPadOS operating system. Wearables, Home and Accessories includes AirPods, Apple TV, Apple Watch, Beats products, HomePod, iPod touch and other Apple-branded and third-party accessories. AirPods are the Company’s wireless headphones that interact with Siri. Apple Watch is the Company’s line of smart watches. Its services include Advertising, AppleCare, Cloud Services, Digital Content and Payment Services. Its customers are primarily in the consumer, small and mid-sized business, education, enterprise and government markets.
'''

In [42]:
doc2 = nlp(text2)
for ent in doc2.ents:
    print(ent.text, ent.label_)

Apple COMPANY
Apple COMPANY
TV STOCK
Apple COMPANY
Apple COMPANY
Apple COMPANY
