In [1]:
!pip install spacy # instaqll spacy
!python -m spacy download en_core_web_sm
import spacy # import dictionary
nlp = spacy.load('en_core_web_sm') # load language processor
from spacy.pipeline import EntityRuler

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### Load the Dataset:

In [None]:
import pandas as pd # import pandas to for dataframe
df = pd.read_csv('stocks-1.tsv',sep='\t') # create dataframe from tsv file - not csv

In [None]:
df # return dataframe

Unnamed: 0,Symbol,CompanyName,Industry,MarketCap
0,A,Agilent Technologies,Life Sciences Tools & Services,53.65B
1,AA,Alcoa,Metals & Mining,9.25B
2,AAC,Ares Acquisition,Shell Companies,1.22B
3,AACG,ATA Creativity Global,Diversified Consumer Services,90.35M
4,AADI,Aadi Bioscience,Pharmaceuticals,104.85M
...,...,...,...,...
5874,ZWRK,Z-Work Acquisition,Shell Companies,278.88M
5875,ZY,Zymergen,Chemicals,1.31B
5876,ZYME,Zymeworks,Biotechnology,1.50B
5877,ZYNE,Zynerba Pharmaceuticals,Pharmaceuticals,184.39M


### Extract Data for Patterns:

In [None]:
unique_symbols = df['Symbol'].unique() # create list of unique stock symbols
unique_companynames = df['CompanyName'].unique() # create list of unique company names
patterns = [] # create empty list of patterns
for Symbol in unique_symbols: # iterates over unique elements in list of symbols
    symbol_pattern = {"label":'SYMBOL', "pattern": [{"text": Symbol}]} # establishes pattern from the dataframe, the POS tag for each symbol is "SYMBOL"
    patterns.append(symbol_pattern) # adds first pattern to originally empty list
for CompanyName in unique_companynames: # iterates over unique elements in list of company names
    company_pattern = {"label":'ORG', "pattern": [{"text": word} for word in CompanyName.split()]} # establishes pattern from the dataframe, the POS tag for each company name is "ORG"
    patterns.append(company_pattern) # adds second pattern to patterns list

### Create an Entity Ruler:

In [None]:
company_ruler = nlp.add_pipe("entity_ruler", before="ner") # creates entity ruler which runs new patterns before established EntityRuler (such that stock symbols are reported as stock symbols, not ORGs)
company_ruler.add_patterns(patterns) # adds new patterns to EntityRuler

### Test the Entity Ruler:

In [None]:
text1 = "Helmerich & Payne (HP) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Energy Equipment & Services sector. In contrast, Check-Cap (CHEK) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions. Meanwhile, Vallon Pharmaceuticals (VLON) gained 0.8% after strong quarterly earnings, outperforming its peers in the Biotechnology space. Sequans Communications (SQNS) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Semiconductors & Semiconductor Equipment industry." # assigns text
doc1 = nlp(text1) # converts text to a spacy object Dic which can be analyzed for entities
doc1
for ent in doc1.ents:
    print(ent.text, ent.label_) # returns the label/POS tag for every entity, as well as all of the entities themselves

Helmerich & Payne ORG
HP SYMBOL
1.5% PERCENT
the Energy Equipment & Services ORG
Check-Cap PERSON
CHEK SYMBOL
2.3% PERCENT
Vallon Pharmaceuticals ORG
VLON SYMBOL
0.8% PERCENT
quarterly DATE
Biotechnology ORG
Sequans Communications ORG
SQNS SYMBOL
0.5% PERCENT
Semiconductors & Semiconductor Equipment ORG


In [None]:
# same process as above
text2 = "Aemetis (AMTX) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Ferro Corporation (FOE) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions. Meanwhile, RingCentral (RNG) gained 0.8% after strong quarterly earnings, outperforming its peers in the Software space. ACI Worldwide (ACIW) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Software industry."
doc2 = nlp(text2)
for ent in doc2.ents:
    print(ent.text, ent.label_)

Aemetis ORG
AMTX SYMBOL
1.5% PERCENT
the Oil, Gas & Consumable Fuels ORG
Ferro Corporation ORG
FOE SYMBOL
2.3% PERCENT
RingCentral ORG
RNG SYMBOL
0.8% PERCENT
quarterly DATE
Software ORG
ACI Worldwide ORG
ACIW SYMBOL
0.5% PERCENT
Software ORG


In [8]:
# same process as above
text3 = "On a mixed trading day, Par Pacific Holdings (PARR) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Nano Dimension (NNDM) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions. Meanwhile, Beyond Meat (BYND) gained 0.8% after strong quarterly earnings, outperforming its peers in the Food Products space. Apollo Investment (AINV) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Capital Markets industry."
doc3 = nlp(text3)
for ent in doc3.ents:
    print(ent.text, ent.label_)

Par Pacific Holdings ORG
PARR SYMBOL
1.5% PERCENT
the Oil, Gas & Consumable Fuels ORG
Nano Dimension ORG
NNDM SYMBOL
2.3% PERCENT
Beyond Meat ORG
BYND SYMBOL
0.8% PERCENT
quarterly DATE
Food Products ORG
Apollo Investment ORG
AINV SYMBOL
0.5% PERCENT
Capital Markets ORG
