In [1]:
import spacy
import pandas as pd


In [3]:
df = pd.read_csv("stocks.tsv", delimiter="\t")

In [4]:
print(df.head())

  Symbol            CompanyName                        Industry MarketCap
0      A   Agilent Technologies  Life Sciences Tools & Services    53.65B
1     AA                  Alcoa                 Metals & Mining     9.25B
2    AAC       Ares Acquisition                 Shell Companies     1.22B
3   AACG  ATA Creativity Global   Diversified Consumer Services    90.35M
4   AADI        Aadi Bioscience                 Pharmaceuticals   104.85M


In [5]:
symbols = df.Symbol.tolist() # converte a coluna Symbol do dataframe df em uma lista chamada symbols
companies = df.CompanyName.tolist() # converte a coluna CompanyName do dataframe df em uma lista chamada companies
print(symbols[:10])

['A', 'AA', 'AAC', 'AACG', 'AADI', 'AAIC', 'AAL', 'AAMC', 'AAME', 'AAN']


In [42]:
# lista de palavras que devem ser ignoradas 
stops = ["two"]

nlp = spacy.blank("en")

ruler = nlp.add_pipe("entity_ruler") # adiciona um pipe de entity_ruler ao pipeline do spacy
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # letras do alfabeto para gerar padroes de simbolos de acoes
patterns = [] # lista de padroes para o entity ruler

# itera sobre os simbolos de acoes
for symbol in symbols:
    patterns.append({"label":"STOCK", "pattern": symbol})   # adiciona o simbolo como padrao para o rotulo "stock"    
    for l in letters:   # adiciona variacoes do simbolo com letra
        patterns.append({"label":"STOCK", "pattern": symbol+f".{l}"})

# itera sobre as empresas
for company in companies:
    if company not in stops:  # se a empresa nao estiver na lista de stops, adiciona como padrao para o rotulo "company"
        patterns.append({"label":"COMPANY", "pattern": company})
        
for index in indexes: # itera sobre os indices
    patterns.append({"label":"INDEX", "pattern": index})  # adiciona o indice como padrao para o rotulo "index"
    
    words = index.split()
    patterns.append({"label":"INDEX", "pattern": "".join(words[:2])})  # divide o nome do indice em palavras e cria um padrao abreviado 
    
for index in index_symbols:
    patterns.append({"label":"INDEX", "pattern": index}) 

for e in exchanges:
    patterns.append({"label": "STOCK_EXCHANGE", "pattern": e})

# adiciona os padroes ao entity ruler
ruler.add_patterns(patterns)

In [43]:
doc = nlp(text)
for ent in doc.ents: # itera sobre as entidades reconhecidas no doc
    print(ent.text, ent.label)  # imprime o texto da entidade e seu rotulo

Apple 14413459108638621831
ET 3286642859329048944
Dow Jones Industrial Average 17936770861984895101


In [17]:
text = '''
Report ad
Apple shares were set for their worst single-day fall since May this year, weighing on the NA

Sentiment also took a hit from Cleveland Federal Reserve Bank President Loretta Mester's comm

Investors have paid keen attention to the labor market and data hinting towards higher infla

The S&P 500 has risen around 19% so far this year on support from dovish central bank politie

Report ad
The three main U.S. indexes got some support on Friday from news of a phone call between U.S.

**At 1:02 p.m. ET, the Dow Jones Industrial Average (B2I) was up 12.24 points, or 0.04%, ● 34**
'''


In [19]:
from spacy import displacy

In [29]:
df2 = pd.read_csv("indexes.tsv", sep="\t")
df2

Unnamed: 0,IndexName,IndexSymbol
0,Dow Jones Industrial Average,DJIA
1,Dow Jones Transportation Average,DJT
2,Dow Jones Utility Average Index,DJU
3,NASDAQ 100 Index (NASDAQ Calculation),NDX
4,NASDAQ Composite Index,COMP
5,NYSE Composite Index,NYA
6,S&P 500 Index,SPX
7,S&P 400 Mid Cap Index,MID
8,S&P 100 Index,OEX
9,NASDAQ Computer Index,IXCO


In [30]:
indexes = df2.IndexName.tolist() # converte a coluna IndexName do dataframe df2 em uma lista
index_symbols = df2.IndexSymbol.tolist() # converte a coluna IndexSymbol do dataframe df2 em uma lista

In [36]:
df3 = pd.read_csv("stock_exchanges.tsv", sep = "\t")
df3

Unnamed: 0,BloombergExchangeCode,BloombergCompositeCode,Country,Description,ISOMIC,Google Prefix,EODcode,NumStocks
0,AF,AR,Argentina,Bolsa de Comercio de Buenos Aires,XBUE,,BA,12
1,AO,AU,Australia,National Stock Exchange of Australia,XNEC,,,1
2,AT,AU,Australia,Asx - All Markets,XASX,ASX,AU,875
3,AV,,Austria,Wiener Boerse Ag,XWBO,VIE,VI,38
4,BI,,Bahrain,Bahrain Bourse,XBAH,,,4
...,...,...,...,...,...,...,...,...
97,UR,US,USA,NASDAQ Capital Market,XNCM,NASDAQ,US,2209
98,UV,US,USA,OTC markets,OOTC,OTCMKTS,US,2433
99,UW,US,USA,NASDAQ Global Select,XNGS,NASDAQ,US,1768
100,VH,VN,Vietnam,Hanoi Stock Exchange,HSTC,,,4


In [27]:
doc = nlp(text)
#for ent in doc.ents:
#    print(ent.text, ent.label_)
displacy.render(doc, style = "ent")