In [26]:
import pandas as pd
import spacy
from spacy.lang.en import English

In [27]:
df_wsb = pd.read_csv('r_wsb.csv')

In [28]:
for i in df_wsb['title']:
    print(i)

RH sent me a BS email back about the support ticket I sent them a while ago
TBT 6/18 20c
Start of commodity super cycle - $MT, $FCX, $XOM and many others now boarding the 🚀
WSBGod - Searching For The Truth
$UBS lazy Saturday morning half-ass DD
⚡️TSLA GANG ⚡️Double Bottom &amp; Possible Breakout BTFD 🚀 ☀️
WEEDSTOCKS WENT BURR
Wall Street Week Ahead for the trading week beginning February 15th, 2021
NOT EVERY PLAY IS A SHORT SQUEEZE PLAY 🚀🚀 🦍 🦍 🌕🌕
It wasn't just RH, all brokerage firms did the same thing.
The stock market is a pendulum
Let's talk about LoanDepot (LDi) and it's potential upside movement in the next few weeks.
DD: Cloudflare (NET) is going to continue its strong outperformance. Buy the dip
GME; when diamonds break (missing another 25k loss expired last week)
COTY dd - Makeup Never Dies
Most Anticipated Earnings Releases for the week beginning February 15th, 2021
You might consider entering $ATVI
AMD and Intel DD
Summary of all $BB (BlackBerry) fundamentals/news
UWMC: a po

In [29]:
# Add only the entity recognizer to the new blank pipeline
nlp = spacy.blank("en")
ruler = nlp.create_pipe('entity_ruler')
nlp.add_pipe(ruler)

patterns = [{'label':'ORG','id':'BB','pattern':'BB'},
           {'label':'ORG','id':'BB','pattern':'$BB'},
           {'label':'ORG','id':'BB','pattern':'Blackberry'},
           {'label':'ORG','id':'GME','pattern':'GME'},
           {'label':'ORG','id':'GME','pattern':'$GME'},
            {'label':'ORG','id':'GME','pattern':[{'lower':'gme'}]},
           {'label':'ORG','id':'GME','pattern':[{'lower':'gamestop'}]},
           {'label':'ORG','id':'GME','pattern':[{'lower':'game'},{'lower':'stop'}]},
           {'label':'ORG','id':'RTX','pattern':[{'lower':'raytheon'}]},
           {'label':'ORG','id':'NIO','pattern':'NIO'},
           {'label':'ORG','id':'AAPL','pattern':'AAPL'},
           {'label':'ORG','id':'BP','pattern':'BP'},
           {'label':'ORG','id':'BEAM','pattern':'BEAM'},
           {'label':'ORG','id':'POWW','pattern':'POWW'},
           {'label':'ORG','id':'POWW','pattern':[{'lower':'ammo munitions'}]},
           {'label':'ORG','id':'SPY','pattern':'SPY'},
           {'label':'ORG','id':'QQQ','pattern':'QQQ'},
           {'label':'ORG','id':'BB','pattern':'BB'},
           {'label':'ORG','id':'ATVI','pattern':[{'lower':'activision'}]},
           {'label':'ORG','id':'AMC','pattern':'AMC'},
           {'label':'ORG','id':'AMC','pattern':'$AMC'},
           {'label':'PERSON','id':'Elon_Musk','pattern':[{'lower':'elon musk'}]},
           {'label':'PERSON','id':'Elon_Musk','pattern':[{'lower':'elon'}]},
            {'label':'ORG','id':'Robinhood','pattern':[{'lower':'robinhood'}]},
           {'label':'ORG','id':'Melvin_Capital','pattern':[{'lower':'melvin'}]},
           {'label':'ORG','id':'Melvin_Capital','pattern':[{'lower':'melvin capital',}]}]
                     
ruler.add_patterns(patterns)

In [30]:
df_out_wsb = pd.DataFrame()

for item in df_wsb['title']:
    df_out_wsb = df_out_wsb.append([[item,[(ent.text, ent.label_, ent.ent_id_) for ent in nlp(item).ents]]],ignore_index=True)

In [31]:
doc = nlp(str(list(df_wsb['title'])))
set([ent.ent_id_ for ent in doc.ents])

{'AMC', 'BB', 'GME', 'Melvin_Capital', 'Robinhood'}

In [32]:
# Automating NER
# The idea : we can use the manually annotated examples to train the model to generalise
train_data = []

for i in df_wsb['title']:
    doc = nlp(i)
    if doc.ents == ():# or 'PERSON' in [ent.label_ for ent in doc.ents]:
        continue
    else:
        train_data.append((doc.text, {'entities':[(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}))

In [33]:
train_data

[('GME; when diamonds break (missing another 25k loss expired last week)',
  {'entities': [(0, 3, 'ORG')]}),
 ('Summary of all $BB (BlackBerry) fundamentals/news',
  {'entities': [(15, 18, 'ORG')]}),
 ('Here’s to all the Karen’s in the world. GME 20K to 1.35 million.',
  {'entities': [(40, 43, 'ORG')]}),
 ('Autist Rich Asian: Ape bought $10M of GME near its peak ($8M Loss)',
  {'entities': [(38, 41, 'ORG')]}),
 ('A Good Point About the Squeeze and GME', {'entities': [(35, 38, 'ORG')]}),
 ('GME gains! Still holding my shares invested 10 thousand into GME calls, sold them and bought shares. Still holding🙌🏽💎🚀🚀',
  {'entities': [(0, 3, 'ORG'), (61, 64, 'ORG')]}),
 ('“What to do if you lost money on GameStop, AMC, or other ‘meme stocks’ “',
  {'entities': [(33, 41, 'ORG'), (43, 46, 'ORG')]}),
 ('DFV vs Melvin and Robinhood live',
  {'entities': [(7, 13, 'ORG'), (18, 27, 'ORG')]}),
 ('Why no GME daily?', {'entities': [(7, 10, 'ORG')]}),
 ('People thinking there going to find the next GME and

In [34]:
# Define blank model
nlp_new = spacy.blank("en")
nlp_new.add_pipe(nlp.create_pipe("ner"))
optimizer = nlp_new.begin_training()

ner = nlp_new.get_pipe("ner")

In [35]:
# Add labels to the ner
for text, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp_new.pipe_names if pipe not in pipe_exceptions]

In [36]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# Training the model
with nlp_new.disable_pipes(*unaffected_pipes):

  # Training for 100 iterations
  for iteration in range(100):

    # shuffling examples  before every iteration
    random.shuffle(train_data)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp_new.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    sgd=optimizer,
                    drop=0.1,  # dropout - make it harder to memorise data
                    losses=losses,
                )
    if iteration % 5 == 0:
        print("Losses", losses)

Losses {'ner': 160.5536470413208}
Losses {'ner': 10.942262220514143}
Losses {'ner': 3.0904119430763113}
Losses {'ner': 10.914622277471315}
Losses {'ner': 5.462289057420412}
Losses {'ner': 6.6673328495690924}
Losses {'ner': 2.622239871258939}
Losses {'ner': 3.7714145323942754}
Losses {'ner': 5.585490543630312}
Losses {'ner': 3.9166674931373255}
Losses {'ner': 4.834412276791437}
Losses {'ner': 6.250329318020694}
Losses {'ner': 5.916722903445263}
Losses {'ner': 9.156419811223525}
Losses {'ner': 4.762985384347395}
Losses {'ner': 5.750000292260883}
Losses {'ner': 2.0518293408628687}
Losses {'ner': 3.166668680058284}
Losses {'ner': 2.000040887850652}
Losses {'ner': 4.282306153280359}


In [37]:
# Testing the model
doc = nlp_new("A new retail stock trading app called Robinhood was launched")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Robinhood', 'ORG')]


In [38]:
# Check for one entity that is not in corpus
doc = nlp_new('BMW; when diamonds break (missing another 25k loss expired last week)')
print("Entities", [(ruler.matcher.vocab.strings[ent.ent_id], ent.label_, ent.text) for ent in doc.ents])

Entities [('', 'ORG', 'BMW')]
