## Import packages

In [27]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

from bs4 import BeautifulSoup
import requests
import re

## Preprocessing

In [24]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [36]:
doc = nlp(ex)
displacy.render(doc, style="ent", jupyter=True)

### Web scrap an article

In [31]:
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [32]:
url = 'https://finance.yahoo.com/news/stock-market-news-today-dow-leads-stocks-higher-as-oil-falls-bitcoin-soars-210825601.html?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAALCe78ZHxIilUnxojf7gZl7Y2qpdNip8aqeZuLD8eeYMkggap8Ep8Fij12xUi9Op8vSWAbYG4IwbqDBGsW7DeXKVvVcaAjITMPlE8q8MGbj5uOxxEMqlo7jDHOhyfsrDN8MYJKiAfBRiY_mwF2fu8m8F6FePbEecKhMSqUYehp7G'
ny_bb = url_to_string(url)
article = nlp(ny_bb)
len(article.ents)

358

In [34]:
labels = [(x.text, x.label_) for x in article.ents]
Counter(labels)

Counter({('today', 'DATE'): 3,
         ('Dow', 'ORG'): 5,
         ('YF Chartbook Calendars Trending Tickers Stocks', 'ORG'): 1,
         ('Losers Top ETFs', 'PERSON'): 1,
         ('Highest Implied Volatility US Treasury Bonds Rates Currency Converter           News             Latest News From',
          'ORG'): 1,
         ('the Newsroom Stock Market News Earnings Politics Economic News Morning Brief Personal Finance News Crypto News Bidenomics Report Card           Videos             Yahoo Finance Invest Yahoo Finance Live ETF Report',
          'ORG'): 1,
         ('Rating Screener', 'PERSON'): 1,
         ('Smart Money Screener', 'PERSON'): 1,
         ('AdvertisementU.S.', 'GPE'): 1,
         ('EUR', 'ORG'): 1,
         ('-0.29%', 'PERCENT'): 1,
         ('10', 'CARDINAL'): 1,
         ('Bitcoin USD37,547.93+2,296.52', 'ORG'): 1,
         ('200779.38+36.86', 'PERCENT'): 1,
         ('FTSE', 'ORG'): 1,
         ('1007,486.91+46.44', 'DATE'): 1,
         ('Yahoo FinanceStock', '

In [25]:
doc = nlp(ex)
print([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


## Custom Train Finance Model

In [39]:
# pip install spacy[tranformers]

import json

#importing annotated data for training
with open('data.json', 'r') as f:
    data = json.load(f)

In [None]:
training_data = {'classes' : ['STOCK', "ORG", "PRODUCT", "DIRECTION"], 'annotations' : []}
for example in data['examples']:
  temp_dict = {}
  temp_dict['text'] = example['content']
  temp_dict['entities'] = []
  for annotation in example['annotations']:
    start = annotation['start']
    end = annotation['end']
    label = annotation['tag_name'].upper()
    temp_dict['entities'].append((start, end, label))
  training_data['annotations'].append(temp_dict)

In [None]:
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

nlp = spacy.blank('en')
doc_bin = DocBin()

for training_example  in tqdm(training_data['annotations']): 
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("training_data.spacy") # save the docbin object

In [None]:
# python -m spacy init fill-config base_config.cfg config.cfg
# python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy --gpu-id 0

## Load new model

In [None]:
nlp_ner = spacy.load("model-best")
text = ''
doc = nlp_ner(text)
labels = [(x.text, x.label_) for x in doc.ents]

In [None]:
## Next steps
### 1. store the values and labels in a database / dataframe
### 2. Can be used for trading bot