In [56]:
import loadArticles
import loadStockInfo
import textProcessing

import en_core_web_lg

import urllib.parse
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import string

from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer
from spacy.tokens import Doc
from spacy.pipeline import EntityRuler

In [3]:
#load stock info using meta and historical data
stockInfo = loadStockInfo.loadStockInfo('../../data/stock-market-dataset/stocks/', '../../data/stock-market-dataset/symbols_valid_meta.csv', DEBUG=True)
#stock_df = stockInfo.loadStockDf()
stock_meta_df = stockInfo.loadStockMetaDf()
#stock_meta_df = stock_meta_df[['Symbol', 'Security Name']]
stock_meta_df['Tag'] = 'ORG'

In [80]:
news_df = pd.read_csv('../../data/news/news_csv/news_2018_01.csv', index_col=0, nrows=10000)

In [81]:
news_df.head()

Unnamed: 0,title,published
0,EMERGING MARKETS-Mexican peso seesaws over dol...,2018-01-27
1,"Migrants must visit Nazi concentration camps, ...",2018-01-11
2,Euro zone businesses start 2018 on decade high,2018-01-25
3,Russia's Lavrov says 'unilateral actions' by U...,2018-01-22
4,Lawmakers to Justice Department: Keep online g...,2018-01-13


In [82]:
stock_meta_df.query('Symbol != "A"')[['Tag', 'Symbol']]

Unnamed: 0,Tag,Symbol
1,ORG,AA
2,ORG,AAAU
3,ORG,AACG
4,ORG,AADR
5,ORG,AAL
...,...,...
8044,ORG,ZUO
8045,ORG,ZVO
8046,ORG,ZYME
8047,ORG,ZYNE


In [83]:
nlp = en_core_web_lg.load()
ruler = EntityRuler(nlp)

#creating pattersn for entity ruler
temp_df1 = stock_meta_df.query('Symbol != "A"')[['Tag', 'Symbol']]
temp_df1.columns = ['label', 'pattern']
temp_df2 = stock_meta_df[['Tag', 'Name']]
temp_df2.columns = ['label', 'pattern']

temp_df = pd.concat([temp_df1, temp_df2])

#set patterns
patterns = temp_df.to_dict('records')
#add patterns to ruler
ruler.add_patterns(patterns)
#add ruler to nlp
nlp.add_pipe(ruler)

tp = textProcessing.textProcessing(nlp)

In [105]:
news_df.iloc[110]['title']

'Atlantic Coast Financial Corporation’s Fourth Quarter 2017 Net Loss of $0.04 Per Diluted Share Included $0.14 in Charges Related to Tax Act and Pending Merger'

In [109]:
str(news_df.iloc[110]['title']).encode("ascii", "ignore").decode("ascii").translate(str.maketrans(' ', ' ', string.punctuation)).replace(' ', '+').replace('’','')

'Atlantic+Coast+Financial+Corporations+Fourth+Quarter+2017+Net+Loss+of+004+Per+Diluted+Share+Included+014+in+Charges+Related+to+Tax+Act+and+Pending+Merger'

In [84]:
###add one day to articles to ensure all stock changes were occurences after article
#get entities from article headline
#create a new news_df with columns = "ticker", "headline", "date"
news_tickr_dict = {'ticker': [], 'title': [], 'date': []}
news_date_array = [tuple(x) for x in news_df[['title', 'published']].values]

In [110]:
count = 0
for title, date in news_date_array:
    #get entities
    entities = tp.getEntitiesSpacy(nlp, title)
    table = {}
    count+=1
    print(count, end='\r')
    #gets ticker symbol from entity name in headline
    for entity in entities:
        if entity in list(stock_meta_df['Symbol']):
#             print(f"{title} - {entity}")
            news_tickr_dict['ticker'].append(entity)
            news_tickr_dict['title'].append(title)
            news_tickr_dict['date'].append(date)
        elif entity in list(stock_meta_df['Name']):
            company_symbol = stock_meta_df[stock_meta_df['Name'] == entity]['Symbol'].iloc[0]
#             print(f"{title} - {company_symbol}")
            news_tickr_dict['ticker'].append(company_symbol)
            news_tickr_dict['title'].append(title)
            news_tickr_dict['date'].append(date)
        else:
            entity = str(entity).encode("ascii", "ignore")\
                                .decode("ascii")\
                                .translate(str.maketrans(' ', ' ', string.punctuation))\
                                .replace(' ', '+')\
                                .replace('’','')
#             print(entity)
            market_watch_link = f'https://www.marketwatch.com/tools/quotes/lookup.asp?siteID=mktw&Lookup={entity}&Country=all&Type=Stock'

            req = Request(url=market_watch_link,headers={'user-agent': 'my-app/0.0.1'})
            response = urlopen(req)
            # Read the contents of the file into 'html'
            html = BeautifulSoup(response, 'html.parser')
            # Find 'results table' in the Soup and load it into 'symbol table'
            table = html.find("div", class_="results")

            #filter through web results to find symbol/company name
            if table != None:
                rows = table.table.findAll('td')
                if rows != None:
                    for row in rows[0]:
                        for td in row:
                            company_symbol = td
#                             print(f"{title} - {company_symbol}")
                            #add ticker, headline, and date to dict for our final news data set
                            #this will be used as our final news df which we will perform modeling on
                            news_tickr_dict['ticker'].append(company_symbol)
                            news_tickr_dict['title'].append(title)
                            news_tickr_dict['date'].append(date)
                            break



Justice+Department
BioRad+Gains+Additional
US+FDA+Clearances+for+Blood+Typing+Products
Levi++Korsinsky
TDM+Bhd+Updates
Alacrity+Energy+Names
Capgen+Capital+Group
MS
Strategy++Business+Development
MIG+Unmobi+Technology
FY
Twitter
Freedonia+Analyst+Weighs
Itochos+Acquisition
Alta+Forest+Products
Bain+Capital
Chase+Bank
DKK
Franklin+Electric
US+Groundwater+Distribution+Company
WeissLaw+LLP+Investigates+Archrock+Partners
Vandeweghe
Group
Thoma+Bravo+Acquires
Motus
Runzheimer
Goldman+Sachs
2nd+Circuit
Tryg
Tesla
Alcoa+Corp
WealthEngine+Board
US+Senates
New+York+Times
WordPress+Products+Pioneer+iThemes
Liquid+Web
Canadian+Society+of+Professionals+in+Disability+Management
Casino
Novo+Nordisk
Sturgeon
Rio+Tinto
Novartis
Pfizer
Siliconware+Precision+Industries+Reports+Unaudited+Consolidated+Financial+Results
Shenzhen+Crave+Communication
Cargill
SCOTUS
Sears+Holdings
Kmart
Sears
Tongyu+Communication
MVC+Capital+Reports
Amazon
Monteverde++Associates
Announces+An+Investigation+Of+Forum+Merger+Corp

KeyboardInterrupt: 

In [111]:
news_tickr_df = pd.DataFrame(news_tickr_dict)

Unnamed: 0,ticker,title,date
0,OSIS,"EQUITY ALERT: Levi & Korsinsky, LLP Reminds Sh...",2018-01-03
1,MA,M&S held back by dowdy online look,2018-01-12
2,ERI,ERI Names Anna Marie Francello Senior Director...,2018-01-31
3,FB,BRIEF-MIG Unmobi Technology sees FY 2017 net p...,2018-01-31
4,TWTR,How long airlines take to respond to your comp...,2018-01-10
...,...,...,...
3362,CAN,ENGLAND CRICKETER BEN STOKES CAN NOW BE CONSID...,2018-01-18
3363,NOW,ENGLAND CRICKETER BEN STOKES CAN NOW BE CONSID...,2018-01-18
3364,BE,ENGLAND CRICKETER BEN STOKES CAN NOW BE CONSID...,2018-01-18
3365,FOR,ENGLAND CRICKETER BEN STOKES CAN NOW BE CONSID...,2018-01-18


In [112]:
news_tickr_df.query('ticker == "FB"') 

Unnamed: 0,ticker,title,date
3,FB,BRIEF-MIG Unmobi Technology sees FY 2017 net p...,2018-01-31
16,FB,BRIEF-MIG Unmobi Technology sees FY 2017 net p...,2018-01-31
63,FB,BRIEF-MIG Unmobi Technology sees FY 2017 net p...,2018-01-31
110,FB,BRIEF-MIG Unmobi Technology sees FY 2017 net p...,2018-01-31
157,FB,BRIEF-MIG Unmobi Technology sees FY 2017 net p...,2018-01-31
...,...,...,...
3164,FB,BRIEF-Aier Eye Hospital Group sees FY 2017 net...,2018-02-01
3204,FB,UPDATE 1-EU court rejects Facebook class actio...,2018-01-26
3225,FB,Facebook CEO Mark Zuckerberg's Clueless Local ...,2018-01-31
3262,FB,BRIEF-Shenzhen Tianyuan Dic Information Techno...,2018-01-24
