In [56]:
import loadArticles
import loadStockInfo
import textProcessing

import en_core_web_lg

import urllib.parse
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import string

from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer
from spacy.tokens import Doc
from spacy.pipeline import EntityRuler

In [3]:
# Load the stock metadata table and give every row the entity label 'ORG'.
# (Loading the historical price frame via loadStockDf() and narrowing the
# metadata to ['Symbol', 'Security Name'] were previously tried here and are
# intentionally not done.)
stocks_dir = '../../data/stock-market-dataset/stocks/'
symbols_meta_csv = '../../data/stock-market-dataset/symbols_valid_meta.csv'

stockInfo = loadStockInfo.loadStockInfo(stocks_dir, symbols_meta_csv, DEBUG=True)
stock_meta_df = stockInfo.loadStockMetaDf()
# Constant label column; it becomes the EntityRuler label further down.
stock_meta_df['Tag'] = 'ORG'

In [137]:
# Read the news_2018_02 headlines; the first CSV column is used as the index.
news_csv_path = '../../data/news/news_csv/news_2018_02.csv'
news_df = pd.read_csv(news_csv_path, index_col=0)

In [138]:
# Display the loaded headlines frame (columns: title, published).
news_df

Unnamed: 0,title,published
0,"Fed Quarles pushes for rate hikes, review of c...",2018-02-23
1,Italy's League under pressure over racist shoo...,2018-02-05
2,BRIEF-HiQ International ‍Board Proposes Share ...,2018-02-21
3,Drake gives away almost $1 million in 'God's P...,2018-02-18
4,Stock market pre-markets data: Dow futures aft...,2018-02-13
...,...,...
63061,Salmon price rose to NOK 54.14 per kilo last w...,2018-02-21
63062,"Dollar Tree, Inc. to Host Fourth Quarter Earni...",2018-02-22
63063,Congress takes on the immigration issue amid e...,2018-02-13
63064,Dark times for workers in the U.S. solar industry,2018-02-09


In [136]:
# Preview the (Tag, Symbol) pairs with the ticker "A" filtered out.
# NOTE(review): "A" is presumably dropped because it would match the English
# word "A" in headlines — confirm.
stock_meta_df.query('Symbol != "A"')[['Tag', 'Symbol']]

Unnamed: 0,Tag,Symbol
1,ORG,AA
2,ORG,AAAU
3,ORG,AACG
4,ORG,AADR
5,ORG,AAL
...,...,...
8044,ORG,ZUO
8045,ORG,ZVO
8046,ORG,ZYME
8047,ORG,ZYNE


In [83]:
# Build the spaCy pipeline: the large English model plus an EntityRuler whose
# patterns are the known ticker symbols and company names, each labelled with
# the value of the 'Tag' column.
nlp = en_core_web_lg.load()
ruler = EntityRuler(nlp)

# One pattern per ticker symbol; the symbol "A" is excluded
# (presumably because it collides with the English word "A" — confirm).
symbol_patterns = (
    stock_meta_df.query('Symbol != "A"')[['Tag', 'Symbol']]
    .rename(columns={'Tag': 'label', 'Symbol': 'pattern'})
)
# One pattern per company name.
name_patterns = (
    stock_meta_df[['Tag', 'Name']]
    .rename(columns={'Tag': 'label', 'Name': 'pattern'})
)

# EntityRuler takes a list of {'label': ..., 'pattern': ...} records.
patterns = pd.concat([symbol_patterns, name_patterns]).to_dict('records')
ruler.add_patterns(patterns)

# Register the ruler using add_pipe's default placement.
nlp.add_pipe(ruler)

tp = textProcessing.textProcessing(nlp)

In [105]:
# Inspect one sample headline (positional row 110) to try the sanitisation on.
news_df.iloc[110]['title']

'Atlantic Coast Financial Corporation’s Fourth Quarter 2017 Net Loss of $0.04 Per Diluted Share Included $0.14 in Charges Related to Tax Act and Pending Merger'

In [109]:
# Turn the sample headline into a query term step by step:
# drop non-ASCII characters, delete punctuation, then join words with '+'.
sample_headline = str(news_df.iloc[110]['title'])
ascii_only = sample_headline.encode("ascii", "ignore").decode("ascii")
without_punctuation = ascii_only.translate(str.maketrans(' ', ' ', string.punctuation))
without_punctuation.replace(' ', '+').replace('’','')

'Atlantic+Coast+Financial+Corporations+Fourth+Quarter+2017+Net+Loss+of+004+Per+Diluted+Share+Included+014+in+Charges+Related+to+Tax+Act+and+Pending+Merger'

In [139]:
# Prepare the inputs and the accumulator for entity -> ticker resolution.
# NOTE: the intended one-day shift of article dates (so that stock changes are
# measured after the article appeared) is not applied in this cell.

# Column-oriented accumulator for the final news frame: ticker, headline, date.
news_tickr_dict = {'ticker': [], 'title': [], 'date': []}

# (title, published) pairs, one plain tuple per headline row.
news_date_array = list(
    news_df[['title', 'published']].itertuples(index=False, name=None)
)

In [None]:
# Resolve every entity found in every headline to a ticker symbol and append
# (ticker, title, date) to news_tickr_dict. Resolution order per entity:
#   1. the entity is itself a known ticker symbol,
#   2. the entity equals a known company name,
#   3. fall back to the MarketWatch symbol lookup page.

# Build the lookup structures once instead of rebuilding a list per entity.
known_symbols = set(stock_meta_df['Symbol'])
name_to_symbol = {}
for meta_name, meta_symbol in zip(stock_meta_df['Name'], stock_meta_df['Symbol']):
    # setdefault keeps the first symbol listed for a name, matching .iloc[0].
    name_to_symbol.setdefault(meta_name, meta_symbol)

MARKETWATCH_LOOKUP_URL = 'https://www.marketwatch.com/tools/quotes/lookup.asp'
LOOKUP_TIMEOUT_SECONDS = 10
punctuation_stripper = str.maketrans('', '', string.punctuation)
# Successful lookups keyed by sanitised entity text, so a recurring entity
# costs a single HTTP request.
lookup_cache = {}


def lookup_symbol_on_marketwatch(entity):
    """Return the first symbol MarketWatch lists for `entity`, or None.

    The entity text is reduced to ASCII without punctuation before it is sent.
    None is returned when nothing is left to search for, when the request
    fails, or when the results page contains no result cell.
    """
    query_text = (
        str(entity)
        .encode('ascii', 'ignore')
        .decode('ascii')
        .translate(punctuation_stripper)
    )
    if not query_text.strip():
        # e.g. an entity made only of punctuation or non-ASCII characters
        return None
    if query_text in lookup_cache:
        return lookup_cache[query_text]

    # urlencode escapes the term properly (spaces become '+').
    query = urllib.parse.urlencode([
        ('siteID', 'mktw'),
        ('Lookup', query_text),
        ('Country', 'all'),
        ('Type', 'Stock'),
    ])
    req = Request(url=f'{MARKETWATCH_LOOKUP_URL}?{query}',
                  headers={'user-agent': 'my-app/0.0.1'})
    try:
        # The context manager closes the connection once the page is parsed.
        with urlopen(req, timeout=LOOKUP_TIMEOUT_SECONDS) as response:
            html = BeautifulSoup(response, 'html.parser')
    except OSError as err:
        # URLError, HTTPError and socket timeouts are all OSError subclasses.
        # Skip this entity instead of aborting the run, and do not cache the
        # failure so a later occurrence can retry.
        print(f'lookup failed for {query_text!r}: {err}')
        return None

    symbol = None
    results = html.find('div', class_='results')
    if results is not None and results.table is not None:
        first_cell = results.table.find('td')
        if first_cell is not None:
            # Plain str (not a NavigableString tied to the parse tree);
            # an empty cell counts as "no symbol".
            symbol = first_cell.get_text(strip=True) or None

    lookup_cache[query_text] = symbol
    return symbol


count = 0
for count, (title, date) in enumerate(news_date_array, start=1):
    print(count, end='\r')
    # assumes getEntitiesSpacy yields hashable, string-like entities — TODO confirm
    for entity in tp.getEntitiesSpacy(nlp, title):
        if entity in known_symbols:
            ticker = entity
        elif entity in name_to_symbol:
            ticker = name_to_symbol[entity]
        else:
            ticker = lookup_symbol_on_marketwatch(entity)

        if ticker is None:
            continue

        # One output row per resolved entity; this becomes the final news frame.
        news_tickr_dict['ticker'].append(ticker)
        news_tickr_dict['title'].append(title)
        news_tickr_dict['date'].append(date)



35205

In [131]:
# Materialise the accumulated ticker/title/date lists as a DataFrame.
news_tickr_df = pd.DataFrame(data=news_tickr_dict)

In [133]:
# Persist the ticker-tagged headlines as CSV.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the project's data directory.
ticker_news_csv = "/Users/jjackson/Brainstation/SpacyTickerNews2.csv"
news_tickr_df.to_csv(ticker_news_csv)

In [132]:
# Display the ticker-tagged headlines frame (columns: ticker, title, date).
news_tickr_df

Unnamed: 0,ticker,title,date
0,PSA,BRIEF-French carmaker PSA chooses Atlanta as n...,2018-01-24
1,OAS,U.S. House passes measure to fund government a...,2018-01-24
2,HKEB,BRIEF-U.S. CPSC says Western Gas Recalls To In...,2018-01-03
3,AMZN,German minister urges fast passage of EU law o...,2018-01-29
4,OBELF,BRIEF-Obsidian Energy reports FY 2017 producti...,2018-01-17
...,...,...,...
22388,MELI,Cramer's lightning round: Mercadolibre is even...,2018-01-10
22389,AAPL,Cramer's lightning round: Mercadolibre is even...,2018-01-10
22390,TFSL,"An aspirin a day, as Trump tells European corp...",2018-01-27
22391,GRMN,Garmin Ltd. schedules its fourth quarter 2017 ...,2018-01-18


In [None]:
#max 10k items per request
#token_api = os.environ["STOCKNEWS_API_TOKEN"]  # keep the real token out of the notebook
#f"https://stocknewsapi.com/api/v1?tickers={ticker}&items=10000&token={token_api}"
#news data by ticker
# for ticker in stock_meta_df['Symbol']:
