In [1]:
#Import dependencies
!pip install spacy
!pip install gensim
!python -m spacy download en_core_web_md
import numpy as np
import pandas as pd
import spacy
import re



[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [5]:
reuters_data = pd.read_csv('news_reuters.csv').dropna() # load Reuters stock news data as Pandas dataframe

# we are only interested in the stocks that have the most news data so that our classifier can have a good corpus of
# training data to learn from
top15_tickers = list(reuters_data["Ticker"].value_counts()[:15].index)
filtered_data = reuters_data[reuters_data["Ticker"].isin(top15_tickers)].reset_index().copy()
filtered_data

Unnamed: 0,index,Ticker,Name,Date,Headline,Tagline,Rating
0,1074,AAPL,1-800 FLOWERSCOM Inc,20140414,Apple antitrust compliance off to a promising ...,"NEW YORK Apple Inc has made a ""promising start...",topStory
1,1075,AAPL,1-800 FLOWERSCOM Inc,20140414,Apple antitrust compliance off to a promising ...,"NEW YORK April 14 Apple Inc has made a ""promi...",normal
2,1076,AAPL,1-800 FLOWERSCOM Inc,20140414,COLUMN-How to avoid the trouble coming to the ...,(The opinions expressed here are those of the ...,normal
3,1077,AAPL,1-800 FLOWERSCOM Inc,20140414,How to avoid the trouble coming to the tech se...,CHICAGO A resounding shot across the bow has b...,normal
4,1078,AAPL,1-800 FLOWERSCOM Inc,20140415,Apple cannot escape U.S. states' e-book antitr...,NEW YORK Apple Inc on Tuesday lost an attempt ...,normal
...,...,...,...,...,...,...,...
50787,184859,TAPR,Barclays Inverse US Treasury Composite ETN,20170209,BRIEF-Ultra Petroleum says Barclays agreed to ...,* Ultra Petroleum- on Feb 8 in connection wit...,normal
50788,184860,TAPR,Barclays Inverse US Treasury Composite ETN,20170209,MOVES-Barclays Nasdaq RenCap AXA BC Partners,Feb 9 The following financial services industr...,topStory
50789,184861,TAPR,Barclays Inverse US Treasury Composite ETN,20170217,Barclays Citi gave South Africa watchdog info...,JOHANNESBURG Feb 17 Barclays Plc and Citigrou...,normal
50790,184862,TAPR,Barclays Inverse US Treasury Composite ETN,20170217,Barclays Citi helped South Africa with forex ...,JOHANNESBURG Barclays Plc and Citigroup appr...,topStory


In [6]:
# get unique words from all taglines
corpus = list(reuters_data["Tagline"])
split_corpus = [re.split("\W+", c) for c in corpus]
words = set()
occurences = {}
for c in split_corpus:
    for k in c:
        w = k.lower()
        words.add(w)
        if occurences.get(w, None):
            occurences[w] += 1
        else:
            occurences[w] = 1
words = pd.Series(list(words))

# compute inverse document frequency for each word
idfs = {}
for word in words:
    idfs[word] = np.log(len(corpus) / occurences[word])
    
# train Word2Vec model on our corpus
import gensim.models

class iter_corpus:
    """An iterator that yields sentences from the corpus. """
    def __init__(self, corpus):
        self.corpus = []
        for tag in corpus:
            sentences = re.split("\.", tag)
            for s in sentences:
                tokens = re.split("\W+", s)
                self.corpus.append(tokens)
    def __iter__(self):
        for sentence in self.corpus:
            yield sentence

sentences = iter_corpus(corpus)
model = gensim.models.Word2Vec(sentences=sentences, size=64, min_count=1)

In [7]:
k_embeddings = []
for idx, row in filtered_data.iterrows():
    tag = row["Tagline"]
    k = np.zeros(64)
    norm_factor = 1
    words = re.split("\W+", tag)
    freq = {} # dictionary for frequency of each word in the tagline
    for word in words:
        if freq.get(word, None):
            freq[word] += 1
        else:
            freq[word] = 1
    for word in words:
        if (word in model.wv.vocab) and (idfs.get(word, None) != None):
            tf = np.log(1 + freq[word]) # term frequency
            idf = idfs[word] # inverse document frequency
            gamma = tf * idf # gamma = TF-IDF score
            k += gamma * model.wv[word]
            norm_factor += gamma
    k /= norm_factor
    k_embeddings.append(k)
for r in range(64):
    filtered_data["K{}".format(r)] = pd.Series([k[r] for k in k_embeddings])

In [8]:
nlp = spacy.load("en_core_web_md")

In [9]:
v_embeddings = []
for idx, row in filtered_data.iterrows():
    tag = row["Tagline"]
    v_embeddings.append(nlp(tag).vector)
for r in range(300):
    filtered_data["V{}".format(r)] = pd.Series([v[r] for v in v_embeddings])

In [10]:
filtered_data

Unnamed: 0,index,Ticker,Name,Date,Headline,Tagline,Rating,K0,K1,K2,...,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299
0,1074,AAPL,1-800 FLOWERSCOM Inc,20140414,Apple antitrust compliance off to a promising ...,"NEW YORK Apple Inc has made a ""promising start...",topStory,0.728133,0.074376,-0.844244,...,-0.184006,0.032116,0.032128,-0.045440,0.027079,-0.100620,0.032597,-0.092093,0.048542,0.109286
1,1075,AAPL,1-800 FLOWERSCOM Inc,20140414,Apple antitrust compliance off to a promising ...,"NEW YORK April 14 Apple Inc has made a ""promi...",normal,0.757790,0.111567,-0.802569,...,-0.168789,0.039603,0.021292,-0.036883,0.029685,-0.110353,0.025347,-0.084554,0.045670,0.105747
2,1076,AAPL,1-800 FLOWERSCOM Inc,20140414,COLUMN-How to avoid the trouble coming to the ...,(The opinions expressed here are those of the ...,normal,-0.624152,-0.346050,-1.487509,...,-0.141506,-0.027039,-0.080825,-0.133556,0.018669,-0.056828,-0.052640,-0.169819,-0.033054,0.053817
3,1077,AAPL,1-800 FLOWERSCOM Inc,20140414,How to avoid the trouble coming to the tech se...,CHICAGO A resounding shot across the bow has b...,normal,0.387120,-0.099557,-0.590867,...,-0.233473,0.095700,0.113241,-0.027537,-0.119434,-0.074786,-0.072007,-0.049933,0.014863,0.063664
4,1078,AAPL,1-800 FLOWERSCOM Inc,20140415,Apple cannot escape U.S. states' e-book antitr...,NEW YORK Apple Inc on Tuesday lost an attempt ...,normal,0.824634,-1.637257,-0.352775,...,-0.232241,0.027836,-0.025965,0.036613,-0.087056,-0.103006,0.076729,-0.153311,0.038894,0.138866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50787,184859,TAPR,Barclays Inverse US Treasury Composite ETN,20170209,BRIEF-Ultra Petroleum says Barclays agreed to ...,* Ultra Petroleum- on Feb 8 in connection wit...,normal,1.139437,0.682006,0.029171,...,-0.244216,0.053853,-0.008725,-0.048169,-0.032766,-0.062842,-0.059161,-0.104091,0.010547,0.129130
50788,184860,TAPR,Barclays Inverse US Treasury Composite ETN,20170209,MOVES-Barclays Nasdaq RenCap AXA BC Partners,Feb 9 The following financial services industr...,topStory,1.017802,-0.165982,-0.467275,...,-0.234947,0.049924,0.064670,0.022008,0.025572,-0.144732,-0.046366,-0.030195,-0.027131,0.093039
50789,184861,TAPR,Barclays Inverse US Treasury Composite ETN,20170217,Barclays Citi gave South Africa watchdog info...,JOHANNESBURG Feb 17 Barclays Plc and Citigrou...,normal,1.044449,-0.042930,0.201579,...,-0.234416,-0.001098,-0.035648,-0.053637,0.030076,-0.037331,0.048593,-0.019262,-0.030251,0.178724
50790,184862,TAPR,Barclays Inverse US Treasury Composite ETN,20170217,Barclays Citi helped South Africa with forex ...,JOHANNESBURG Barclays Plc and Citigroup appr...,topStory,1.288937,-0.372697,0.197727,...,-0.247672,0.049712,0.028656,-0.078167,0.047243,0.061589,0.016127,-0.073754,-0.011532,0.154577


In [11]:
filtered_data.to_csv("embeddings.csv", index=False)