In [99]:
import numpy as np
import pandas as pd
import multiprocessing
import gensim
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.metrics import accuracy_score

In [100]:
# utf 8 encoding error
data = pd.read_csv("adhoc_news_clean.txt", sep='\n', header=None, encoding='latin-1')
data.head()

Unnamed: 0,0
0,fjh ag initi assess annual result short discon...
1,grenkeleas growth acceler badenbaden januari g...
2,us toptier electron manufactur servic compani ...
3,tele stock repurchas berlin januari th board t...
4,januari hansabrunnen ag affili compani holsten...


In [102]:
new_cols = ['news']
data.columns = new_cols

In [103]:
# list of news
doc_compilation = []
for doc in data["news"]:
    doc_compilation.append(doc)

In [104]:
doc_compilation[:2]
#len(doc_compilation)

['fjh ag initi assess annual result short discontinu investig carri depart public prosecut execut board consult softwar compani fjh ag list tecdax provid initi preliminari assess financi year follow first three success quarter fourth quarter characteris special factor whole annual net incom expect amount just million euro anonym charg respons parti fjh octob well various press articl publish connect led appar restraint regard placement new extens exist order discontinu investig just christma effect fourth quarter around half loss sale anticip fourth quarter can attribut fact addit insur custom fjh also face consider uncertainti fourth quarter fiscal treatment life assur fiscal treatment compani much tax privileg differ type insur therefor caution longerterm invest describ month report continu unab final compani saw stabilis two difficult project thus last safeguard longterm custom relat execut board confid cours compani will return usual earn strength appropri measur increas sale cost 

In [105]:
# Doc2Vec requires input as LabeledLineSentence/TaggedDocument format
# one news = one doc = one sentence
# basically it is a list of list of words and a tag for that document
# [words = ['word1', 'word2'], tags = [sent1]]
def process_tagged_document(corpus):
    for i, line in enumerate(corpus):
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line, deacc=True), [i])
total_corpus = list(process_tagged_document(doc_compilation))    
#chec the first five news
total_corpus[:5]

[TaggedDocument(words=['fjh', 'ag', 'initi', 'assess', 'annual', 'result', 'short', 'discontinu', 'investig', 'carri', 'depart', 'public', 'prosecut', 'execut', 'board', 'consult', 'softwar', 'compani', 'fjh', 'ag', 'list', 'tecdax', 'provid', 'initi', 'preliminari', 'assess', 'financi', 'year', 'follow', 'first', 'three', 'success', 'quarter', 'fourth', 'quarter', 'characteris', 'special', 'factor', 'whole', 'annual', 'net', 'incom', 'expect', 'amount', 'just', 'million', 'euro', 'anonym', 'charg', 'respons', 'parti', 'fjh', 'octob', 'well', 'various', 'press', 'articl', 'publish', 'connect', 'led', 'appar', 'restraint', 'regard', 'placement', 'new', 'extens', 'exist', 'order', 'discontinu', 'investig', 'just', 'christma', 'effect', 'fourth', 'quarter', 'around', 'half', 'loss', 'sale', 'anticip', 'fourth', 'quarter', 'can', 'attribut', 'fact', 'addit', 'insur', 'custom', 'fjh', 'also', 'face', 'consider', 'uncertainti', 'fourth', 'quarter', 'fiscal', 'treatment', 'life', 'assur', 'fi

In [None]:
# Training the model using gensim Doc2Vec
# First instantiate a Doc2Vec Object with specific parameters
# min_count: ignore all words with total frequency lower than this
# size is the dimensionality of the feature vectors (rule of thumb: 100 to 400)
# window is the maximum distance between the predicted word and context words used for prediction within a document
# dm defines the training algorithm. By default (dm=1), ‘distributed memory’ (PV-DM) is used. 
# Otherwise, distributed bag of words (PV-DBOW) is employed.
# sample = threshold for configuring which higher-frequency words are randomly downsampled; default is 0 (off), useful value is 1e-5.
# workers = use this many worker threads to train the model (=faster training with multicore machines).
# class gensim.models.doc2vec.Doc2Vec(documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs)
# negative = if > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20).

In [121]:
# DOC2VEC DISTRIBUTED MEMORY
# MODEL PARAMETERS
# BUILD MODEL
#model_dm = gensim.models.Doc2Vec(min_count=1, window=6, size=400, sample=1e-5, workers=10, iter=20)
#model_dm = gensim.models.Doc2Vec(min_count=1, window=6, size=300, sample=1e-5, workers=10, iter=20)
model_dm = gensim.models.Doc2Vec(min_count=1, window=6, size=100, sample=1e-5, workers=10, iter=20)

# Build a vocab which is a dictionary of all unique words extracted from training corpus
model_dm.build_vocab(total_corpus)

# Train the model through many times of iteration
%time model_dm.train(total_corpus, total_examples=model_dm.corpus_count, epochs=model_dm.iter)

CPU times: user 54.6 s, sys: 9.92 s, total: 1min 4s
Wall time: 35.2 s


11218583

In [122]:
# DOC2VEC DISTRIBUTED BAG OF WORDS
# MODEL PARAMETERS
# BUILD MODEL

# Try with another training algorithm: distributed bag of word (dbow)
#model_dbow = gensim.models.Doc2Vec(dm=0,min_count=1, window=6, size=400, sample=1e-5, workers=10, iter=20)
#model_dbow = gensim.models.Doc2Vec(dm=0,min_count=1, window=6, size=300, sample=1e-5, workers=10, iter=20)
model_dbow = gensim.models.Doc2Vec(dm=0,min_count=1, window=6, size=100, sample=1e-5, workers=10, iter=20)

model_dbow.build_vocab(total_corpus)
%time model_dbow.train(total_corpus, total_examples=model_dbow.corpus_count, epochs=model_dbow.iter)

CPU times: user 35.7 s, sys: 1.84 s, total: 37.5 s
Wall time: 22.9 s


11215935

In [110]:
#inspect a document vector
docvec = model_dm.docvecs[100] 
docvec # is an array 
# all the vectors can be obtained through doctag_syn0

array([ -6.81249201e-02,   2.75937393e-02,  -1.04013525e-01,
         7.59607330e-02,  -2.73406450e-02,  -4.75983396e-02,
        -2.11353358e-02,  -1.61165401e-04,   5.57710649e-03,
        -3.21738236e-02,   9.28568617e-02,   1.97754288e-03,
         7.70305246e-02,   1.46699831e-01,  -5.38911894e-02,
         1.00226544e-01,   6.32965565e-02,  -2.08594576e-02,
        -2.41112970e-02,   5.83501626e-03,   4.46207002e-02,
        -8.86843354e-03,   2.89630461e-02,   6.82228431e-02,
         7.26604164e-02,   4.86460552e-02,  -4.46660891e-02,
         3.99297588e-02,   2.67922096e-02,  -1.64842624e-02,
        -7.86316991e-02,   1.87371597e-02,  -2.03581993e-02,
        -2.50798333e-02,  -1.21068947e-01,  -4.86787334e-02,
         4.77416217e-02,   7.74352327e-02,  -2.37030089e-02,
        -6.80414885e-02,   1.31282344e-01,  -3.27400453e-02,
        -5.92813157e-02,   9.62169899e-04,   1.37665328e-02,
         6.56080693e-02,  -2.53424142e-02,   2.44390834e-02,
         5.57942055e-02,

In [117]:
#get the doc vectors, turn to a dataframe, save as csv as input for ML in R
docvecs_dm = model_dm.docvecs.doctag_syn0
docvecs_dm = pd.DataFrame(docvecs_dm)
#docvecs_dm.to_csv('docvecs_dm_400.csv', index=False, header=True)
docvecs_dm.to_csv('docvecs_dm_300.csv', index=False, header=True)
#docvecs_dm.to_csv('docvecs_dm_100.csv', index=False, header=True)

In [118]:
# get the doc vectors for model dbow
docvecs_dbow = model_dbow.docvecs.doctag_syn0
docvecs_dbow = pd.DataFrame(docvecs_dbow)
#docvecs_dbow.to_csv('docvecs_dbow_400.csv', index=False, header=True)
docvecs_dbow.to_csv('docvecs_dbow_300.csv', index=False, header=True)
#docvecs_dbow.to_csv('docvecs_dbow_100.csv', index=False, header=True)

In [119]:
# it seems model_dm works better at correlate the words after training than dbow
model_dm.most_similar("financi")

#[('statement', 0.8362114429473877),
# ('provision', 0.8314566016197205),
# ('nonaudit', 0.8252514004707336),
# ('deprec', 0.8143759965896606),
# ('ifr', 0.8109675645828247),
# ('instead', 0.8071068525314331),
# ('restat', 0.8049291968345642),
# ('reclassif', 0.7967160940170288),
# ('reaffirm', 0.7944886684417725),
# ('writeback', 0.789879322052002)]

#model["finance"].shape
#model_dm.most_similar("posit")

[('statement', 0.8601853251457214),
 ('goingconcern', 0.8091734051704407),
 ('nonaudit', 0.8051964044570923),
 ('reclassif', 0.7942148447036743),
 ('instead', 0.7937629818916321),
 ('provision', 0.7876889705657959),
 ('explan', 0.7843255996704102),
 ('forego', 0.7796642780303955),
 ('semiannu', 0.7795006632804871),
 ('report', 0.7789796590805054)]

In [120]:
model_dbow.most_similar("financi")
#[('securitis', 0.19116026163101196),
# ('coemiss', 0.18409697711467743),
# ('clinic', 0.1803305745124817),
# ('zermatt', 0.17824828624725342),
# ('shelv', 0.1733556091785431),
# ('immunoassay', 0.17233437299728394),
# ('strike', 0.16815799474716187),
# ('brfollow', 0.16751345992088318),
# ('ttelborn', 0.16725698113441467),
# ('contentvi', 0.16470666229724884)]

[('clinic', 0.21650616824626923),
 ('microcomput', 0.2135980725288391),
 ('hesh', 0.2109202742576599),
 ('spotmarket', 0.20826569199562073),
 ('glacier', 0.20794934034347534),
 ('altmann', 0.2068473994731903),
 ('kazakhstan', 0.20075266063213348),
 ('industriebank', 0.20024338364601135),
 ('sshv', 0.19853392243385315),
 ('netprofit', 0.19554542005062103)]

In [43]:
#load_data = pd.read_csv("docvecs_dm_300.csv")
load_data = pd.read_csv("docvecs_dbow_300.csv")
load_data.head()
load_data.shape

(13135, 400)

In [44]:
# Split data to training and testing 
X_train = load_data[0:9000]
X_test = load_data[9001:]

y = pd.read_csv("return_direction.csv")
y_train = y["x"][:9000]

y_test = y["x"][9001:]


0    Negative
1    Negative
2    Positive
3    Positive
4    Negative
Name: x, dtype: object

In [45]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier( n_estimators = 200 )

print ("Fitting a random forest to labeled training data...")
forest = forest.fit(X_train, y_train)

# Test & extract results 
result = forest.predict(X_test)

accuracy_score(y_test, result)

# Write the test results 
#output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
#output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )

Fitting a random forest to labeled training data...


array(['Positive', 'Positive', 'Positive', 'Positive', 'Positive'], dtype=object)

0.53386550556361878

In [47]:
# Fit a SVM to the training data
from sklearn import svm
SVM = svm.SVC()

print ("Fitting a SVM to labeled training data...")
svm = SVM.fit(X_train, y_train)

# Test & extract results 
result = svm.predict(X_test)

accuracy_score(y_test, result)

Fitting a SVM to labeled training data...


0.54063860667634256