In [1]:
# import required packages
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# load the training sentences and their sentiment labels from the CSV file
df = pd.read_csv('../../data/train_data.csv')

# show the (rows, columns) shape, then preview the first ten rows
display(df.shape)
df.head(n=10)

(3192, 2)

Unnamed: 0,clean_sentence,Sentiment
0,upm kymmene one world leading printing paper p...,positive
1,nokia pct eur kicking morning negative territory,positive
2,vasantha appointed managing director incap con...,neutral
3,consolidated net sale increased reach eur oper...,positive
4,cabot export production mainly goodyear bridge...,neutral
5,fda approves shire vyvanse binge eating disorder,positive
6,finnish steel maker rautaruukki oyj ruukki sai...,positive
7,adp news feb finnish wood product technology s...,negative
8,hull vessel built one block time ruukki delive...,neutral
9,finnish honkarakenne specialises building log ...,neutral


In [3]:
# count the missing sentences before cleaning
display(df['clean_sentence'].isna().sum())

# substitute an empty string for every missing sentence
df['clean_sentence'] = df['clean_sentence'].fillna('')

# confirm that no missing sentences remain
display(df['clean_sentence'].isna().sum())

1

0

In [4]:
# turn every sentence into a list of tokens (the input format used for word2vec)
documents = df['clean_sentence'].apply(simple_preprocess)

documents.head(5)

0    [upm, kymmene, one, world, leading, printing, ...
1    [nokia, pct, eur, kicking, morning, negative, ...
2    [vasantha, appointed, managing, director, inca...
3    [consolidated, net, sale, increased, reach, eu...
4    [cabot, export, production, mainly, goodyear, ...
Name: clean_sentence, dtype: object

In [5]:
# Training is disabled by default: a model has already been trained and is
# loaded from disk for further usage. Set TRAIN_NEW_MODEL to True to train a
# new model; doing so overwrites the saved model file.
# (This note is a comment rather than a bare string literal so that the cell
# does not echo it back as output.)
TRAIN_NEW_MODEL = False

if TRAIN_NEW_MODEL:
    # spawn a Word2Vec model
    model = Word2Vec(window=5, min_count=2)

    # build vocabulary from entire corpus
    model.build_vocab(documents, progress_per=1000)

    # train the word2vec
    model.train(documents, total_examples=model.corpus_count, epochs=5)

    # save the model (overwrites the previously trained model)
    model.save("../../models/word2vec.model")

In [6]:
# load the previously trained Word2Vec model from disk
# (same path that the training step saves to)
model = Word2Vec.load('../../models/word2vec.model')

In [7]:
# ten nearest neighbours of 'profit' in the embedding space
# NOTE(review): in the recorded run all ten scores exceed 0.997, which may mean
# the vectors are nearly collinear (under-trained model) — worth checking.
display(model.wv.most_similar('profit', topn=10))

[('eur', 0.998862624168396),
 ('loss', 0.9987896084785461),
 ('million', 0.998157799243927),
 ('net', 0.998129665851593),
 ('operating', 0.9979076385498047),
 ('increased', 0.9978724122047424),
 ('quarter', 0.9977854490280151),
 ('year', 0.9974496960639954),
 ('sale', 0.9974429607391357),
 ('month', 0.9974063634872437)]

In [8]:
# cosine similarity between the vectors of 'profit' and 'gain'
display(model.wv.similarity('profit', 'gain'))

0.98727024

In [9]:
# collect the model's vectors into a dataframe:
# one row per vocabulary word (used as the index), one column per dimension
vocabulary = list(model.wv.key_to_index)
word_vectors = pd.DataFrame(
    [model.wv.get_vector(str(token)) for token in vocabulary],
    index=vocabulary,
)

display(word_vectors.shape)
word_vectors.sample(n=10)

(3186, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
mantsala,-0.015794,0.017637,0.007671,0.011494,-0.001648,-0.026776,0.015311,0.025298,-0.005326,-0.001846,...,0.02312,-0.006935,0.003342,0.010825,0.029248,0.005043,0.005346,-0.016286,0.00309,-0.003218
screen,-0.016986,0.018031,0.000932,0.011995,0.004516,-0.021051,0.002164,0.048008,-0.021322,-0.006077,...,0.031535,0.001377,0.009949,0.005034,0.033273,0.017974,0.020873,-0.015996,0.016979,0.00747
measuring,-0.004863,0.009617,0.004656,0.011068,0.008382,-0.021237,0.004668,0.032921,-0.004472,-0.012808,...,0.017222,0.00548,0.007321,0.008493,0.019082,-0.001638,0.016307,-0.009158,0.016771,-0.003126
expected,-0.097205,0.148615,0.062857,0.025083,0.046159,-0.245581,0.082161,0.397245,-0.110705,-0.073909,...,0.235024,0.012977,0.040881,0.06636,0.302885,0.086808,0.136858,-0.177197,0.103731,0.012756
specifically,-0.013974,0.016097,0.015584,-0.003975,0.000648,-0.017581,0.007325,0.023398,0.000613,0.003461,...,0.009365,-0.005331,-0.005398,0.007548,0.019318,0.016395,0.01899,-0.014134,0.012016,0.006978
tlt,-0.003588,0.020789,0.00361,0.014107,0.00235,-0.023696,-0.000541,0.047737,-0.02137,-0.008221,...,0.036743,-0.000141,0.008362,0.009087,0.043661,0.011346,0.013402,-0.014335,0.01804,0.008549
outlook,-0.025036,0.063326,0.034943,0.008164,0.01016,-0.098788,0.036943,0.136104,-0.043229,-0.030258,...,0.075426,0.006746,0.020053,0.027484,0.116551,0.036942,0.054022,-0.054625,0.04073,-0.001606
moving,-0.016511,0.038745,0.015445,0.003045,0.006737,-0.043718,0.018577,0.073715,-0.028718,-0.012567,...,0.034471,0.004796,0.002423,0.016812,0.062271,0.023025,0.018625,-0.023659,0.021878,-0.005266
increase,-0.091697,0.15896,0.062048,0.038095,0.035873,-0.237811,0.071574,0.363563,-0.088199,-0.068967,...,0.215729,0.008719,0.036048,0.050466,0.277032,0.080083,0.116564,-0.162681,0.109095,0.018082
english,-3.7e-05,0.001624,0.006403,0.008952,0.007907,-0.02026,0.00503,0.01728,0.000336,0.002444,...,0.023725,0.000672,0.009424,-0.001804,0.016027,0.000485,-0.000322,-0.016322,0.002016,-0.003507


In [10]:
# generate document matrix from word vectors
def mean_document_vector(tokens, keyed_vectors):
    """Return one fixed-length vector for a tokenised document.

    The vectors of all tokens found in the vocabulary are summed and the sum
    is divided by the total number of tokens in the document.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single document.
    keyed_vectors : gensim KeyedVectors (e.g. ``model.wv``)
        Supplies ``vector_size``, ``key_to_index`` and per-word lookup.

    Returns
    -------
    numpy.ndarray
        Array of length ``keyed_vectors.vector_size``; all zeros when the
        document is empty or none of its tokens are in the vocabulary.
    """
    # take the dimensionality from the model instead of hard-coding 100
    pooled = np.zeros(keyed_vectors.vector_size)
    if len(tokens) == 0:
        return pooled

    # key_to_index is a dict, so membership is O(1); scanning the
    # index_to_key list for every token is O(vocabulary size)
    vocabulary = keyed_vectors.key_to_index
    for token in tokens:
        if token in vocabulary:
            pooled += keyed_vectors[token]

    # NOTE(review): the divisor counts out-of-vocabulary tokens as well, so
    # they act as zero vectors and shrink the mean. Kept as-is to preserve
    # the existing feature values — confirm this is intended.
    return pooled / len(tokens)


document_matrix = [mean_document_vector(document, model.wv) for document in documents]

len(document_matrix), len(document_matrix[0])

(3192, 100)

In [11]:
# build the feature frame (one row per document, one column per dimension)
# and attach the class labels as an extra 'Sentiment' column
df_w2v = pd.DataFrame(document_matrix).assign(Sentiment=df['Sentiment'])

display(df_w2v.shape)
df_w2v.head(5)

(3192, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
0,-0.10126,0.172969,0.074064,0.037569,0.042028,-0.270767,0.080114,0.420502,-0.107179,-0.086516,...,0.009125,0.040415,0.059732,0.317194,0.08835,0.142855,-0.176983,0.111694,0.013735,positive
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157,positive
2,-0.079655,0.140546,0.05799,0.033934,0.03181,-0.217193,0.069451,0.338383,-0.087553,-0.06952,...,0.007707,0.029304,0.051794,0.255313,0.07238,0.114715,-0.143702,0.092664,0.011519,neutral
3,-0.100543,0.177276,0.070339,0.049749,0.046601,-0.305025,0.080718,0.473022,-0.124815,-0.099016,...,0.031143,0.062889,0.067259,0.353135,0.117301,0.137864,-0.187069,0.116288,0.022561,positive
4,-0.067153,0.115755,0.05176,0.02575,0.026858,-0.184075,0.056593,0.280819,-0.073858,-0.058905,...,0.006068,0.023441,0.042702,0.210729,0.060377,0.093277,-0.121066,0.076454,0.008403,neutral
