In [1]:
# import required packages
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# load the training set (one cleaned sentence and one sentiment label per row)
df = pd.read_csv('../../data/train_data.csv')

# report (rows, columns), then preview the first ten rows
display(df.shape)
df.head(n=10)

(3192, 2)

Unnamed: 0,clean_sentence,Sentiment
0,upm kymmene one world leading printing paper p...,positive
1,nokia pct eur kicking morning negative territory,positive
2,vasantha appointed managing director incap con...,neutral
3,consolidated net sale increased reach eur oper...,positive
4,cabot export production mainly goodyear bridge...,neutral
5,fda approves shire vyvanse binge eating disorder,positive
6,finnish steel maker rautaruukki oyj ruukki sai...,positive
7,adp news feb finnish wood product technology s...,negative
8,hull vessel built one block time ruukki delive...,neutral
9,finnish honkarakenne specialises building log ...,neutral


In [3]:
# how many sentences are missing before cleaning?
display(df['clean_sentence'].isna().sum())

# substitute an empty string for every missing sentence so that
# downstream string processing never receives NaN
df['clean_sentence'] = df['clean_sentence'].fillna('')

# confirm that no missing sentences remain
display(df['clean_sentence'].isna().sum())

1

0

In [4]:
# word2vec expects each document as a list of tokens;
# run gensim's simple_preprocess over every sentence to produce them
documents = df['clean_sentence'].apply(simple_preprocess)

# preview the first five tokenized documents
documents.head(5)

0    [upm, kymmene, one, world, leading, printing, ...
1    [nokia, pct, eur, kicking, morning, negative, ...
2    [vasantha, appointed, managing, director, inca...
3    [consolidated, net, sale, increased, reach, eu...
4    [cabot, export, production, mainly, goodyear, ...
Name: clean_sentence, dtype: object

In [5]:
"""
Not training a new word2vec model.
A model has already been trained and is loaded for further usage.
Uncomment the code block to train and save a new model.
"""
# spawn a Word2Vec model
# model = Word2Vec(window=5, min_count=2)

# build vocabulary from entire corpus
# model.build_vocab(documents, progress_per=1000)

# train the word2vec
# model.train(documents, total_examples=model.corpus_count, epochs=5)

# save the model
# commented to avoid overwriting the trained model
# model.save("../../models/word2vec.model")

'\nNot training a new word2vec model.\nA model has already been trained and is loaded for further usage.\nUncomment the code block to train and save a new model.\n'

In [6]:
# load a trained model
# The training code in the previous cell is commented out, so this file must
# already exist on disk for the rest of the notebook to run.
# NOTE(review): gensim model files are presumably deserialized via pickle —
# only load model files from a trusted source; confirm against the gensim docs.
model = Word2Vec.load('../../models/word2vec.model')

In [7]:
# ten nearest neighbours of the word 'profit' in the embedding space,
# returned as (word, cosine similarity) pairs
# NOTE(review): every neighbour in the saved output scores >= 0.997, which may
# indicate that the embedding is under-trained — worth confirming.
display(model.wv.most_similar('profit', topn=10))

[('eur', 0.998862624168396),
 ('loss', 0.9987896084785461),
 ('million', 0.998157799243927),
 ('net', 0.998129665851593),
 ('operating', 0.9979076385498047),
 ('increased', 0.9978724122047424),
 ('quarter', 0.9977854490280151),
 ('year', 0.9974496960639954),
 ('sale', 0.9974429607391357),
 ('month', 0.9974063634872437)]

In [8]:
# cosine similarity between the vectors of two individual words
display(model.wv.similarity('profit', 'gain'))

0.98727024

In [9]:
# extract word vectors as dataframe from the model
# model.wv.vectors is the full embedding matrix (one row per vocabulary entry,
# in model.wv.index_to_key order), so no per-word Python loop is needed.
# copy=True keeps the dataframe independent of the model's own weights.
word_vectors = pd.DataFrame(model.wv.vectors, index=model.wv.index_to_key, copy=True)

display(word_vectors.shape)
# fixed seed so the preview shows the same rows on every Restart & Run All
word_vectors.sample(10, random_state=42)

(3186, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
gsk,-0.010071,0.022815,0.009439,0.007796,-0.004843,-0.022126,-0.000993,0.040134,-0.009193,-0.006243,...,0.021184,-0.003762,-0.00706,0.007509,0.034266,0.000926,0.016545,-0.014248,0.004616,-0.008358
early,-0.027013,0.069369,0.026519,0.016134,0.005391,-0.105572,0.038272,0.15703,-0.032962,-0.024896,...,0.096685,0.0008,0.006442,0.027645,0.119627,0.023392,0.042866,-0.068658,0.035295,0.011612
art,-0.016386,0.045725,0.013237,0.014702,0.018767,-0.064087,0.021166,0.100411,-0.028165,-0.019488,...,0.067366,0.001592,0.008261,0.020727,0.078392,0.025389,0.030231,-0.045647,0.016934,0.005974
presentation,-0.010004,0.009608,0.003109,0.003611,0.005936,-0.018002,-0.00592,0.011968,-0.010418,0.000565,...,0.011392,-0.000382,0.007544,0.011926,0.013851,0.005325,0.010757,-0.011546,0.000367,-0.003759
must,-0.024617,0.03218,0.009936,-0.004354,0.001581,-0.04202,0.010864,0.053996,-0.024891,-0.023227,...,0.039781,0.007804,0.010137,0.017333,0.050631,0.017311,0.030486,-0.018036,0.01395,-0.007909
redundant,-0.01531,0.009656,0.00555,0.005277,-0.004854,-0.008852,0.003747,0.018988,-0.009624,-0.00193,...,0.019981,0.001689,0.004451,0.010457,0.012715,0.000691,0.010179,-0.010089,0.013337,0.000728
pretty,-0.021993,0.024187,0.000147,0.014722,0.01424,-0.036651,0.004283,0.057256,-0.009124,-0.009376,...,0.041898,0.00264,0.010316,0.012684,0.038229,0.015301,0.017615,-0.013905,0.015863,7.8e-05
mount,-0.009407,0.009285,0.009509,0.001625,0.008588,-0.03024,0.013875,0.034528,-0.010262,-0.008561,...,0.024257,-0.00944,-0.002376,0.012854,0.037366,0.002238,0.007534,-0.01275,0.003724,0.000928
post,-0.025109,0.056296,0.016795,0.016873,0.016569,-0.080116,0.030177,0.125217,-0.036432,-0.032682,...,0.081443,0.005814,0.006943,0.020618,0.097194,0.028978,0.040613,-0.047917,0.028365,0.005913
online,-0.050004,0.067124,0.027224,0.011831,0.019347,-0.1248,0.041331,0.186721,-0.052309,-0.045463,...,0.098572,-0.003775,0.019467,0.027906,0.134389,0.035711,0.050215,-0.079615,0.048379,0.007386


In [10]:
# generate document vectors from word vectors
# Each document becomes the sum of its in-vocabulary word vectors divided by
# the document's total token count; an empty document stays all-zeros.
document_vectors = []

# set membership is O(1); testing against the index_to_key list directly
# would rescan the whole vocabulary for every token
words = set(model.wv.index_to_key)

for document in documents:
    # size the accumulator from the model instead of hard-coding 100
    document_vector = np.zeros(model.wv.vector_size)
    for word in document:
        if word in words:
            document_vector += model.wv[word]
    # NOTE(review): the divisor counts every token, including out-of-vocabulary
    # ones that contributed nothing to the sum. Kept as-is so the features are
    # unchanged; confirm whether dividing by the in-vocabulary count is intended.
    if len(document) > 0:
        document_vector = document_vector / len(document)
    document_vectors.append(document_vector)

# (number of documents, vector dimensionality)
len(document_vectors), len(document_vectors[0])

(3192, 100)

In [11]:
# stack the per-document vectors into a dataframe (one row per document)
# and attach the class label from the source data as the last column
document_matrix = pd.DataFrame(document_vectors).assign(Sentiment=df['Sentiment'])

# report (rows, columns), then preview the first five rows
display(document_matrix.shape)
document_matrix.head(5)

(3192, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
0,-0.10126,0.172969,0.074064,0.037569,0.042028,-0.270767,0.080114,0.420502,-0.107179,-0.086516,...,0.009125,0.040415,0.059732,0.317194,0.08835,0.142855,-0.176983,0.111694,0.013735,positive
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157,positive
2,-0.079655,0.140546,0.05799,0.033934,0.03181,-0.217193,0.069451,0.338383,-0.087553,-0.06952,...,0.007707,0.029304,0.051794,0.255313,0.07238,0.114715,-0.143702,0.092664,0.011519,neutral
3,-0.100543,0.177276,0.070339,0.049749,0.046601,-0.305025,0.080718,0.473022,-0.124815,-0.099016,...,0.031143,0.062889,0.067259,0.353135,0.117301,0.137864,-0.187069,0.116288,0.022561,positive
4,-0.067153,0.115755,0.05176,0.02575,0.026858,-0.184075,0.056593,0.280819,-0.073858,-0.058905,...,0.006068,0.023441,0.042702,0.210729,0.060377,0.093277,-0.121066,0.076454,0.008403,neutral
