In [1]:
import sys

# Make the project's local helper modules (e.g. word2vec_utils) importable.
# Guarded so that re-running this cell does not keep prepending duplicate
# entries to sys.path.
SCRIPTS_DIR = '../../scripts/'
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

In [2]:
# import required packages
import pandas as pd
import numpy as np

# word2vec
from gensim.models import Word2Vec

# local scripts
from word2vec_utils import fit_transform, transform

In [3]:
# load the training CSV into a dataframe
df = pd.read_csv(filepath_or_buffer='../../data/train_data.csv')

# report (rows, columns), then preview the first ten records
display(df.shape)
df.head(n=10)

(3192, 2)

Unnamed: 0,clean_sentence,Sentiment
0,upm kymmene one world leading printing paper p...,positive
1,nokia pct eur kicking morning negative territory,positive
2,vasantha appointed managing director incap con...,neutral
3,consolidated net sale increased reach eur oper...,positive
4,cabot export production mainly goodyear bridge...,neutral
5,fda approves shire vyvanse binge eating disorder,positive
6,finnish steel maker rautaruukki oyj ruukki sai...,positive
7,adp news feb finnish wood product technology s...,negative
8,hull vessel built one block time ruukki delive...,neutral
9,finnish honkarakenne specialises building log ...,neutral


In [4]:
"""
Not training a new word2vec model.
A model has already been trained and is loaded for further usage.
Uncomment the code block to train and save a new model.
"""
"""
Not training a new word2vec model.
A model has already been trained and is loaded for further usage.
Uncomment the code block to train and save a new model.
"""
# params = {
#     'window': 5,
#     'min_count': 2,
#     'epochs': 5
# }

# trains a word2vec model
# builds vocabulary
# returns document_matrix and model_path
# document_matrix, model_path = fit_transform(
#     corpus=df.clean_tweet, 
#     model_save_path='../../models/word2vec.model', 
#     params=params
# )

# loads a pre-trained model
# returns document_matrix and model_path
document_matrix, _ = transform(
    corpus=df.clean_sentence, 
    model_load_path='../../models/word2vec.model',
)

document_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.10126,0.172969,0.074064,0.037569,0.042028,-0.270767,0.080114,0.420502,-0.107179,-0.086516,...,0.242757,0.009125,0.040415,0.059732,0.317194,0.08835,0.142855,-0.176983,0.111694,0.013735
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.189665,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157
2,-0.079655,0.140546,0.05799,0.033934,0.03181,-0.217193,0.069451,0.338383,-0.087553,-0.06952,...,0.198599,0.007707,0.029304,0.051794,0.255313,0.07238,0.114715,-0.143702,0.092664,0.011519
3,-0.100543,0.177276,0.070339,0.049749,0.046601,-0.305025,0.080718,0.473022,-0.124815,-0.099016,...,0.24851,0.031143,0.062889,0.067259,0.353135,0.117301,0.137864,-0.187069,0.116288,0.022561
4,-0.067153,0.115755,0.05176,0.02575,0.026858,-0.184075,0.056593,0.280819,-0.073858,-0.058905,...,0.164316,0.006068,0.023441,0.042702,0.210729,0.060377,0.093277,-0.121066,0.076454,0.008403


In [5]:
# Build the modelling frame: the document-vector columns plus the class label.
# `assign` returns a new frame, so `document_matrix` itself never receives a
# 'Sentiment' column, whether or not pd.DataFrame(...) shares data with it.
# Labels are aligned on the row index — assumes `document_matrix` keeps the
# same index as `df` (TODO confirm against word2vec_utils.transform).
df_w2v = pd.DataFrame(document_matrix).assign(Sentiment=df['Sentiment'])

# report (rows, columns), then preview the first rows
display(df_w2v.shape)
df_w2v.head()

(3192, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Sentiment
0,-0.10126,0.172969,0.074064,0.037569,0.042028,-0.270767,0.080114,0.420502,-0.107179,-0.086516,...,0.009125,0.040415,0.059732,0.317194,0.08835,0.142855,-0.176983,0.111694,0.013735,positive
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157,positive
2,-0.079655,0.140546,0.05799,0.033934,0.03181,-0.217193,0.069451,0.338383,-0.087553,-0.06952,...,0.007707,0.029304,0.051794,0.255313,0.07238,0.114715,-0.143702,0.092664,0.011519,neutral
3,-0.100543,0.177276,0.070339,0.049749,0.046601,-0.305025,0.080718,0.473022,-0.124815,-0.099016,...,0.031143,0.062889,0.067259,0.353135,0.117301,0.137864,-0.187069,0.116288,0.022561,positive
4,-0.067153,0.115755,0.05176,0.02575,0.026858,-0.184075,0.056593,0.280819,-0.073858,-0.058905,...,0.006068,0.023441,0.042702,0.210729,0.060377,0.093277,-0.121066,0.076454,0.008403,neutral


In [6]:
# Load the saved Word2Vec model object itself (the same file path that was
# passed to transform() above) so its word vectors can be queried via
# `model.wv` in the following cells.
model = Word2Vec.load('../../models/word2vec.model')

In [7]:
# ten words whose vectors are most similar to 'profit',
# shown as (word, similarity score) pairs
display(model.wv.most_similar('profit', topn=10))

[('eur', 0.998862624168396),
 ('loss', 0.9987896084785461),
 ('million', 0.998157799243927),
 ('net', 0.998129665851593),
 ('operating', 0.9979076385498047),
 ('increased', 0.9978724122047424),
 ('quarter', 0.9977854490280151),
 ('year', 0.9974496960639954),
 ('sale', 0.9974429607391357),
 ('month', 0.9974063634872437)]

In [8]:
# cosine similarity between the vectors of two individual words
display(model.wv.similarity('profit', 'gain'))

0.98727024