In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from gensim.models import Word2Vec, KeyedVectors

from sklearn.model_selection import train_test_split

In [2]:
# Read the three preproc_* CSV files, one DataFrame per file, in this order.
no_trans_stem, stem, long_cmts = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preproc_no_trans_stem.csv',
        '../data/preproc_stem.csv',
        '../data/preproc_long.csv',
    )
)

Build the Word2Vec (w2v) models and save their word vectors.

In [3]:
def tokenize_bodies(bodies):
    """Split each comment body into a list of whitespace-separated tokens.

    Parameters
    ----------
    bodies : iterable
        Comment bodies. Non-string values are converted with ``str`` first.

    Returns
    -------
    list of list of str
        One token list per input body, in input order. A missing body
        (NaN / None) yields an empty list rather than the spurious token
        ``'nan'`` that ``str(float('nan')).split()`` would produce.
    """
    return [str(body).split() if pd.notna(body) else [] for body in bodies]


# Work on copies of the loaded frames.
w2v_no_trans_stem = no_trans_stem.copy()
w2v_stem = stem.copy()
w2v_long = long_cmts.copy()

# One token list per row of each frame's `body` column.
sentences_w2v_no_trans_stem = tokenize_bodies(w2v_no_trans_stem.body)
sentences_w2v_stem = tokenize_bodies(w2v_stem.body)
sentences_w2v_long = tokenize_bodies(w2v_long.body)

In [4]:
# Shared settings for the Word2Vec models trained in this notebook.
seed = 42            # passed as `seed=` to each Word2Vec constructor
min_word_count = 1   # presumably intended for Word2Vec's `min_count` -- confirm
vector_size = 300    # passed as `size=` (embedding dimensionality)

In [17]:
# Train a Word2Vec model on the tokenized no_trans_stem sentences.
# NOTE(review): min_count is hard-coded to 2 here although the config cell
# defines min_word_count = 1 -- confirm which value is intended.
word2vec = Word2Vec(
    sentences=sentences_w2v_no_trans_stem,
    size=vector_size,
    min_count=2,
    seed=seed,
)

# Quick sanity check: nearest neighbours of 'like'.
# NOTE(review): the recorded output lists every neighbour at ~0.9999
# similarity, which suggests the vectors are barely differentiated -- verify
# the training setup before relying on these embeddings.
word2vec.wv.most_similar('like')

[('his', 0.9999800324440002),
 ('at', 0.9999799728393555),
 ('have', 0.9999797344207764),
 ('time', 0.9999778866767883),
 ('what', 0.9999778270721436),
 ('got', 0.9999774098396301),
 ('make', 0.999977171421051),
 ('as', 0.9999769926071167),
 ('has', 0.9999761581420898),
 ('only', 0.9999754428863525)]

In [18]:
# Train a Word2Vec model on the tokenized stem sentences.
# NOTE(review): min_count is hard-coded to 2 here although the config cell
# defines min_word_count = 1 -- confirm which value is intended.
word2vec_stem = Word2Vec(
    sentences=sentences_w2v_stem,
    size=vector_size,
    min_count=2,
    seed=seed,
)

# Quick sanity check: nearest neighbours of 'like'.
# NOTE(review): the recorded output lists every neighbour at ~0.9999
# similarity, which suggests the vectors are barely differentiated -- verify
# the training setup before relying on these embeddings.
word2vec_stem.wv.most_similar('like')

[('make', 0.9999781250953674),
 ('not', 0.9999761581420898),
 ('it', 0.9999755620956421),
 ('no', 0.9999703168869019),
 ('know', 0.9999701976776123),
 ('would', 0.9999694228172302),
 ('coment', 0.9999693632125854),
 ('get', 0.9999691843986511),
 ('say', 0.9999687075614929),
 ('one', 0.9999677538871765)]

In [8]:
# Train a Word2Vec model on the tokenized long-comment sentences.
# min_count comes from the config constant (min_word_count = 1) instead of a
# repeated literal, so the value is unchanged but now has a single source.
word2vec_long = Word2Vec(
    sentences=sentences_w2v_long,
    size=vector_size,
    min_count=min_word_count,
    seed=seed,
)

# Quick sanity check: nearest neighbours of 'love'.
# NOTE(review): the recorded output lists every neighbour at ~0.9999
# similarity, which suggests the vectors are barely differentiated -- verify
# the training setup before relying on these embeddings.
word2vec_long.wv.most_similar('love')

[('his', 0.9999637603759766),
 ('like', 0.9999631643295288),
 ('with', 0.9999629259109497),
 ('at', 0.9999586343765259),
 ('its', 0.9999579787254333),
 ('from', 0.9999533891677856),
 ('get', 0.999952495098114),
 ('about', 0.9999516010284424),
 ('even', 0.9999508857727051),
 ('can', 0.9999507069587708)]

In [15]:
# Persist only the word vectors (each model's `.wv`), not the full models.
word_vectors = word2vec.wv
word_vectors_stem = word2vec_stem.wv

word_vectors.save('../models/w2v/word2vec.wordvectors')
word_vectors_stem.save('../models/w2v/word2vec_stem.wordvectors')

In [9]:
# Persist the long-comment word vectors.
# The vectors get their own name (matching word_vectors / word_vectors_stem)
# instead of rebinding `word2vec_long`, so that name keeps referring to the
# trained model and the training cell's `.wv` access stays valid on re-run.
word_vectors_long = word2vec_long.wv
word_vectors_long.save('../models/w2v/word2vec_long.wordvectors')

Clean up the dataset to be saved as training and testing datasets.

First, check whether the negative and positive comments are balanced in number.

In [11]:
# Number of comments per rating value in the no_trans_stem dataset.
no_trans_stem['rating'].value_counts()

-1    3053
 0    1950
 1    1379
Name: rating, dtype: int64

In [10]:
# Number of comments per rating value in the long-comments dataset.
long_cmts['rating'].value_counts()

-1    1798
 0    1102
 1     712
Name: rating, dtype: int64

Drop the neutral-sentiment comments (rating == -1), then split the remaining comments into separate positive (rating == 1) and negative (rating == 0) datasets.

In [15]:
# Discard rows rated -1. Note that this rebinds the original frame names,
# so the unfiltered frames are no longer available after this cell.
no_trans_stem = no_trans_stem[no_trans_stem['rating'] != -1]
stem = stem[stem['rating'] != -1]

# rating == 1 -> *_pos, rating == 0 -> *_neg
no_trans_stem_pos = no_trans_stem[no_trans_stem['rating'] == 1]
no_trans_stem_neg = no_trans_stem[no_trans_stem['rating'] == 0]

stem_pos = stem[stem['rating'] == 1]
stem_neg = stem[stem['rating'] == 0]

In [11]:
# Discard rows rated -1 (rebinds `long_cmts` to the filtered frame).
long_cmts = long_cmts[long_cmts['rating'] != -1]

# rating == 0 -> long_cmts_neg, rating == 1 -> long_cmts_pos
long_cmts_neg = long_cmts[long_cmts['rating'] == 0]
long_cmts_pos = long_cmts[long_cmts['rating'] == 1]

In [17]:
# Make sure everything is split correctly
print('No stem or trans pos length: %d, no stem or trans neg length: %d' % (len(no_trans_stem_pos), len(no_trans_stem_neg)))
print('Stem pos length: %d, stem neg length: %d' % (len(stem_pos), len(stem_neg)))

No stem or trans pos length: 1379, no stem or trans neg length: 1950
Stem pos length: 1379, stem neg length: 1950


In [18]:
#Write these datasets to file
no_trans_stem_pos.to_csv('../data/prepared/no_trans_stem_pos.csv', index=False)
no_trans_stem_neg.to_csv('../data/prepared/no_trans_stem_neg.csv', index=False)
stem_pos.to_csv('../data/prepared/stem_pos.csv', index=False)
stem_neg.to_csv('../data/prepared/stem_neg.csv', index=False)

In [12]:
# Write the long-comment splits to CSV, without the index column.
for _frame, _csv_path in (
    (long_cmts_pos, '../data/prepared/long_pos.csv'),
    (long_cmts_neg, '../data/prepared/long_neg.csv'),
):
    _frame.to_csv(_csv_path, index=False)