In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from gensim.models import Word2Vec, KeyedVectors

from sklearn.model_selection import train_test_split

In [2]:
# Load the three preprocessed comment datasets (paths are relative to this notebook).
no_trans_stem, stem, long_cmts = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preproc_no_trans_stem.csv',
        '../data/preproc_stem.csv',
        '../data/preproc_long.csv',
    )
)

Build w2v models and save them

In [3]:
# Work on copies so the frames loaded earlier are not touched by this step.
w2v_no_trans_stem, w2v_stem, w2v_long = no_trans_stem.copy(), stem.copy(), long_cmts.copy()

# Word2Vec takes one token list per comment: split every body on whitespace.
# str() is applied first so that non-string bodies can still be split.
# NOTE(review): a missing body (NaN) would turn into the literal token 'nan' —
# confirm the preprocessed CSVs contain no empty bodies.
sentences_w2v_no_trans_stem = [str(text).split() for text in w2v_no_trans_stem['body']]
sentences_w2v_stem = [str(text).split() for text in w2v_stem['body']]
sentences_w2v_long = [str(text).split() for text in w2v_long['body']]

In [4]:
# Shared settings for the Word2Vec models built below.
seed = 42            # passed as Word2Vec's `seed`
min_word_count = 1   # presumably the minimum word frequency (Word2Vec `min_count`) — TODO confirm
vector_size = 300    # passed as Word2Vec's `size`

In [5]:
# Train a Word2Vec model on the `no_trans_stem` sentences.
# min_count is taken from the shared `min_word_count` setting instead of a repeated literal.
# NOTE(review): gensim documents that `seed` alone does not make training fully
# deterministic (that additionally needs workers=1 and a fixed PYTHONHASHSEED).
word2vec = Word2Vec(
    sentences=sentences_w2v_no_trans_stem,
    seed=seed,
    min_count=min_word_count,
    size=vector_size,
)
# Sanity check: nearest neighbours of 'love' in the trained vector space.
word2vec.wv.most_similar('love')

[('her', 0.9999401569366455),
 ('song', 0.9999393224716187),
 ('selena', 0.9999318718910217),
 ('videos', 0.9999314546585083),
 ('amazing', 0.9999299049377441),
 ('beautiful', 0.9999279975891113),
 ('so', 0.9999265074729919),
 ('actualy', 0.9999256134033203),
 ('when', 0.9999215602874756),
 ('funy', 0.9999215006828308)]

In [8]:
# Train a Word2Vec model on the `stem` sentences.
# min_count is taken from the shared `min_word_count` setting instead of a repeated literal.
word2vec_stem = Word2Vec(
    sentences=sentences_w2v_stem,
    seed=seed,
    min_count=min_word_count,
    size=vector_size,
)
# Sanity check: nearest neighbours of 'love' in the trained vector space.
word2vec_stem.wv.most_similar('love')

[('song', 0.9999301433563232),
 ('amaz', 0.9999276399612427),
 ('god', 0.999923050403595),
 ('her', 0.9999221563339233),
 ('so', 0.9999215006828308),
 ('funi', 0.9999207258224487),
 ('much', 0.9999203085899353),
 ('selena', 0.999920129776001),
 ('beauti', 0.9999179840087891),
 ('great', 0.999914824962616)]

In [7]:
# Train a Word2Vec model on the long-comment sentences.
# min_count is taken from the shared `min_word_count` setting instead of a repeated literal.
word2vec_long = Word2Vec(
    sentences=sentences_w2v_long,
    seed=seed,
    min_count=min_word_count,
    size=vector_size,
)
# Sanity check: nearest neighbours of 'love' in the trained vector space.
word2vec_long.wv.most_similar('love')

[('song', 0.9999653100967407),
 ('now', 0.9999630451202393),
 ('people', 0.9999624490737915),
 ('because', 0.9999600648880005),
 ('when', 0.9999589920043945),
 ('am', 0.9999585747718811),
 ('to', 0.9999572038650513),
 ('guy', 0.9999569654464722),
 ('about', 0.9999555349349976),
 ('cant', 0.9999550580978394)]

In [9]:
# Save word vectors
# Only each model's `.wv` (the trained word vectors) is written to disk.
word_vectors = word2vec.wv
word_vectors.save('../models/w2v/word2vec.wordvectors')
word_vectors_stem = word2vec_stem.wv
word_vectors_stem.save('../models/w2v/word2vec_stem.wordvectors')
# Use a dedicated name instead of rebinding `word2vec_long`: the model variable keeps
# referring to the model, matching the two cases above, and the cell stays re-runnable.
word_vectors_long = word2vec_long.wv
word_vectors_long.save('../models/w2v/word2vec_long.wordvectors')

Clean up the dataset to be saved as training and testing datasets.

First, check how many comments fall into each rating class, to see whether the negative and positive comments are balanced in number.

In [10]:
# Number of comments per rating value in the no_trans_stem dataset.
no_trans_stem['rating'].value_counts()

-1    6706
 0    4349
 1    3152
Name: rating, dtype: int64

In [14]:
# Number of comments per rating value in the long-comments dataset.
long_cmts['rating'].value_counts()

-1    3315
 0    1889
 1    1236
Name: rating, dtype: int64

Drop the neutral-sentiment comments (rating == -1), then split the positive (rating == 1) and negative (rating == 0) comments into separate datasets.

In [11]:
# Step 1: discard every row whose rating is -1 in each dataset.
no_trans_stem = no_trans_stem[no_trans_stem['rating'].ne(-1)]
stem = stem[stem['rating'].ne(-1)]
long_cmts = long_cmts[long_cmts['rating'].ne(-1)]

# Step 2: split what is left by rating: 1 -> *_pos, 0 -> *_neg.
no_trans_stem_pos = no_trans_stem[no_trans_stem['rating'].eq(1)]
no_trans_stem_neg = no_trans_stem[no_trans_stem['rating'].eq(0)]
stem_pos = stem[stem['rating'].eq(1)]
stem_neg = stem[stem['rating'].eq(0)]
long_cmts_pos = long_cmts[long_cmts['rating'].eq(1)]
long_cmts_neg = long_cmts[long_cmts['rating'].eq(0)]

In [12]:
# Make sure everything is split correctly
# Report the positive/negative row counts of every split produced by the previous cell.
print('No stem or trans pos length: %d, no stem or trans neg length: %d' % (len(no_trans_stem_pos), len(no_trans_stem_neg)))
print('Stem pos length: %d, stem neg length: %d' % (len(stem_pos), len(stem_neg)))
# The long-comment split is written to disk as well, so check it too.
print('Long pos length: %d, long neg length: %d' % (len(long_cmts_pos), len(long_cmts_neg)))

No stem or trans pos length: 3152, no stem or trans neg length: 4349
Stem pos length: 3152, stem neg length: 4349


In [13]:
#Write these datasets to file
# Each (frame, destination) pair is written without the index column.
for frame, csv_path in (
    (no_trans_stem_pos, '../data/prepared/no_trans_stem_pos.csv'),
    (no_trans_stem_neg, '../data/prepared/no_trans_stem_neg.csv'),
    (stem_pos, '../data/prepared/stem_pos.csv'),
    (stem_neg, '../data/prepared/stem_neg.csv'),
    (long_cmts_pos, '../data/prepared/long_pos.csv'),
    (long_cmts_neg, '../data/prepared/long_neg.csv'),
):
    frame.to_csv(csv_path, index=False)