In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

import functions as f



## Functions

word -> text -> text_corpus
token -> text_token -> tokens_corpus
ind -> text_ind -> ind_corpus
corpus_counter

text: a string of words -> word

text_token: a list of tokens -> token

text_ind: a list of token indices -> ind

text_corpus: a list of strings of words

tokens_corpus: a list of token lists

corpus_counter: dictionary of words and their frequencies

### Text preprocessing

Creating word tokens

In [2]:
# Load the full reviews dataset from CSV.
df = pd.read_csv('data/reviews_toys_games.csv')
# Preview 5 random rows; no random_state is passed, so the preview differs between runs.
df.sample(5)

Unnamed: 0,review,sentiment
81660,Awesome snowball maker! A must-have for snow p...,1
822458,Perfect for a child and free on-line lessons w...,1
543013,I love this product because my granddaughter l...,1
925083,ok for the money,1
1019005,"In a crowded field of electronic talking toys,...",1


In [3]:
# Mean of the sentiment labels over the full dataset
# (the share of positive reviews if labels are 0/1 -- TODO confirm the encoding).
# Series.mean() is vectorised; Python's built-in sum() walks the Series row by row.
df['sentiment'].mean()

0.914674872933972

In [4]:
# Draw a 100,000-row working sample with a fixed seed and renumber its rows from 0.
df_sample = df.sample(100000, random_state=11).reset_index(drop=True)
# Mean sentiment label of the sample, to compare against the full-dataset value above.
# Series.mean() is vectorised; Python's built-in sum() walks the Series row by row.
df_sample['sentiment'].mean()

0.9148

In [5]:
# Pull the text and label columns of the working sample out into plain Python lists.
reviews = df_sample['review'].to_list()
sentiment = df_sample['sentiment'].to_list()

# Disabled alternative: use the first 10,000 rows of the full dataset instead of the sample.
#reviews = df['review'][:10000].to_list()
#sentiment = df['sentiment'][:10000].to_list()

In [6]:
# Normalise the raw review strings with the project helper.
# Later cells treat the result as one list of tokens per review.
reviews_tokens = f.normalize_text(reviews)

In [7]:
# Count token frequencies over the corpus; the helper also returns the dictionary size.
corpus_counter, n_tokens = f.distinct_corpus_words(reviews_tokens)

print('Words in the dictionary: ', n_tokens)
# corpus_counter is sliceable; per the recorded output it holds (word, count) pairs,
# most frequent first.
print('Most common words: ', corpus_counter[:10])

Words in the dictionary:  45629
Most common words:  [('love', 42733), ('great', 26982), ('play', 20313), ('toy', 16773), ('old', 16037), ('like', 15440), ('buy', 14097), ('kid', 13786), ('game', 13429), ('fun', 13182)]


In [35]:
# Disabled one-off export of the tokenised reviews and their labels.
# NOTE(review): the "Creating word vectors" section reloads data/reviews_tokens_10k.pickle,
# so this must be re-enabled (or the file must already exist) for a fresh run to work.
#with open('data/reviews_tokens_10k.pickle', 'wb') as handle:
#    pickle.dump(reviews_tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
#with open('data/sentiment_10k.pickle', 'wb') as handle:
#    pickle.dump(sentiment, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [76]:
# Disabled one-off export of the working sample to CSV (index column omitted).
#df_sample.to_csv('data/reviews_toys_games_100k.csv', index=False)

In [None]:
# Drop the references to the preprocessing intermediates by rebinding every name to None.
df = reviews = reviews_tokens = sentiment = None

## Creating word vectors

In [2]:
# Reload the tokenised reviews from disk.
# pickle.load can execute arbitrary code, so only open files produced by this project.
# NOTE(review): despite the `10k` file name, the matrices built from this data
# further down report 100000 rows -- confirm which export this file holds.
with open('data/reviews_tokens_10k.pickle', 'rb') as tokens_file:
    reviews_tokens = pickle.load(tokens_file)

In [8]:
# Recompute the frequency table and dictionary size for the loaded corpus.
corpus_counter, n_tokens = f.distinct_corpus_words(reviews_tokens)
# Build the token -> index and index -> token lookups from the frequency table.
word2ind, ind2word = f.build_dictionary(corpus_counter)
# Encode every review as a list of token indices.
reviews_ind = [
    f.text_token2ind(tokens, word2ind)
    for tokens in reviews_tokens
]

In [11]:
# Persist the index-encoded reviews for later reuse.
with open('data/reviews_ind_100k.pickle', 'wb') as ind_file:
    pickle.dump(reviews_ind, ind_file, protocol=pickle.HIGHEST_PROTOCOL)

### Bag of words

In [37]:
# Build the bag-of-words matrix from the index-encoded reviews.
# The recorded shape is (100000, 6000), so the last argument appears to cap the
# number of token columns -- TODO confirm against f.build_bow.
S_bow = f.build_bow(reviews_ind, n_tokens, 6000)

In [38]:
# Sanity check: one row per review, one column per kept token.
S_bow.shape

(100000, 6000)

In [49]:
# Spot-check one review: compare its tokens with the counts stored in the BOW matrix.
ind_test = 111
# Read the two cells straight from a CSR view instead of calling `.todense()` twice:
# densifying the whole reviews x tokens matrix just to read two values wastes
# gigabytes of memory. `.tocsr()` guarantees a format that supports [row, col] access.
S_bow_csr = S_bow.tocsr()
print('text of test review: ', reviews_tokens[ind_test])
print('occurrences of words:',
      '\nfigure:', S_bow_csr[ind_test, word2ind['figure']],
      '\nlove:', S_bow_csr[ind_test, word2ind['love']])

text of test review:  ['great', 'figure', 'right', 'price', 'love', 'amaze', 'figure', 'see']
occurrences of words: 
figure: 2 
love: 1


In [50]:
# Persist the bag-of-words matrix, then drop the in-memory reference to it.
with open('data/M_bow_10k.pickle', 'wb') as bow_file:
    pickle.dump(S_bow, bow_file, protocol=pickle.HIGHEST_PROTOCOL)

S_bow = None

#### Convert string to BOW vector

In [51]:
# Hand-written review used to demonstrate each vectoriser on unseen text.
# It contains out-of-vocabulary words ('popolsku', 'behavoir'); the index lookup
# in the next cell returns fewer indices than tokens because of them.
review_test = 'This game is amazing ^^, my son plays with it all the time! popolsku behavoir'

In [52]:
# Normalise the single test string with the project helper.
review_tokens_test = f.normalize_single_text(review_test)
print(review_tokens_test)

# Map the tokens to vocabulary indices; per the recorded output, tokens that are
# missing from word2ind are dropped (7 tokens in, 5 indices out).
review_ind_test = f.text_token2ind(review_tokens_test, word2ind)
print(review_ind_test)

['game', 'amaze', 'son', 'play', 'time', 'popolsku', 'behavoir']
[8, 171, 20, 2, 18]


In [53]:
# Bag-of-words vector for the single test review, using the same n_tokens / 6000
# arguments as the corpus-level build_bow call.
f.build_single_bow(review_ind_test, n_tokens, 6000)

array([0., 0., 1., ..., 0., 0., 0.])

## TFIDF

In [77]:
# TF-IDF vectoriser limited to 6000 features.
vectorizer = TfidfVectorizer(max_features=6000)
# The vectoriser expects raw strings, so each token list is joined back with spaces.
# NOTE(review): TfidfVectorizer then re-tokenises with its own default analyzer, so
# its vocabulary is not guaranteed to match word2ind -- verify if the two are compared.
M_tfidf = vectorizer.fit_transform([' '.join(r) for r in reviews_tokens])

In [78]:
# Sanity check: one row per review, max_features columns.
M_tfidf.shape

(100000, 6000)

In [79]:
# Spot-check one review: compare its tokens with the TF-IDF weights stored for it.
ind_test = 111
print('text of test review: ', reviews_tokens[ind_test])
# Index the sparse matrix directly rather than calling `.todense()` on it twice:
# densifying the whole reviews x features matrix to read two cells wastes
# gigabytes of memory. fit_transform returns a CSR matrix, which supports [row, col].
print('TFIDF of words:',
      '\nfigure:', M_tfidf[ind_test, vectorizer.vocabulary_['figure']],
      '\nlove:', M_tfidf[ind_test, vectorizer.vocabulary_['love']])

text of test review:  ['great', 'figure', 'right', 'price', 'love', 'amaze', 'figure', 'see']
TFIDF of words: 
figure: 0.6571834159729407 
love: 0.15079671030826042


In [80]:
# Persist the TF-IDF matrix ...
with open('data/M_tfidf_10kx.pickle', 'wb') as matrix_file:
    pickle.dump(M_tfidf, matrix_file, protocol=pickle.HIGHEST_PROTOCOL)

# ... and the fitted vectoriser, which is needed to transform new text later.
with open('data/tfidf_vectorizer_10kx.pickle', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Drop the in-memory reference to the matrix; the vectoriser stays in scope.
M_tfidf = None

#### Convert test string into TFIDF vector

In [58]:
# Transform the normalised test review with the already-fitted vectoriser.
# NOTE(review): the recorded output reports 45595 columns, not the 6000 set via
# max_features above -- it looks stale (from an earlier vectoriser); re-run to refresh.
vectorizer.transform([' '.join(review_tokens_test)])

<1x45595 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

## SVD & NMF

In [59]:
# Build the co-occurrence matrix from the index-encoded reviews with a window of 4.
S_co_occurrence = f.build_co_occurrence_matrix(reviews_ind, n_tokens, window_size=4)
# Reduce it to 5 dimensions with the helper's 'svd' method
# (presumably one 5-dim vector per vocabulary entry -- TODO confirm).
svd_reduced_co_occurrence = f.matrix_reduce(S_co_occurrence, method='svd', n_dim=5)

In [60]:
# One embedding row per review, computed by the project helper from the SVD-reduced
# matrix and stacked into a single 2-D array.
M_svd = np.stack([
    f.avg_svd_embeddings(review_ind, svd_reduced_co_occurrence, word2ind)
    for review_ind in reviews_ind
])

In [61]:
# Display the per-review SVD features (numpy abbreviates the middle rows).
M_svd

array([[1911.32207339,  -42.3008621 , -106.35712762,  210.05471684,
        -132.18560821],
       [3295.92654462,  -56.23929451, -170.39622825,  528.90670812,
        -241.31622704],
       [6867.38013304,  903.19521856, -400.92224674,  805.06355755,
        -199.5148494 ],
       ...,
       [3972.9144837 ,   30.86035388, -292.65034272,  950.86000508,
        -256.2331357 ],
       [1786.67156989, -840.48356118,    8.95530309,  -40.0365764 ,
        -613.58495123],
       [1798.51858078, -323.86781434,   12.39257531, -192.91452964,
         260.19561892]])

In [62]:
# Sanity check: one row per review, n_dim columns.
M_svd.shape

(100000, 5)

In [63]:
# Persist the SVD features, then drop the in-memory reference to them.
with open('data/M_svd_10k.pickle', 'wb') as svd_file:
    pickle.dump(M_svd, svd_file, protocol=pickle.HIGHEST_PROTOCOL)

M_svd = None

In [64]:
# Reduce the same co-occurrence matrix to 5 dimensions, this time with the 'nmf' method.
nmf_reduced_co_occurrence = f.matrix_reduce(S_co_occurrence, method='nmf', n_dim=5)
# The helper named avg_svd_embeddings is reused here on the NMF-reduced matrix.
M_nmf = np.stack([
    f.avg_svd_embeddings(review_ind, nmf_reduced_co_occurrence, word2ind)
    for review_ind in reviews_ind
])



In [65]:
# Display the per-review NMF features (numpy abbreviates the middle rows).
M_nmf

array([[ 5.60028164,  2.57640671,  2.69019925,  7.05719044,  2.88720161],
       [ 9.33532712,  3.50661315,  4.50025913, 13.47148637,  4.54616834],
       [12.98326499, 12.95530018, 15.3185457 , 29.26370138,  8.88202786],
       ...,
       [ 9.86596451,  3.95031209,  4.99392973, 18.75379105,  5.03843315],
       [12.51094802,  0.09438544,  0.11250818,  2.27895359,  1.28957891],
       [ 4.79368268,  2.69853506,  2.86355823,  2.46593873,  8.70588732]])

In [66]:
# Sanity check: one row per review, n_dim columns.
M_nmf.shape

(100000, 5)

In [67]:
# Persist the NMF features, then drop the in-memory reference to them.
with open('data/M_nmf_10k.pickle', 'wb') as nmf_file:
    pickle.dump(M_nmf, nmf_file, protocol=pickle.HIGHEST_PROTOCOL)

M_nmf = None

#### Convert test string into SVD vector

In [68]:
# SVD feature vector for the single test review, built from its vocabulary indices.
f.avg_svd_embeddings(review_ind_test, svd_reduced_co_occurrence, word2ind)

array([4686.7329675 , -783.00008166,   60.1869311 , -652.26470709,
       1337.28247198])

## Word2Vec

In [69]:
# Configure a Word2Vec model; the vocabulary is built and training is run in the
# following cells. Settings: 100-dim vectors, context window of 3, min_count of 5,
# 3 worker threads.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0 renamed it to
# `vector_size`, so this call needs updating when gensim is upgraded.
# NOTE(review): with workers=3 the trained vectors are not expected to be
# bit-for-bit reproducible between runs -- confirm if reproducibility matters.
w2v_model = Word2Vec(min_count=5,
                     window=3,
                     size=100,
                     workers=3)

In [70]:
# Build the model vocabulary from the tokenised reviews.
w2v_model.build_vocab(reviews_tokens)

In [71]:
# Train for 30 epochs over the same corpus, passing the model's own corpus_count
# as the number of examples.
w2v_model.train(reviews_tokens, total_examples=w2v_model.corpus_count, epochs=30)

(42808281, 50905110)

In [72]:
# One embedding row per review, computed by the project helper from the trained
# Word2Vec model and stacked into a single 2-D array.
M_word2vec = np.stack([
    f.avg_w2v_embeddings(tokens, w2v_model)
    for tokens in reviews_tokens
])

In [73]:
# Sanity check: one row per review, one column per embedding dimension.
M_word2vec.shape

(100000, 100)

In [74]:
# Persist the Word2Vec features, then drop the in-memory reference to them.
with open('data/M_word2vec_10k.pickle', 'wb') as w2v_file:
    pickle.dump(M_word2vec, w2v_file, protocol=pickle.HIGHEST_PROTOCOL)

M_word2vec = None

#### Convert test string into Word2Vec vector


In [75]:
# Word2Vec feature vector for the single test review, built from its normalised tokens.
f.avg_w2v_embeddings(review_tokens_test, w2v_model)

array([-0.5881122 , -0.09375161,  0.3112844 , -0.6094233 , -0.01624112,
       -0.35252064, -0.04945514,  1.472853  , -0.610626  ,  0.60974365,
        0.19933842, -0.34043843,  0.1133698 , -0.508508  , -0.4323098 ,
        0.30448467, -0.32596973, -0.9389588 , -0.37004104, -0.22721644,
       -0.62874615,  0.26884118, -0.42678374, -0.85855293, -0.06089675,
       -0.21786563, -0.06300454, -0.5797468 ,  0.25479466, -0.62527835,
       -0.35483462,  0.45905322,  0.30274215,  1.3049182 ,  1.3136592 ,
       -0.3332328 , -0.89768445, -0.8766676 , -0.16467395,  0.21572284,
       -0.3336801 , -0.67071676,  0.31391215, -1.3378918 , -0.45184407,
       -0.00443459,  1.0300862 ,  0.01396239, -0.23985538,  0.14696845,
        1.2677801 ,  0.0497361 , -0.91047704, -0.41100398,  0.54817307,
       -0.09938812,  0.20715058,  0.1469661 ,  0.32946473,  0.35771504,
       -0.14450638, -1.1337442 ,  0.5253822 , -0.81195945, -0.79146224,
        0.12864593,  0.13642989, -0.75450695, -0.14526239, -0.26