In [1]:
import numpy as np
import pandas as pd
import string
import pickle
from collections import Counter
import scipy

from sklearn.feature_extraction import stop_words
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import stem

from gensim.models import Word2Vec



## Functions

word -> text -> text_corpus
token -> text_token -> tokens_corpus
ind -> text_ind -> ind_corpus
corpus_counter

text: string ze słowami -> word

text_token: lista z tokenami -> token

text_ind: lista z indexami tokenów -> ind

text_corpus: lista stringów ze słowami

tokens_corpus: lista list z tokenami

corpus_counter: słownik ze słowami i częstotliwościami

In [2]:
translation_table = str.maketrans('', '', string.punctuation)

def remove_stop_words(text_token):
    return [token for token in text_token if token not in stop_words.ENGLISH_STOP_WORDS]

def lem_words(text_token):
    lem = stem.WordNetLemmatizer()
    return [lem.lemmatize(token, pos='v') for token in text_token]

def distinct_corpus_words(tokens_corpus):
    tokens_corpus_flatten = [token for tokens in tokens_corpus for token in tokens]
    corpus_counter = Counter(tokens_corpus_flatten).most_common()
    return corpus_counter, len(corpus_counter)

In [3]:
def normalize_single_text(text, translation_table=translation_table):
    text = str(text).lower()
    text = text.translate(translation_table)

    text_tokens = nltk.word_tokenize(text)
    text_tokens = remove_stop_words(text_tokens)
    text_tokens = lem_words(text_tokens)
    
    return text_tokens

def normalize_text(text_corpus, translation_table=translation_table):
    return [normalize_single_text(text, translation_table=translation_table) for text in text_corpus]

In [4]:
def build_dictionary(corpus_counter):
    word2ind = {corpus_counter[i][0]: i for i in range(len(corpus_counter))}
    ind2word = [word[0] for word in corpus_counter]
    return word2ind, ind2word

def text_token2ind(text_token, word2ind):
    return [word2ind[token] for token in text_token if token in word2ind]

In [5]:
def build_bow(ind_corpus, n_tokens, max_features=None):
    """Max features takes n features with the lowest index - assumes that lower index -> higher number of occurrences"""
    
    if max_features:
        ind_corpus = [[ind for ind in text_ind if ind < max_features] for text_ind in ind_corpus]
        n_tokens = max_features

    values = []
    col_ind = []
    row_pointer = [0]

    for features in ind_corpus:
        feature_counter = Counter(features)
        col_ind.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        row_pointer.append(len(values))

    S = scipy.sparse.csr_matrix((values, col_ind, row_pointer),
                                       shape=(len(row_pointer) - 1, n_tokens),
                                       dtype=int)
    
    return S

In [6]:
def build_single_bow(text_ind, n_tokens, max_features=None):
    if max_features:
        text_ind = [ind for ind in text_ind if ind < max_features]
        n_tokens = max_features

    single_bow = np.zeros(n_tokens)
    for ind in text_ind:
        single_bow[ind] += 1
    
    return single_bow

In [7]:
def build_co_occurrence_matrix(ind_corpus, n_tokens, window_size=4):
    row_ind = []
    col_ind = []
    values = []

    for text_ind in ind_corpus:
        for i in range(len(text_ind)):
            for j in range(max(i-window_size, 0), min(i + window_size + 1, len(text_ind))):
                if i != j:
                    row_ind.extend([text_ind[i]])
                    col_ind.extend([text_ind[j]])
                    values.extend([1])
    
    S = scipy.sparse.coo_matrix((values, (row_ind, col_ind)), shape=(n_tokens, n_tokens))

    return S

def matrix_reduce(M, method, n_dim=2, n_iter=10):
    
    try:
        if method == 'svd':
            decomposition = TruncatedSVD(n_components=n_dim, n_iter=n_iter)
        elif method == 'nmf':
            decomposition = NMF(n_components=n_dim)
        
        M_reduced = decomposition.fit_transform(M)
        return M_reduced
        
    except UnboundLocalError:
        print('Choose either svd or nmf method')

def avg_svd_embeddings(text_ind, reduced_co_occurrence_matrix, word2ind):
    i = len(text_ind)

    if i>=1:
        return sum([reduced_co_occurrence_matrix[ind] for ind in text_ind])/i
    return np.zeros(reduced_co_occurrence_matrix.shape[1])

In [8]:
def avg_w2v_embeddings(text_token, w2v_model):
    words = [token for token in text_token if token in w2v_model.wv.vocab]
    if len(words)>=1:
        return np.mean(w2v_model.wv[words], axis=0)
    return np.zeros(w2v_model.trainables.layer1_size)

### Text preprocessing

Creating word tokens

In [14]:
df = pd.read_csv('data/reviews_toys_games.csv')
df.sample(5)

Unnamed: 0,review,sentiment
737813,Another great doll added to our collection. Du...,1
1321082,As good as old.,1
1232672,This product works great on my RC car would de...,1
1118333,Love these. Put them away for x-mas,1
440936,Still loving it after two years,1


In [15]:
sum(df['sentiment'])/len(df)

0.914674872933972

In [16]:
#reviews = df['review'].to_list()
#sentiment = df['sentiment'].to_list()

reviews = df['review'][:10000].to_list()
sentiment = df['sentiment'][:10000].to_list()

In [17]:
reviews_tokens = normalize_text(reviews)

In [18]:
corpus_counter, n_tokens = distinct_corpus_words(reviews_tokens)

print('Words in the dictionary: ', n_tokens)
print('Most common words: ', corpus_counter[:10])

Words in the dictionary:  236419
Most common words:  [('love', 613117), ('great', 385104), ('play', 288507), ('toy', 242846), ('old', 228996), ('like', 218309), ('buy', 202374), ('kid', 196021), ('game', 193122), ('fun', 190516)]


In [19]:
with open('data/reviews_tokens_10k.pickle', 'wb') as handle:
    pickle.dump(reviews_tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('data/sentiment_10k.pickle', 'wb') as handle:
    pickle.dump(sentiment, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
df = None
reviews = None
reviews_tokens = None
sentiment = None

## Creating word vectors

In [9]:
with open('data/reviews_tokens_10k.pickle', 'rb') as handle:
    reviews_tokens = pickle.load(handle)

In [10]:
corpus_counter, n_tokens = distinct_corpus_words(reviews_tokens)
word2ind, ind2word = build_dictionary(corpus_counter)
reviews_ind = [text_token2ind(review, word2ind) for review in reviews_tokens]

### Bag of words

In [11]:
S_bow = build_bow(reviews_ind, n_tokens, 6000)

In [12]:
S_bow.shape

(1426227, 6000)

In [13]:
ind_test = 15
print('text of test review: ', reviews_tokens[ind_test])
print('occurrences of words:',
      '\nbook:', S_bow.todense()[ind_test, word2ind['book']],
      '\nreally:', S_bow.todense()[ind_test, word2ind['really']])

text of test review:  ['ok', 'really', 'consider', 'book', 'really', 'small', 'disappoint']
occurrences of words: 
book: 1 
really: 2


In [14]:
with open('data/M_bow_10k.pickle', 'wb') as handle:
    pickle.dump(S_bow, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
S_bow = None

#### Convert string to BOW vector

In [15]:
review_test = 'This game is amazing ^^, my son plays with it all the time! popolsku behavoir'

In [16]:
review_tokens_test = normalize_single_text(review_test)
print(review_tokens_test)

review_ind_test = text_token2ind(review_tokens_test, word2ind)
print(review_ind_test)

['game', 'amaze', 'son', 'play', 'time', 'popolsku', 'behavoir']
[0, 221, 30, 1, 9, 7999]


In [17]:
build_single_bow(review_ind_test, n_tokens, 6000)

array([1., 1., 0., ..., 0., 0., 0.])

## TFIDF

In [15]:
vectorizer = TfidfVectorizer()
M_tfidf = vectorizer.fit_transform([' '.join(r) for r in reviews_tokens])

In [16]:
M_tfidf.shape

(1426227, 236378)

In [17]:
ind_test = 15
print('text of test review: ', reviews_tokens[ind_test])
print('TFIDF of words:',
      '\nbook:', M_tfidf.todense()[ind_test, vectorizer.vocabulary_['book']],
      '\nreally:', M_tfidf.todense()[ind_test, vectorizer.vocabulary_['really']])

text of test review:  ['ok', 'really', 'consider', 'book', 'really', 'small', 'disappoint']


MemoryError: Unable to allocate 2.45 TiB for an array with shape (1426227, 236378) and data type float64

In [18]:
with open('data/M_tfidf_10k.pickle', 'wb') as handle:
    pickle.dump(M_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_tfidf = None

#### Convert test string into TFIDF vector

In [22]:
vectorizer.transform([' '.join(review_tokens_test)])

<1x13622 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

## SVD & NMF

In [19]:
S_co_occurrence = build_co_occurrence_matrix(reviews_ind, n_tokens, window_size=4)
svd_reduced_co_occurrence = matrix_reduce(S_co_occurrence, method='svd', n_dim=5)

In [20]:
M_svd = np.stack([avg_svd_embeddings(ind, svd_reduced_co_occurrence, word2ind) for ind in reviews_ind])

In [34]:
M_svd

array([[ 5.14323424e+01, -3.47109825e+00, -2.67020404e+00, ...,
        -1.88757093e+00,  6.84419604e-02,  4.51224859e+00],
       [ 2.16992669e+02,  1.05707348e+01, -1.46835127e+01, ...,
        -3.77584575e+00, -4.01213756e+00, -3.71439386e-01],
       [ 4.55199337e+02, -3.17093833e+01, -8.54671822e+01, ...,
         1.39537457e+01,  6.02229246e+00,  6.95342963e+00],
       ...,
       [ 2.63608525e+03, -1.08866951e+02,  2.31153097e+02, ...,
         4.89310910e+01, -1.53355284e+01, -1.04527500e+01],
       [ 6.57657544e+01, -6.06839347e+00, -9.53952804e+00, ...,
        -8.10784544e+00, -5.53772244e-01,  5.31086296e+00],
       [ 1.43775318e+02,  7.27239290e+01,  2.55859105e+01, ...,
         3.76106324e+01,  2.49329050e+00, -1.01655368e+01]])

In [21]:
M_svd.shape

(1426227, 5)

In [22]:
with open('data/M_svd_10k.pickle', 'wb') as handle:
    pickle.dump(M_svd, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_svd = None

In [23]:
nmf_reduced_co_occurrence = matrix_reduce(S_co_occurrence, method='nmf', n_dim=5)
M_nmf = np.stack([avg_svd_embeddings(ind, nmf_reduced_co_occurrence, word2ind) for ind in reviews_ind])

In [38]:
M_nmf

array([[ 0.21164795,  0.03531371,  0.31712811, ...,  0.02142679,
         0.07663577,  0.59293864],
       [ 0.7533052 ,  0.36481111,  1.62439824, ...,  0.16128918,
         0.473102  ,  1.15764214],
       [ 1.09361196,  0.25399942,  5.83577518, ...,  0.36386534,
         0.74307879,  2.78276504],
       ...,
       [10.79696704,  2.59788942, 17.33681545, ...,  8.44909389,
         2.12239207,  1.49585677],
       [ 0.42627795,  0.        ,  0.37320601, ...,  0.        ,
         0.09127283,  1.41276781],
       [ 0.        ,  0.7969728 ,  0.90182256, ...,  0.60171264,
         0.37780882,  0.69107105]])

In [24]:
M_nmf.shape

(1426227, 5)

In [25]:
with open('data/M_nmf_10k.pickle', 'wb') as handle:
    pickle.dump(M_nmf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_nmf = None

#### Convert test string into SVD vector

In [28]:
avg_svd_embeddings(review_ind_test, S_reduced_co_occurrence, word2ind)

array([ 1.77431155e+03, -5.75434644e+01,  8.09803398e+01, -1.17456111e+01,
       -4.33351774e+01,  3.25419129e+01,  5.43330545e+01, -2.48050289e+01,
        1.50935161e+00, -2.34834456e+01])

## Word2Vec

In [26]:
w2v_model = Word2Vec(min_count=5,
                     window=3,
                     size=100,
                     workers=3)

In [27]:
w2v_model.build_vocab(reviews_tokens)

In [28]:
w2v_model.train(reviews_tokens, total_examples=w2v_model.corpus_count, epochs=30)

(625688996, 726801870)

In [29]:
M_word2vec = np.stack([avg_w2v_embeddings(review, w2v_model) for review in reviews_tokens])

In [30]:
M_word2vec.shape

(1426227, 100)

In [31]:
with open('data/M_word2vec_10k.pickle', 'wb') as handle:
    pickle.dump(M_word2vec, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_word2vec = None

#### Convert test string into SVD vector


In [35]:
avg_w2v_embeddings(review_tokens_test, w2v_model)

array([-0.42165866, -0.19917676,  0.44199872,  0.6969125 ,  0.5052937 ,
       -0.05001388,  0.23804888, -0.16238654,  0.75656104,  0.10875399,
       -0.03664295,  0.22321317,  0.3091781 , -0.42105788,  0.6086213 ,
       -0.31654102, -0.65119016,  0.45273328,  0.01955453,  0.03324833,
        0.02535819, -0.10822463,  0.36700904, -0.3750741 , -0.27163497,
        0.33239564,  0.04524777, -0.00577122, -0.08206176, -0.5412153 ,
        0.09901704, -0.33308092,  0.09012656,  0.4178986 ,  0.2539987 ,
       -0.6296872 ,  0.80539304, -0.12282996,  0.23620553, -0.19840494,
       -0.03773532, -0.6750927 ,  0.24422427,  0.5969204 ,  0.41765147,
        0.29813904,  0.59395635, -0.39825255, -0.27801678, -0.4181188 ,
        0.13290724, -0.42702326,  0.03935174, -0.27779803,  0.11059706,
        0.35624462, -0.4419004 ,  0.1847095 , -0.19821852,  0.1428484 ,
       -0.49165335, -0.23037104, -0.01880818,  0.90063465,  0.27459517,
        0.10313277,  0.06069721,  0.3767677 ,  0.16848317,  0.03