In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

import functions as f



## Functions

word -> text -> text_corpus
token -> text_token -> tokens_corpus
ind -> text_ind -> ind_corpus
corpus_counter

text: a string of words -> word

text_token: a list of tokens -> token

text_ind: a list of token indices -> ind

text_corpus: a list of strings of words

tokens_corpus: a list of lists of tokens

corpus_counter: words with their frequencies, as a list of (word, count) pairs sorted by decreasing count

In [2]:
# Table for str.translate that deletes every ASCII punctuation character.
translation_table = str.maketrans(dict.fromkeys(string.punctuation))

def remove_stop_words(text_token):
    """Return the tokens of `text_token` that are not English stop words.

    Membership is tested against `stop_words.ENGLISH_STOP_WORDS`; the order of
    the remaining tokens is preserved.
    """
    kept = []
    for token in text_token:
        if token in stop_words.ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def lem_words(text_token):
    """Lemmatize every token as a verb (pos='v') with a WordNet lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmatizer = stem.WordNetLemmatizer()
    lemmas = []
    for token in text_token:
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

def distinct_corpus_words(tokens_corpus):
    """Count token occurrences over the whole corpus.

    Returns a tuple `(corpus_counter, n_distinct)` where `corpus_counter` is a
    list of (token, count) pairs ordered by decreasing count and `n_distinct`
    is the number of distinct tokens.
    """
    all_tokens = []
    for tokens in tokens_corpus:
        all_tokens.extend(tokens)

    ranked = Counter(all_tokens).most_common()
    return ranked, len(ranked)

In [3]:
def normalize_single_text(text, translation_table=translation_table):
    """Turn one raw text into a list of normalized tokens.

    Steps, in order: cast to str and lowercase, apply `translation_table` via
    str.translate, tokenize with nltk.word_tokenize, drop stop words, then
    lemmatize.
    """
    cleaned = str(text).lower().translate(translation_table)
    return lem_words(remove_stop_words(nltk.word_tokenize(cleaned)))

def normalize_text(text_corpus, translation_table=translation_table):
    """Normalize every text of `text_corpus`; returns one token list per text."""
    tokens_corpus = []
    for text in text_corpus:
        tokens_corpus.append(normalize_single_text(text, translation_table=translation_table))
    return tokens_corpus

In [4]:
def build_dictionary(corpus_counter):
    """Build both directions of the vocabulary mapping.

    `corpus_counter` is a sequence whose items carry the word at position 0.
    Returns `(word2ind, ind2word)`: a dict word -> position in
    `corpus_counter`, and the list of words in that same order.
    """
    ind2word = []
    word2ind = {}
    for position, entry in enumerate(corpus_counter):
        word = entry[0]
        ind2word.append(word)
        word2ind[word] = position
    return word2ind, ind2word

def text_token2ind(text_token, word2ind):
    """Map tokens to their indices, silently dropping tokens missing from `word2ind`."""
    indices = []
    for token in text_token:
        if token in word2ind:
            indices.append(word2ind[token])
    return indices

In [5]:
def build_bow(ind_corpus, n_tokens, max_features=None):
    """Build a sparse bag-of-words matrix (one row per text, one column per token index).

    When `max_features` is truthy, only indices below `max_features` are kept
    and the matrix gets `max_features` columns. This relies on lower indices
    belonging to more frequent tokens.

    Returns a scipy CSR matrix of integer counts.
    """
    if max_features:
        truncated = []
        for text_ind in ind_corpus:
            truncated.append([ind for ind in text_ind if ind < max_features])
        ind_corpus = truncated
        n_tokens = max_features

    # CSR triplet: counts, their column indices, and the row boundaries.
    data, indices, indptr = [], [], [0]
    for text_ind in ind_corpus:
        for column, count in Counter(text_ind).items():
            indices.append(column)
            data.append(count)
        indptr.append(len(data))

    n_rows = len(indptr) - 1
    return scipy.sparse.csr_matrix((data, indices, indptr),
                                   shape=(n_rows, n_tokens),
                                   dtype=int)

In [6]:
def build_single_bow(text_ind, n_tokens, max_features=None):
    """Build the dense bag-of-words vector of a single text.

    When `max_features` is truthy, indices at or above it are dropped and the
    vector has `max_features` entries; otherwise it has `n_tokens` entries.
    Returns a float numpy array of occurrence counts.
    """
    size = n_tokens
    kept = text_ind
    if max_features:
        kept = list(filter(lambda ind: ind < max_features, text_ind))
        size = max_features

    counts = np.zeros(size)
    for position in kept:
        counts[position] = counts[position] + 1

    return counts

In [7]:
def build_co_occurrence_matrix(ind_corpus, n_tokens, window_size=4):
    """Build a sparse (n_tokens x n_tokens) co-occurrence matrix.

    For every position in every text, each other position at most
    `window_size` steps away contributes one (row=token, col=neighbour) entry
    with value 1. The result is a scipy COO matrix, so repeated pairs are
    stored as separate entries.
    """
    rows = []
    cols = []

    for text_ind in ind_corpus:
        length = len(text_ind)
        for center, center_ind in enumerate(text_ind):
            first = max(center - window_size, 0)
            last = min(center + window_size + 1, length)
            for neighbour in range(first, last):
                if neighbour == center:
                    continue
                rows.append(center_ind)
                cols.append(text_ind[neighbour])

    ones = [1] * len(rows)
    return scipy.sparse.coo_matrix((ones, (rows, cols)), shape=(n_tokens, n_tokens))

def matrix_reduce(M, method, n_dim=2, n_iter=10, random_state=None):
    """Reduce the columns of `M` to `n_dim` dimensions.

    Parameters
    ----------
    M : matrix passed unchanged to the decomposition's `fit_transform`.
    method : 'svd' (TruncatedSVD) or 'nmf' (NMF).
    n_dim : number of components to keep.
    n_iter : number of iterations, used by the 'svd' method only.
    random_state : optional seed forwarded to the decomposition so that
        results can be reproduced; None keeps the library default.

    Returns
    -------
    The transformed matrix returned by `fit_transform`.

    Raises
    ------
    ValueError
        If `method` is neither 'svd' nor 'nmf'.
    """
    # Validate explicitly instead of catching UnboundLocalError: the old
    # pattern printed a hint and returned None, which only failed later and
    # could also mask unrelated errors raised inside fit_transform.
    if method == 'svd':
        decomposition = TruncatedSVD(n_components=n_dim, n_iter=n_iter,
                                     random_state=random_state)
    elif method == 'nmf':
        decomposition = NMF(n_components=n_dim, random_state=random_state)
    else:
        raise ValueError(f'Choose either svd or nmf method (got {method!r})')

    return decomposition.fit_transform(M)

def avg_svd_embeddings(text_ind, reduced_co_occurrence_matrix, word2ind):
    """Average the rows of `reduced_co_occurrence_matrix` selected by `text_ind`.

    An empty `text_ind` yields a zero vector whose length is the number of
    columns of the matrix. `word2ind` is accepted but not used.
    """
    n_indices = len(text_ind)
    if n_indices < 1:
        return np.zeros(reduced_co_occurrence_matrix.shape[1])

    total = 0
    for ind in text_ind:
        total = total + reduced_co_occurrence_matrix[ind]
    return total / n_indices

In [8]:
def avg_w2v_embeddings(text_token, w2v_model):
    """Average the Word2Vec vectors of the tokens known to `w2v_model`.

    Tokens absent from `w2v_model.wv.vocab` are ignored. If no token is known,
    a zero vector of length `w2v_model.trainables.layer1_size` is returned.
    """
    known = []
    for token in text_token:
        if token in w2v_model.wv.vocab:
            known.append(token)

    if len(known) < 1:
        return np.zeros(w2v_model.trainables.layer1_size)
    return np.mean(w2v_model.wv[known], axis=0)

### Text preprocessing

Creating word tokens

In [2]:
# Load the raw reviews and preview five random rows.
# NOTE(review): sample() has no random_state, so the preview changes on every run.
df = pd.read_csv('data/reviews_toys_games.csv')
df.sample(5)

Unnamed: 0,review,sentiment
874322,EXCELLENT PRODUCT AND PRICE,1
381124,These are really cute. The more you get in on...,1
1088190,Ordered for my granddaughter for christmas gif...,1
977103,Play-doh gives hours of fun. My kids can liter...,1
1155475,Very Pretty dice. Fun to play games with color...,1


In [3]:
# Mean of the sentiment column (presumably 0/1 labels, i.e. the share of
# positive reviews). The vectorised mean avoids iterating the Series in Python.
df['sentiment'].mean()

0.914674872933972

In [4]:
# Number of leading rows to keep so the notebook stays fast.
# Set N_REVIEWS = None to use the whole dataset (the pickle file names used
# further down still carry the "10k" suffix).
N_REVIEWS = 10000

reviews = df['review'][:N_REVIEWS].to_list()
sentiment = df['sentiment'][:N_REVIEWS].to_list()

In [5]:
# Normalize every review into a list of tokens (one list per review).
reviews_tokens = f.normalize_text(reviews)

In [6]:
# Token frequencies over the corpus and the number of distinct tokens.
corpus_counter, n_tokens = f.distinct_corpus_words(reviews_tokens)

print('Words in the dictionary: ', n_tokens)
print('Most common words: ', corpus_counter[:10])

Words in the dictionary:  13656
Most common words:  [('game', 9749), ('play', 5682), ('love', 4328), ('great', 3316), ('fun', 3309), ('kid', 2252), ('card', 2023), ('like', 2002), ('old', 1913), ('time', 1819)]


In [19]:
# Persist the tokenized reviews and their labels, one pickle file each.
for path, payload in (('data/reviews_tokens_10k.pickle', reviews_tokens),
                      ('data/sentiment_10k.pickle', sentiment)):
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Drop the references to the large objects; they are reloaded from disk when needed.
df = reviews = reviews_tokens = sentiment = None

## Creating word vectors

In [9]:
# Reload the tokenized reviews written by this notebook.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it on files you produced yourself.
with open('data/reviews_tokens_10k.pickle', 'rb') as handle:
    reviews_tokens = pickle.load(handle)

In [7]:
# Vocabulary ordered by decreasing frequency, then every review as a list of token indices.
corpus_counter, n_tokens = f.distinct_corpus_words(reviews_tokens)
word2ind, ind2word = f.build_dictionary(corpus_counter)
reviews_ind = list(map(lambda tokens: f.text_token2ind(tokens, word2ind), reviews_tokens))

### Bag of words

In [8]:
# Sparse bag-of-words matrix; the third argument (max_features) keeps only token indices below 6000.
S_bow = f.build_bow(reviews_ind, n_tokens, 6000)

In [9]:
# Shape check: (number of reviews, number of kept features).
S_bow.shape

(10000, 6000)

In [10]:
# Spot-check the counts of two words in one review.
ind_test = 15
print('text of test review: ', reviews_tokens[ind_test])
# Index the sparse matrix directly: calling .todense() would materialise the
# whole matrix (twice) just to read two cells.
print('occurrences of words:',
      '\nbook:', S_bow[ind_test, word2ind['book']],
      '\nreally:', S_bow[ind_test, word2ind['really']])

text of test review:  ['ok', 'really', 'consider', 'book', 'really', 'small', 'disappoint']
occurrences of words: 
book: 1 
really: 2


In [14]:
# Persist the bag-of-words matrix, then drop the in-memory reference.
with open('data/M_bow_10k.pickle', 'wb') as handle:
    pickle.dump(S_bow, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
S_bow = None

#### Convert string to BOW vector

In [11]:
# Hand-written review used to demonstrate each vectorizer on unseen text;
# it presumably contains tokens missing from the vocabulary on purpose.
review_test = 'This game is amazing ^^, my son plays with it all the time! popolsku behavoir'

In [12]:
# Tokenize the test review with the same normalization as the corpus.
review_tokens_test = f.normalize_single_text(review_test)
print(review_tokens_test)

# Map the tokens to vocabulary indices.
review_ind_test = f.text_token2ind(review_tokens_test, word2ind)
print(review_ind_test)

['game', 'amaze', 'son', 'play', 'time', 'popolsku', 'behavoir']
[0, 221, 30, 1, 9, 7999]


In [13]:
# Dense BOW vector of the test review, using the same 6000-feature cut as the corpus matrix.
f.build_single_bow(review_ind_test, n_tokens, 6000)

array([1., 1., 0., ..., 0., 0., 0.])

## TFIDF

In [14]:
# Fit TF-IDF on the already normalized reviews, re-joined into space-separated strings.
# NOTE(review): the vectorizer tokenizes these strings again with its default settings,
# so its vocabulary size can differ from n_tokens — confirm this is intended.
vectorizer = TfidfVectorizer()
M_tfidf = vectorizer.fit_transform([' '.join(r) for r in reviews_tokens])

In [15]:
# Shape check: (number of reviews, size of the vectorizer vocabulary).
M_tfidf.shape

(10000, 13622)

In [16]:
# Spot-check the TF-IDF weights of two words in one review.
ind_test = 15
print('text of test review: ', reviews_tokens[ind_test])
# Index the sparse matrix directly: calling .todense() would materialise the
# whole matrix (twice) just to read two cells.
print('TFIDF of words:',
      '\nbook:', M_tfidf[ind_test, vectorizer.vocabulary_['book']],
      '\nreally:', M_tfidf[ind_test, vectorizer.vocabulary_['really']])

text of test review:  ['ok', 'really', 'consider', 'book', 'really', 'small', 'disappoint']
TFIDF of words: 
book: 0.29574860929046415 
really: 0.4896422737221947


In [18]:
# Persist the TF-IDF matrix, then drop the in-memory reference.
with open('data/M_tfidf_10k.pickle', 'wb') as handle:
    pickle.dump(M_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_tfidf = None

#### Convert test string into TFIDF vector

In [17]:
# TF-IDF vector of the test review, using the vocabulary fitted on the corpus.
vectorizer.transform([' '.join(review_tokens_test)])

<1x13622 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

## SVD & NMF

In [18]:
# Token co-occurrence counts within a window of 4, reduced to 5 dimensions with SVD.
S_co_occurrence = f.build_co_occurrence_matrix(reviews_ind, n_tokens, window_size=4)
svd_reduced_co_occurrence = f.matrix_reduce(S_co_occurrence, method='svd', n_dim=5)

In [19]:
# One row per review, computed by f.avg_svd_embeddings from the reduced vectors of the review's tokens.
M_svd = np.stack([f.avg_svd_embeddings(ind, svd_reduced_co_occurrence, word2ind) for ind in reviews_ind])

In [20]:
# Preview the SVD review embeddings.
M_svd

array([[ 5.14323424e+01, -3.47109825e+00, -2.67020404e+00,
         2.03974543e-01,  1.48255302e+01],
       [ 2.16992669e+02,  1.05707348e+01, -1.46835127e+01,
        -6.84907300e-01,  3.10134179e+01],
       [ 4.55199337e+02, -3.17093833e+01, -8.54671822e+01,
         1.60653564e+00,  2.72084063e+01],
       ...,
       [ 2.63608525e+03, -1.08866951e+02,  2.31153097e+02,
        -1.73853227e+01, -1.02893709e+02],
       [ 6.57657544e+01, -6.06839347e+00, -9.53952804e+00,
         1.57869204e+00,  2.58041369e+01],
       [ 1.43775318e+02,  7.27239290e+01,  2.55859105e+01,
        -2.68445857e+00,  9.12218545e+00]])

In [21]:
# Shape check: (number of reviews, n_dim).
M_svd.shape

(10000, 5)

In [22]:
# Persist the SVD review embeddings, then drop the in-memory reference.
with open('data/M_svd_10k.pickle', 'wb') as handle:
    pickle.dump(M_svd, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_svd = None

In [22]:
# Same pipeline with NMF instead of SVD. avg_svd_embeddings is reused as-is:
# despite its name it only combines rows of whatever reduced matrix it is given.
nmf_reduced_co_occurrence = f.matrix_reduce(S_co_occurrence, method='nmf', n_dim=5)
M_nmf = np.stack([f.avg_svd_embeddings(ind, nmf_reduced_co_occurrence, word2ind) for ind in reviews_ind])

In [23]:
# Preview the NMF review embeddings.
M_nmf

array([[3.18080617e-01, 7.80464327e-02, 1.49065280e-01, 9.30239777e-02,
        5.36790040e-01],
       [1.49840854e+00, 6.20963960e-01, 6.69917415e-01, 7.86717645e-01,
        1.45379772e+00],
       [4.12534986e+00, 6.89270575e-01, 1.12965690e+00, 9.85722278e-01,
        2.46493945e+00],
       ...,
       [1.94969517e+01, 2.88667247e+00, 2.19527459e+01, 3.38141979e+00,
        4.91998215e+00],
       [3.96732276e-01, 1.17008949e-01, 1.69634013e-02, 1.07129897e-01,
        8.91329126e-01],
       [5.45359060e-01, 1.15693447e+00, 7.93880674e-01, 1.35079064e+00,
        2.99975201e-01]])

In [24]:
# Shape check: (number of reviews, n_dim).
M_nmf.shape

(10000, 5)

In [25]:
# Persist the NMF review embeddings, then drop the in-memory reference.
with open('data/M_nmf_10k.pickle', 'wb') as handle:
    pickle.dump(M_nmf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_nmf = None

#### Convert test string into SVD vector

In [25]:
# SVD embedding of the test review, built from the reduced co-occurrence vectors of its known tokens.
f.avg_svd_embeddings(review_ind_test, svd_reduced_co_occurrence, word2ind)

array([1774.31154901,  -57.54346441,   80.98033976,  -11.74561109,
        -43.33517736])

## Word2Vec

In [26]:
# Untrained Word2Vec model; the vocabulary is built and the model trained in the next cells.
# NOTE(review): no seed is passed, so the embeddings will presumably differ between runs — confirm whether that matters.
w2v_model = Word2Vec(min_count=5,
                     window=3,
                     size=100,
                     workers=3)

In [27]:
# Build the model vocabulary from the tokenized reviews.
w2v_model.build_vocab(reviews_tokens)

In [28]:
# Train for 30 epochs on the tokenized reviews.
w2v_model.train(reviews_tokens, total_examples=w2v_model.corpus_count, epochs=30)

(5682288, 7245750)

In [29]:
# One row per review, computed by f.avg_w2v_embeddings from the word vectors of the review's tokens.
M_word2vec = np.stack([f.avg_w2v_embeddings(review, w2v_model) for review in reviews_tokens])

In [30]:
# Shape check: (number of reviews, embedding size).
M_word2vec.shape

(10000, 100)

In [31]:
# Persist the Word2Vec review embeddings, then drop the in-memory reference.
with open('data/M_word2vec_10k.pickle', 'wb') as handle:
    pickle.dump(M_word2vec, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
M_word2vec = None

#### Convert test string into Word2Vec vector


In [31]:
# Word2Vec embedding of the test review, computed from its tokens by f.avg_w2v_embeddings.
f.avg_w2v_embeddings(review_tokens_test, w2v_model)

array([-0.4916602 , -0.673176  , -0.43748522,  0.45790607, -0.5072142 ,
        0.25582296,  0.5711106 , -0.06258135,  0.36200437,  0.740065  ,
        0.04109208,  0.58028495, -0.40537295,  0.6922718 ,  0.23685701,
        0.54663134,  0.09706992, -0.14363842, -0.30137047, -0.12514031,
        0.51345617,  0.37067157,  0.11115031,  0.04619621,  0.58541983,
       -0.35716337, -0.43209654,  0.48820734, -0.20929281, -0.04797919,
        0.4365302 , -0.46987385,  0.01353803, -0.04874829,  0.3549533 ,
       -0.27242514, -0.14320196, -0.40812597, -0.19990039,  0.42948112,
        0.5049257 , -0.10161964, -0.5768609 ,  0.39754462, -0.24381408,
        0.33620033,  0.5084406 , -0.25106287,  0.04437397,  0.4186307 ,
        0.7601339 , -0.5622204 , -0.32498673, -0.12162588, -0.11994795,
       -0.12674376,  0.6258924 , -0.18641171, -0.02631643,  0.0009281 ,
       -0.4873394 ,  0.17256328,  0.32977718,  0.12279141, -0.37370855,
        0.07469588, -0.07714178,  0.07089421, -0.44723797,  0.02