# Unsupervised sentiment analysis on reviews

In [1]:
#!pip install gensim
#!python -m spacy download pt_core_news_sm

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re  # For preprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import spacy  # For preprocessing
import logging  # Setting up the loggings to monitor gensim
import gensim
import spacy
import pt_core_news_sm
from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Word2Vec

__Importer les données__

In [3]:
order_reviews = pd.read_csv("Data/olist_order_reviews_dataset.csv")

In [4]:
comments = order_reviews[["review_comment_message","review_score"]]

In [5]:
df = pd.DataFrame(comments.dropna())

In [6]:
df = df.reset_index(drop=True)

In [7]:
df.shape

(41753, 2)

In [8]:
nlp = pt_core_news_sm.load(disable=['ner','parser'])

In [9]:
nlp

<spacy.lang.pt.Portuguese at 0x1f296f14f48>

In [10]:
#modification manuelle de la liste des stop words
nlp.vocab["bem"].is_stop = False
nlp.vocab["boa"].is_stop = False
nlp.vocab["muito"].is_stop = False
nlp.vocab["muitos"].is_stop = False
nlp.vocab["menos"].is_stop = False
nlp.vocab["mal"].is_stop = False
nlp.vocab["bom"].is_stop = False
nlp.vocab["boa"].is_stop = False
nlp.vocab["sem"].is_stop = False
nlp.vocab["bem"].is_stop = False
nlp.vocab["obrigado"].is_stop = False
nlp.vocab["obrigada"].is_stop = False
nlp.vocab["o"].is_stop = True
nlp.vocab["e"].is_stop = True

In [11]:
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [12]:
#retrait des elements de moins de 3 caractères
brief_cleaning = (re.sub("[^A-Za-z']{3,}", ' ', str(row)).lower() for row in df['review_comment_message'])

In [13]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.81 mins


In [14]:
df_clean = pd.DataFrame({'clean': txt, 'rate':df.review_score})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(31161, 2)

In [16]:
sent = [row.split() for row in df_clean['clean']] # divise chaque commentaire en mots = liste de liste

In [17]:
sent[0]

['receber', 'bem', 'prazo', 'estipular']

In [18]:
phrases = Phrases(sent, min_count=30, progress_per=10000) #Creates the relevant phrases from the list of sentences:

In [19]:
bigram = Phraser(phrases) 
#he goal of Phraser() is to cut down memory consumption of Phrases(), by discarding model state not strictly needed for the bigram detection task

In [20]:
sentences = bigram[sent]

In [21]:
len(sentences)

31161

In [22]:
sentences[0]

['receber', 'bem', 'prazo_estipular']

In [23]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

11487

In [24]:
#mots les plus frequent
sorted(word_freq, key=word_freq.get, reverse=True)[:20]

['produto',
 'o',
 'entregar',
 'comprar',
 'prazo',
 'muito',
 'chegar',
 'receber',
 'bom',
 'vir',
 'recomendar',
 'loja',
 'pedir',
 'bem',
 'dia',
 'gostar',
 'qualidade',
 'rápido',
 'esperar',
 'excelente']

In [25]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [26]:
w2v_model = Word2Vec(min_count=100, # Ignores all words with total absolute frequency lower than this - (2, 100)
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [27]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.03 mins


In [28]:
len(w2v_model.wv.vocab.keys()) # Nombre de mots dans le vocabulaire

325

In [29]:
sentences[0]

['receber', 'bem', 'prazo_estipular']

In [30]:
w2v_model.wv.vocab.keys()

dict_keys(['receber', 'bem', 'prazo_estipular', 'parabém', 'loja', 'lannister', 'adorar', 'comprar', 'internet', 'o', 'feliz', 'aparelhar', 'eficiente', 'site', 'marcar', 'chegar', 'outro', 'corretar', 'bom', 'vendedor', 'confiável', 'produto', 'ok', 'entregar', 'prazo', 'gostar', 'haver', 'obrigar', 'realizar', 'muito', 'dar', 'usar', 'presentar', 'sem', 'problema', 'relógio', 'bonito', 'ocorrer', 'combinar', 'acreditar', 'stark', 'exatamente', 'esperar', 'encomendar', 'atrasar', 'super', 'otimo', 'atar', 'processar', 'tomar', 'q', 'controlo', 'faltar', 'ser', 'satisfeito', 'amar', 'achar', 'lindar', 'ótima', 'bem_embalar', 'qualidade', 'fretar', 'pq', 'casar', 'solicitar', 'unidade', 'vir', 'mim', 'foto', 'anúncio', 'inferior', 'mal', 'acabar', 'kit', 'mochila', 'pedir', 'reembolsar', 'respostar', 'parabens', 'conseguir', 'prometer', 'montar', 'vender', 'targaryen', 'tapetar', 'rápido', 'tecer', 'so', 'hoje', 'certar', 'bolsar', 'lindo', 'acordar', 'anunciar', 'super_rápido', 'demora

In [31]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.86 mins


In [32]:
w2v_model.init_sims(replace=True)

In [33]:
w2v_model.wv.vectors.shape

(325, 300)

In [34]:
w2v_model.wv.most_similar(positive=["defeito"])

[('quebrar', 0.9922207593917847),
 ('cabo', 0.9915614128112793),
 ('danificar', 0.9891204833984375),
 ('faltar', 0.9863587617874146),
 ('carro', 0.9827027320861816),
 ('funcionar', 0.9801025390625),
 ('lençol', 0.976325273513794),
 ('modelar', 0.9739847183227539),
 ('cadeira', 0.9726841449737549),
 ('controlo', 0.9692246317863464)]

# Il faut choisir parmit les deux kmeans, NE PAS LANCER LES DEUX

## 1 - K-means from scratch

In [35]:
zero = np.zeros((int(325/2),1))
one = np.ones((int(325/2)+1,1))
lab = np.concatenate((zero,one), axis=0)
data = np.concatenate((w2v_model.wv.vectors, lab), axis=1)
data = pd.DataFrame(data)

In [36]:
def centroid(data):
    var = data.columns
    m = len(var)
    l = []
    for i in range(m):
        l.append(var[i])
        l[i] = np.mean(data[var[i]])
    return l

In [37]:
def graphcentre(data,nb_class,nbvar):
    v = locals()
    cen = []
    for i in range(nb_class):
        d = data.where(data.iloc[:,-1]==i)
        v['c%d' % i] = centroid(d.iloc[:,:nbvar])
        cen.append(v['c%d' % i])
    return cen

In [38]:
def dist(data,centre):
    dist=0
    for i in range(len(data)):
        dist = dist + (centre[i]-data[i])**2
    dist = np.sqrt(dist)
    return dist

In [39]:
def kmean(data,nbvar,nb_class):
    l = 0
    m = 1
    count = 0
    while l != m:
        if count > 0:
            l = [sum(centre[i]) for i in range(nb_class)]
        centre = graphcentre(data,nb_class,nbvar)
        m = [sum(centre[i]) for i in range(nb_class)]
        di = []
        v = locals()
        for j in range(nb_class):
            v['dist%d' % j] = np.zeros((len(data),1))
            for i in range(len(data)):
                v['dist%d' % j][i] = dist(data.iloc[i,:nbvar],centre[j])
            di.append(v['dist%d' % j])
    
        label = [] 
        distance = []   
        for i in range(len(data)):
            sel = []
            for j in range(nb_class):
                element = di[j]
                sel.append(element[i])
            label.append(sel.index(np.min(sel)))
            distance.append(min(sel))

        data['predict_label'] = label
        count += 1
    data['distance'] = distance
    print(count)

In [40]:
kmean(data,50,2)

13


In [42]:
data['cluster_value'] = [-1 if i==0 else 1 for i in data.predict_label]
data['sentiment_coeff'] = data.distance * data.cluster_value
data['distance'] = data.distance.apply(lambda x: x[0])
data['sentiment_coeff'] = data.sentiment_coeff.apply(lambda x: x[0])
data['words'] = w2v_model.wv.vocab.keys()

In [44]:
sentiment_map = data[['words', 'sentiment_coeff']]

In [46]:
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

## 2 - K-means sklearn

In [None]:
from sklearn.cluster import KMeans

In [None]:
word_vectors = w2v_model.wv

In [None]:
word_vectors.vectors.shape

(186, 50)

In [None]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)

In [None]:
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [None]:
w2v_model.wv.similar_by_vector(model.cluster_centers_[1], topn=10,  restrict_vocab=None)

  if np.issubdtype(vec.dtype, np.int):


[('certo', 0.9963061809539795),
 ('expectativa', 0.996258556842804),
 ('melhor', 0.9929500818252563),
 ('acordar', 0.9922909736633301),
 ('prazo_estipular', 0.991629958152771),
 ('exatamente', 0.9910224676132202),
 ('certar', 0.9904945492744446),
 ('combinar', 0.9900484085083008),
 ('testar', 0.9883270263671875),
 ('ok', 0.9883203506469727)]

In [None]:
w2v_model.wv.vectors.shape[0]

186

In [None]:
words = pd.DataFrame(word_vectors.wv.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors.wv[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
words['cluster_value'] = [-1 if i==0 else 1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [None]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [None]:
sentiment_map = words[['words', 'sentiment_coeff']]
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

# tfidf

In [48]:
file_weighting = df_clean.copy()

In [49]:
file_weighting = file_weighting.reset_index(drop=True)

In [51]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.clean)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.clean)

In [53]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)  
    return list(map(lambda y:dictionary[f'{y}'], x.clean.split()))

In [54]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)
#this step takes around 3-4 minutes minutes to calculate

Wall time: 19.4 s


In [56]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [57]:
replaced_closeness_scores = file_weighting.clean.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [60]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.clean, file_weighting.rate]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'Rate']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
replacement_df['sentiment'] = [1 if i>=2.5 else 0 for i in replacement_df.Rate]

In [61]:
predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment

conf_matrix = pd.DataFrame(confusion_matrix(replacement_df.sentiment, replacement_df.prediction))
print('Confusion Matrix')
display(conf_matrix)

test_scores = accuracy_score(y_test,predicted_classes), precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

print('\n \n Scores')
scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)

Confusion Matrix


Unnamed: 0,0,1
0,2348,7593
1,6469,14751



 
 Scores


Unnamed: 0,scores
accuracy,0.548731
precision,0.660177
recall,0.695146
f1,0.677211
