In [2]:
#gestion des exceptions
import warnings
warnings.filterwarnings('ignore')

In [4]:
#gestion des répertoires
import os

#pour le chargement des données depuis notre espace S3 AWS
import boto3

#modules de base pour la science des données
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#gestion et preprocessing des données textuelles
import string
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download("stopwords")
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

#pour la réalisation des nuages de mots
from wordcloud import WordCloud, STOPWORDS

#pour l'entraînement réseau de neuronnes artificielles pour le sentiment analysis
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch.optim as optim

from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer #Tokenizer pour bert
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup


#pour la séparation des données afin d'estimer l'erreur de généralisation
from sklearn.model_selection import train_test_split

#pour le calcul de la métrique utilisée pour évaluer le classifieur pour le sentiment analysis
from sklearn.metrics import f1_score

#pour la bar de progression
from tqdm.notebook import tqdm

[nltk_data] Downloading package stopwords to /home/lasme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Transfert

Peut-on employer les modèles que nous avons appris pour l'analyse de sentiments sur les produits de la categorie "musique digital" à d'autres categories de produits sur Amazon ? Dans cette partie, nous appliquons notre LSTM pour la prédiction de sentiments sur quelques instances ou commentaires de la catégorie de produits "livres" pour laquelle il n'a pas été entrainé.

In [11]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob = 0.5):
        super(SentimentLSTM, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first = True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_size) # fully connected
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long() # cast to long tensor
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [13]:
vocab_size = 13688
output_size = 1
embedding_dim = 400 
hidden_dim = 512 
n_layers = 2

model = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

model.load_state_dict(torch.load('best_LSTM_model.pt'))
model.eval()
print(model)

SentimentLSTM(
  (embedding): Embedding(13688, 400)
  (lstm): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [17]:
import pandas as pd
import gzip
import json

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Movies_and_TV_5.json.gz')

In [18]:
df.to_csv("Movies_TV.csv", index = False)

In [19]:
df = df.sample(1000)

In [21]:
df.to_csv("Movies_TV_low.csv", index = False)

In [22]:


df = df.drop(labels = ["verified", "reviewerID", "asin" ,"style","reviewerName", "unixReviewTime","vote", "image"],axis = 1)
df.head()


Unnamed: 0,overall,reviewTime,reviewText,summary
1312137,5.0,"04 14, 2015",We love this whole series. Well made by the Hi...,Well Made
3345026,5.0,"01 8, 2016",Great!,Five Stars
833505,4.0,"12 27, 2010","The first time I saw this, I thought, ""meh."" ...",Not as bad as people claim
553086,5.0,"11 30, 2016",Very clear copy. Well worth the price.,Five Stars
2519385,3.0,"09 4, 2014","Good movie, but...some what predictable @ cert...",Three Stars


In [23]:


def remove_punctuation(text):
    no_punct = [words for words in text if words not in string.punctuation]
    words_wo_punct =''.join(no_punct)
    return words_wo_punct

df["reviewText"] = df["reviewText"].astype(str)
df["reviewText"] = df["reviewText"].apply(lambda x: remove_punctuation(x.lower()))



In [24]:
def text_process(df):
    # racinisation
    stemmer = PorterStemmer()
    # élimination des stop words anglais comme "the, I, our etc"
    words = stopwords.words("english") 
    df['cleaned_reviews'] = df['reviewText'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
    print("Prétraitement des données en cours, patientez encore ...")
    print("===============================================================")
    print("...")
    print("...")
    print("...")
    print("Prétraitement terminé !")
    return df

df = text_process(df)


Prétraitement des données en cours, patientez encore ...
...
...
...
Prétraitement terminé !


In [26]:
def classify(x):
    if x == 5.0 or x == 4.0:
        return 2 #positif
    if x == 3.0:
        return 1 #neutre
    return 0 #négatif

df["y"] = df["overall"].apply(classify)

In [27]:
def classify(x):
    if x == 0 or x == 1:
        return 0 #négatif
    return 1 #positif

df["y"] = df["y"].apply(classify)
df["y"].value_counts()


1    778
0    222
Name: y, dtype: int64

In [29]:
df_balanced=df

df_balanced["cleaned_reviews"] = df_balanced["cleaned_reviews"].astype("str")
all_words = [w for s in df_balanced.cleaned_reviews.values.tolist() for w in s.split()]
words = Counter(all_words)
words = {k:v for k,v in words.items() if v > 1}
words = sorted(words, key = words.get, reverse = True)
words = ['_PAD', '_UNK'] + words



In [31]:
word_to_idx = { w : i for i,w in enumerate(words) }
idx_to_word = {v : k for k,v in word_to_idx.items()}

def process_sequence(text_sequence):
    global word_to_idx
    res = []
    for w in text_sequence:
        if w in word_to_idx:
            res.append(word_to_idx[w])
        else:
            res.append(1)
    return res

X = [process_sequence(s.split()) for s in df_balanced.cleaned_reviews.values.tolist()]
y = df_balanced.y.values.tolist()

#padding des données
MAX_LEN = max([len(s) for s in  X])
X = pad_sequences(X, maxlen = MAX_LEN, padding = 'pre')
y = np.array(y)

In [32]:
#formattage des données train et test pour pytorch
test_data = TensorDataset(torch.from_numpy(X), torch.from_numpy(y))

###taille du batch
batch_size = 32 #gérer la taille du batch en fonction de ses capacités GPU, chez le max c'est 32 pour cette architecture

test_loader = DataLoader(test_data, shuffle = True, batch_size = batch_size, drop_last = True)


In [36]:
lr = 0.005
criterion = nn.BCELoss() # binary cross entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [37]:
device = torch.device("cpu")
test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)
y_pred = []
y_true = []

model.eval()
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze()) #rounds the output to 0/1´
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

    y_pred.extend(pred.tolist())
    y_true.extend(labels.tolist())
        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))


Test loss: 0.866
Test accuracy: 56.000%
