Pre-processing des données textuelles

In [21]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')
nltk.download('punkt')

df = pd.read_csv('movie_review.csv') 

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.lower() not in punctuation]
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


df['Preprocessed_Text'] = df['text'].apply(preprocess_text)

print(df[['text', 'Preprocessed_Text']])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANKRI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANKRI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                    text  \
0      films adapted from comic books have had plenty...   
1      for starters , it was created by alan moore ( ...   
2      to say moore and campbell thoroughly researche...   
3      the book ( or " graphic novel , " if you will ...   
4      in other words , don't dismiss this film becau...   
...                                                  ...   
64715  that lack of inspiration can be traced back to...   
64716  like too many of the skits on the current inca...   
64717  after watching one of the " roxbury " skits on...   
64718   bump unsuspecting women , and . . . that's all .   
64719  after watching _a_night_at_the_roxbury_ , you'...   

                                       Preprocessed_Text  
0      films adapted comic books plenty success wheth...  
1      starters created alan moore eddie campbell bro...  
2      say moore campbell thoroughly researched subje...  
3      book `` graphic novel `` 500 pages l

Entraînement du modèle Word2Vec

In [22]:
from gensim.models import Word2Vec


tokenized_texts = [text.split() for text in df['Preprocessed_Text']]

# Paramètres du modèle Word2Vec
vector_size = 100  # Taille des vecteurs de mots
window = 5  # Taille de la fenêtre contextuelle
min_count = 1  # Ignorer les mots ayant une fréquence inférieure à ce seuil
sg = 0  # Utiliser l'algorithme Skip-gram (sg=1 pour CBOW)
epochs = 10  # Nombre d'itérations sur l'ensemble des données


model = Word2Vec(sentences=tokenized_texts,
                 vector_size=vector_size,
                 window=window,
                 min_count=min_count,
                 sg=sg,
                 epochs=epochs)


print("Vecteur du mot 'film':", model.wv['film'])

Vecteur du mot 'film': [-1.48878694e+00  3.40189397e-01 -2.80833364e-01 -8.27568054e-01
  4.15798098e-01 -3.71084750e-01  1.73919126e-01  1.21929193e+00
 -3.83469522e-01  7.15771854e-01 -1.45414436e+00 -7.57309318e-01
  7.68148303e-01  2.18357235e-01  1.05962694e+00 -1.20602405e+00
  7.07985044e-01 -8.04454647e-03 -5.59647977e-01 -1.78992021e+00
  7.77863503e-01 -8.48565042e-01  2.96196312e-01 -1.59255970e+00
 -1.20852746e-01  2.44949564e-01  1.87826559e-01  1.39504099e+00
 -9.47871029e-01  8.10221136e-01  1.76725781e+00 -3.31459641e-01
 -7.51761436e-01 -9.79740202e-01 -1.31908870e+00  1.73563138e-01
 -6.92288697e-01  1.33531049e-01 -4.32241172e-01 -2.53047466e-01
 -1.36244988e+00  5.27096748e-01 -1.65141809e+00  2.17090917e+00
 -4.47136045e-01 -3.54130536e-01  1.05105722e+00  2.46144589e-02
  4.42524761e-01 -1.18203796e-01  1.60176426e-01  5.90398386e-02
 -3.33753973e-01 -1.27642155e+00 -1.15117931e+00 -4.21303838e-01
 -1.21318316e+00 -1.15292788e+00 -1.63571680e+00 -1.02178192e+00
  

Vectorisation des reviews de movies

In [23]:
import numpy as np


def get_review_vector(review, model, vector_size):
    
    words = [word for word in review.split() if word in model.wv.key_to_index]
    if not words:
        return np.zeros(vector_size)
    word_vectors = [model.wv[word] for word in words]
    review_vector = np.mean(word_vectors, axis=0)
    return review_vector

df['Review_Vector'] = df['Preprocessed_Text'].apply(lambda x: get_review_vector(x, model, vector_size))

print(df[['Preprocessed_Text', 'Review_Vector']])


                                       Preprocessed_Text  \
0      films adapted comic books plenty success wheth...   
1      starters created alan moore eddie campbell bro...   
2      say moore campbell thoroughly researched subje...   
3      book `` graphic novel `` 500 pages long includ...   
4                          words n't dismiss film source   
...                                                  ...   
64715    lack inspiration traced back insipid characters   
64716  like many skits current incarnation _saturday_...   
64717  watching one `` roxbury `` skits snl come away...   
64718                         bump unsuspecting women 's   
64719  watching _a_night_at_the_roxbury_ 'll left exa...   

                                           Review_Vector  
0      [-0.49545103, 0.5904012, 0.38407725, 0.0540663...  
1      [-0.22736661, 0.46170366, 0.2058562, 0.5463130...  
2      [0.00074618176, 0.7641943, 0.6029967, 0.191977...  
3      [-0.4881899, 0.45736974, 0.5530963, 

Division des données

In [24]:
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)




Taille de l'ensemble d'entraînement : 51776
Taille de l'ensemble de test : 12944


Construction d'un classificateur

In [29]:
from sklearn.linear_model import LogisticRegression


X_train = np.vstack(train_data['Review_Vector'].values)
y_train = train_data['tag']
X_test = np.vstack(test_data['Review_Vector'].values)
y_test = test_data['tag']

logistic_model = LogisticRegression(max_iter=1000, random_state=42)


logistic_model.fit(X_train, y_train)


y_pred = logistic_model.predict(X_test)

print(y_pred)



['neg' 'pos' 'neg' ... 'pos' 'pos' 'pos']


Évaluation du modèle

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)



precision = precision_score(y_test, y_pred, pos_label='pos')
recall = recall_score(y_test, y_pred, pos_label='pos')
f1 = f1_score(y_test, y_pred, pos_label='pos')

print("Accuracy :", accuracy)
print("Precision :", precision)
print("Recall :", recall)
print("F1-score :", f1)




Accuracy : 0.5857540173053152
Precision : 0.5830931796349664
Recall : 0.6464323748668797
F1-score : 0.6131313131313131
