# Prétraitement des données textuelles

In [34]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [35]:

# Download the NLTK resources used below: the stop-word lists and the Punkt
# tokenizer models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.8.2 loads the tokenizer from the 'punkt_tab' resource instead of
# the pickled 'punkt' package; fetch it as well so word_tokenize does not
# raise LookupError on current releases.
nltk.download('punkt_tab')

# Load the data
data = pd.read_csv('movie_review.csv')  # the CSV file must be in the working directory

# English stop words, kept as a set for fast membership tests in preprocessing
stop_words = set(stopwords.words('english'))
print("\ndata :\n",data)






data :
        fold_id cv_tag  html_id  sent_id  \
0            0  cv000    29590        0   
1            0  cv000    29590        1   
2            0  cv000    29590        2   
3            0  cv000    29590        3   
4            0  cv000    29590        4   
...        ...    ...      ...      ...   
64715        9  cv999    14636       20   
64716        9  cv999    14636       21   
64717        9  cv999    14636       22   
64718        9  cv999    14636       23   
64719        9  cv999    14636       24   

                                                    text  tag  
0      films adapted from comic books have had plenty...  pos  
1      for starters , it was created by alan moore ( ...  pos  
2      to say moore and campbell thoroughly researche...  pos  
3      the book ( or " graphic novel , " if you will ...  pos  
4      in other words , don't dismiss this film becau...  pos  
...                                                  ...  ...  
64715  that lack of inspir

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
def preprocess_text(text, stopword_set=None):
    """Turn one review sentence into a list of cleaned tokens.

    The text is tokenised with ``word_tokenize``; purely alphabetic tokens are
    kept and lower-cased, and stop words are removed.

    Args:
        text: Raw sentence as a string. ``None`` or NaN (how pandas represents
            an empty CSV cell) yields an empty list. Any other iterable is
            treated as an already tokenised sentence, so re-running the
            preprocessing cell on its own output is harmless instead of
            raising inside the tokenizer.
        stopword_set: Collection of lower-case words to drop. Defaults to the
            notebook-level ``stop_words``.

    Returns:
        list[str]: Lower-cased alphabetic tokens that are not stop words.
    """
    if stopword_set is None:
        stopword_set = stop_words

    if isinstance(text, str):
        tokens = word_tokenize(text)
    elif text is None or (isinstance(text, float) and text != text):
        # Missing value: NaN is the only float that is not equal to itself.
        return []
    else:
        # Already tokenised (e.g. the cell was executed a second time).
        tokens = list(text)

    # Keep alphabetic tokens only (drops punctuation and numbers), lower-case
    # them, then filter out stop words.
    return [
        lowered
        for lowered in (tok.lower() for tok in tokens if tok.isalpha())
        if lowered not in stopword_set
    ]

# Run the preprocessing on every sentence; the 'text' column is replaced by
# its token lists, which the later cells read.
tokenised_text = data['text'].apply(preprocess_text)
data['text'] = tokenised_text

print("text apres pretraitement  \n\n\n ",data['text'] )

text apres pretraitement  


  0        [films, adapted, comic, books, plenty, success...
1        [starters, created, alan, moore, eddie, campbe...
2        [say, moore, campbell, thoroughly, researched,...
3        [book, graphic, novel, pages, long, includes, ...
4                           [words, dismiss, film, source]
                               ...                        
64715    [lack, inspiration, traced, back, insipid, cha...
64716    [like, many, skits, current, incarnation, roxb...
64717    [watching, one, roxbury, skits, snl, come, awa...
64718                          [bump, unsuspecting, women]
64719                            [watching, left, exactly]
Name: text, Length: 64720, dtype: object


Entraînement du modèle Word2Vec :

In [40]:
# Train the Word2Vec embeddings on the tokenised sentences
model = Word2Vec(
    sentences=data['text'],
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window around the target word
    min_count=1,      # keep every word, even those seen only once
    workers=4,        # number of training threads
)
print('\n',model,'\n\n')


 Word2Vec<vocab=37964, vector_size=100, alpha=0.025> 




Vectorisation des reviews de movies :

In [32]:

# Vectorise the movie reviews
def vectorize_text(text, wv=None):
    """Embed a tokenised sentence as the mean of its word vectors.

    Args:
        text: Iterable of tokens.
        wv: Word-vector lookup supporting ``in``, ``[]`` and a ``vector_size``
            attribute. Defaults to ``model.wv`` of the notebook-level
            Word2Vec model.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of the tokens found in
        ``wv``, or a zero vector of length ``wv.vector_size`` when none of the
        tokens is known (including an empty sentence).
    """
    if wv is None:
        wv = model.wv

    vectors = [wv[word] for word in text if word in wv]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the embeddings themselves rather than a literal,
    # so it stays stackable with the real vectors if vector_size is changed.
    return np.zeros(wv.vector_size)

# One vector per sentence, stored next to the tokens in a new column
sentence_vectors = data['text'].apply(vectorize_text)
data['Vector'] = sentence_vectors
print(data['Vector'])


0        [-0.23907714, 0.4008218, 0.4034046, -0.0273781...
1        [-0.119350985, 0.3264918, 0.24850626, -0.23630...
2        [-0.25180298, 0.38039798, 0.47546935, -0.08274...
3        [-0.17594603, 0.4400795, 0.22700986, 0.0237775...
4        [-0.21982016, 0.5243043, 0.27953598, 0.2695864...
                               ...                        
64715    [-0.17481302, 0.37867293, 0.25818866, 0.015624...
64716    [-0.20659553, 0.3641056, 0.33478892, 0.0186143...
64717    [-0.25206664, 0.44335622, 0.3013041, -0.096540...
64718    [-0.13628499, 0.30674675, 0.21505015, -0.15492...
64719    [-0.35564566, 0.49446023, 0.49216613, 0.155072...
Name: Vector, Length: 64720, dtype: object


Division des données :

In [None]:
# Split the data into training and test sets (80 % / 20 %, fixed seed)
features = np.vstack(data['Vector'])  # stack the per-sentence vectors into a 2-D array
labels = data['tag']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)



Construction d'un classificateur :

In [None]:
# Build a classifier (logistic regression as an example)
classifier = LogisticRegression(
    max_iter=1000,  # iteration cap passed to the solver
)
classifier.fit(X=X_train, y=y_train)


Évaluation du modèle :

In [33]:
# Evaluate the model on the held-out test set
y_pred = classifier.predict(X_test)

# Precision, recall and F1 all use the same 'weighted' averaging
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

report = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
)
for label, value in report:
    print(label, value)

Accuracy: 0.5696075401730532
Precision: 0.5707504960409738
Recall: 0.5696075401730532
F1 Score: 0.5653499581270917
