In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download the NLTK resources used below: the English stop-word list and the
# Punkt tokenizer models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer models from the separate
# 'punkt_tab' package; without it word_tokenize raises LookupError. Fetching
# it as well keeps the script working on both old and new NLTK versions.
nltk.download('punkt_tab')

# Mount Google Drive so the dataset file is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Load the data from the CSV file stored on the mounted drive
data = pd.read_csv("/content/drive/My Drive/movie_review.csv")

# Show the first rows as a quick check of what was loaded
print(data.head())

# Text preprocessing
# English stop words, kept in a set for the membership test in preprocess_text
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call, since the function runs once per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Turn a raw text into a list of lowercase tokens without stop words.

    Args:
        text: The raw text. Non-string values (for example the NaN that
            pandas yields for an empty CSV cell) are treated as empty text.

    Returns:
        A list of token strings; empty when ``text`` is not a string or
        nothing remains after filtering.
    """
    # Missing cells arrive as float NaN / None and have no .lower(); return
    # an empty token list instead of raising AttributeError.
    if not isinstance(text, str):
        return []
    # Lowercase so the comparison with the lowercase stop-word list matches
    text = text.lower()
    # Remove punctuation.
    # NOTE(review): this runs before tokenization, so "don't" becomes "dont",
    # which presumably no longer matches the stop-word list - confirm whether
    # that is intended.
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenization
    tokens = word_tokenize(text)
    # Drop stop words
    return [word for word in tokens if word not in stop_words]

# Tokenize every entry of the 'text' column; each resulting cell is the token
# list returned by preprocess_text
data['processed_text'] = data['text'].apply(preprocess_text)

# Train the Word2Vec model on the tokenized texts
word2vec_model = Word2Vec(
    sentences=data['processed_text'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Vectorization of the movie reviews
def vectorize_text(text):
    """Return the mean Word2Vec embedding of the tokens in ``text``.

    Tokens unknown to ``word2vec_model`` are skipped. When no token has an
    embedding (including an empty token list), a zero vector of length
    ``word2vec_model.vector_size`` is returned instead.
    """
    embeddings = word2vec_model.wv
    known = [embeddings[token] for token in text if token in embeddings]
    if not known:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known, axis=0)

# Embed every tokenized text as one fixed-length vector
data['vectorized_text'] = data['processed_text'].apply(vectorize_text)

# Build the feature matrix and the label array
X = np.array(data['vectorized_text'].tolist())
y = np.array(data['tag'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the classifier and fit it on the training split (fit returns the
# estimator itself, so the fitted model is what gets bound to the name)
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The remaining metrics all use the same 'weighted' averaging over classes
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   fold_id cv_tag  html_id  sent_id  \
0        0  cv000    29590        0   
1        0  cv000    29590        1   
2        0  cv000    29590        2   
3        0  cv000    29590        3   
4        0  cv000    29590        4   

                                                text  tag  
0  films adapted from comic books have had plenty...  pos  
1  for starters , it was created by alan moore ( ...  pos  
2  to say moore and campbell thoroughly researche...  pos  
3  the book ( or " graphic novel , " if you will ...  pos  
4  in other words , don't dismiss this film becau...  pos  
