In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Download the NLTK `movie_reviews` corpus that the following cells read.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
from nltk.corpus import movie_reviews

In [None]:
# Build one (review_text, label) pair per corpus file: the file's tokens are
# re-joined with single spaces and paired with the category it is filed under.
documents = [
    (" ".join(movie_reviews.words(file_id)), label)
    for label in movie_reviews.categories()
    for file_id in movie_reviews.fileids(label)
]

In [None]:
# Display the category labels present in the corpus.
movie_reviews.categories()

['neg', 'pos']

In [None]:
# Shuffle the documents so the two classes are interleaved before splitting.
# A dedicated, seeded generator is used so that a fresh kernel always produces
# the same order (and therefore the same train/test split and metrics) without
# altering the global `random` state.
import random
random.Random(42).shuffle(documents)


In [None]:
# Hold out 20% of the reviews for evaluation. Stratifying on the label keeps
# the class proportions identical in both subsets, so neither split ends up
# skewed towards one class by chance.
train_set, test_set = train_test_split(
    documents,
    test_size=0.2,
    random_state=42,
    stratify=[category for _text, category in documents],
)

In [None]:
# Sanity check: number of documents in the training split.
len(train_set)

1600

In [None]:
# TF-IDF feature extraction. The vectorizer is fitted on the training texts
# only and then reused, unchanged, to encode the test texts.
train_texts = [text for text, _label in train_set]
test_texts = [text for text, _label in test_set]

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

In [None]:
# Random forest baseline: 100 estimators, with a fixed random_state.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit the forest on the TF-IDF training matrix; the targets are the category
# strings taken from each (text, category) pair.
train_labels = [label for _text, label in train_set]
rf_classifier.fit(X_train_tfidf, train_labels)

In [None]:
# Predict a category for every held-out review with the fitted random forest.
predictions = rf_classifier.predict(X_test_tfidf)

In [None]:
# Compare the random-forest predictions with the true test categories.
test_labels = [label for _text, label in test_set]
accuracy = accuracy_score(test_labels, predictions)

In [None]:
# Report the accuracy rounded to two decimal places.
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.81


In [None]:
# Per-class precision / recall / F1 for the random-forest predictions.
rf_report = classification_report([label for _text, label in test_set], predictions)
print('\nClassification Report:\n', rf_report)



Classification Report:
               precision    recall  f1-score   support

         neg       0.76      0.89      0.82       200
         pos       0.86      0.72      0.79       200

    accuracy                           0.81       400
   macro avg       0.81      0.80      0.80       400
weighted avg       0.81      0.81      0.80       400



In [None]:
from xgboost import XGBClassifier


In [None]:
# Gradient-boosted alternative: 100 estimators, learning rate 0.1, fixed random_state.
xgb_classifier = XGBClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

In [None]:
# Encode the string categories as integers for the XGBoost classifier.
# The label names are sorted before being numbered so the encoding is
# deterministic: iterating a bare set of strings can yield a different order
# from one interpreter session to the next (string hash randomisation), which
# would silently swap the meaning of 0 and 1 between runs.
label_mapping = {
    label: code
    for code, label in enumerate(sorted({category for _text, category in train_set}))
}
y_train = [label_mapping[category] for _text, category in train_set]
y_test = [label_mapping[category] for _text, category in test_set]

In [None]:
# Fit XGBoost on the same TF-IDF training matrix, using the integer-encoded labels.
xgb_classifier.fit(X_train_tfidf, y_train)

In [None]:
# Predict integer class codes for the held-out reviews.
# NOTE: this rebinds `predictions`, replacing the random-forest predictions computed earlier.
predictions = xgb_classifier.predict(X_test_tfidf)

In [None]:
# Evaluate the XGBoost model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

# Print the classification report. Rows are labelled with the original
# category names rather than the opaque integer codes; the names are ordered
# by their code so they line up with the sorted integer labels in the report.
class_names = sorted(label_mapping, key=label_mapping.get)
print('\nClassification Report:\n', classification_report(y_test, predictions, target_names=class_names))

Accuracy: 0.83

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       200
           1       0.84      0.81      0.83       200

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

