In [23]:
pip install nltk



In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import movie_reviews
import random


In [13]:
# Download the movie_reviews corpus through NLTK (described in the original
# note as the IMDb movie review dataset).
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [14]:
# Build one (token list, category label) pair per review file, walking every
# category of the corpus and every file id inside that category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# Shuffle with a dedicated, seeded generator: the row order feeds the later
# train/test split, so an unseeded shuffle would make every downstream score
# change from run to run even though the split itself uses a fixed seed.
random.Random(42).shuffle(documents)


In [15]:

# Turn the (tokens, label) pairs into a two-column DataFrame:
# 'review' holds the tokens re-joined with single spaces, 'sentiment' the label.
reviews = [' '.join(tokens) for tokens, _ in documents]
sentiments = [label for _, label in documents]
data = pd.DataFrame(dict(review=reviews, sentiment=sentiments))

In [16]:
# Dataset overview: the first rows, then the column dtypes / non-null counts.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
data.info()

                                              review sentiment
0  chill factor is a carbon copy of speed with on...       neg
1  in the year 2029 , captain leo davidson ( mark...       neg
2  gord brody ( tom green ) is an aspiring animat...       neg
3  it was once said that in order to truly enjoy ...       neg
4  synopsis : captain picard and the crew of the ...       pos
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     2000 non-null   object
 1   sentiment  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None


In [24]:
# Data preprocessing: the review text is the model input and the sentiment
# label is encoded as a binary target (pos -> 1, neg -> 0).
label_to_int = {'pos': 1, 'neg': 0}
X = data['review']
y = data['sentiment'].map(label_to_int)

In [25]:
# Split into training and test sets: 20% of the reviews are held out, and
# random_state is fixed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# CountVectorizer: fitted on the training reviews only, then the same fitted
# vectorizer is applied to the test reviews.
count_vect = CountVectorizer(stop_words='english')
X_train_count, X_test_count = (
    count_vect.fit_transform(X_train),
    count_vect.transform(X_test),
)

In [26]:
# TfidfVectorizer: fitted on the training reviews only, then the same fitted
# vectorizer is applied to the test reviews.
tfidf_vect = TfidfVectorizer(stop_words='english')
X_train_tfidf, X_test_tfidf = (
    tfidf_vect.fit_transform(X_train),
    tfidf_vect.transform(X_test),
)

In [21]:
# One fresh estimator per (classifier, vectorizer) combination. The text in
# parentheses is what the training loop inspects to pick the feature matrix.
models = {}
models['LinearSVC (CountVectorizer)'] = LinearSVC()
models['LogisticRegression (CountVectorizer)'] = LogisticRegression(max_iter=1000)
models['LinearSVC (TfidfVectorizer)'] = LinearSVC()
models['LogisticRegression (TfidfVectorizer)'] = LogisticRegression(max_iter=1000)

In [22]:
# Train and evaluate every model on the feature matrices that match its name.
for name, model in models.items():
    # Names containing 'CountVectorizer' use the count features; every other
    # name falls through to the TF-IDF features.
    uses_counts = 'CountVectorizer' in name
    train_features = X_train_count if uses_counts else X_train_tfidf
    test_features = X_test_count if uses_counts else X_test_tfidf

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    # Report accuracy plus the per-class precision / recall / F1 table.
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print("="*60)

Results for LinearSVC (CountVectorizer):
Accuracy: 0.845
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       209
           1       0.84      0.84      0.84       191

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

Results for LogisticRegression (CountVectorizer):
Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       209
           1       0.84      0.84      0.84       191

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400

Results for LinearSVC (TfidfVectorizer):
Accuracy: 0.855
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       209
           1       0.84      0.85      0.85       191

   