In [4]:
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Fetch every NLTK resource the rest of the script depends on: the "punkt"
# and "wordnet" resources plus the "movie_reviews" corpus.
for resource in ('punkt', 'wordnet', 'movie_reviews'):
    nltk.download(resource)

# Pair the word sequence of every review file with its sentiment label,
# grouped by label so the per-class lists stay available by name.
reviews_by_label = {
    sentiment: [
        (movie_reviews.words(file_id), sentiment)
        for file_id in movie_reviews.fileids(sentiment)
    ]
    for sentiment in ('pos', 'neg')
}
positive_reviews = reviews_by_label['pos']
negative_reviews = reviews_by_label['neg']

# Positive reviews come first, followed by the negative ones.
all_reviews = [*positive_reviews, *negative_reviews]

# Preprocess the data: X holds one lemmatized, space-joined string per
# review and y holds the matching sentiment labels, in the same order.
lemmatizer = WordNetLemmatizer()


def _lemmatized_text(words):
    """Join *words* into text, re-tokenize it, and return the lemmas joined by spaces."""
    tokens = word_tokenize(' '.join(words))
    return ' '.join(map(lemmatizer.lemmatize, tokens))


X = [_lemmatized_text(words) for words, _ in all_reviews]
y = [sentiment for _, sentiment in all_reviews]

# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split the same from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the TF-IDF vectorizer on the training texts only, then reuse that
# fitted vectorizer to transform the held-out texts.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the TF-IDF training features
# (fit() hands back the estimator, so construction and fitting are chained).
classifier = MultinomialNB().fit(X_train_vec, y_train)

# Predict a label for every held-out review and print the predictions.
y_pred = classifier.predict(X_test_vec)
print(y_pred)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


['neg' 'pos' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg'
 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg'
 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos'
 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg'
 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos'
 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg'
 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg'
 'neg' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg'
 'neg' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos'
 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos'
 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg'
 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'pos'
 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos'
 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'pos' 'neg' 'n