In [None]:
import nltk
from nltk.corpus import movie_reviews

# Fetch the movie_reviews corpus (the downloader skips it if it is already present).
nltk.download('movie_reviews')

# Build one (word_list, category) pair per review file, iterating over
# every category and every file id within that category.
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

# Print the total number of documents
print('Total number of documents:', len(documents))

# Print a sample document. Only the first few words are shown so the cell
# output stays readable instead of dumping the entire review.
sample_words, sample_label = documents[0]
print('\nSample document:', (sample_words[:20], sample_label))

In [None]:
!pip install -q nltk

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used below: tokenizer models, WordNet and the
# stopword lists.
nltk.download('punkt')
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models; without it tokenization raises a LookupError.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# Shared helpers used by preprocess_text: a set gives O(1) stopword lookups.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to preprocess text
def preprocess_text(text):
    """Turn a raw text string into a list of normalized tokens.

    Each token produced by ``word_tokenize`` is kept only if it is purely
    alphabetic; it is then lowercased, dropped if it appears in the
    module-level ``stop_words`` set, lemmatized with ``lemmatizer`` and
    finally stemmed with ``stemmer``.

    Args:
        text: The string to preprocess.

    Returns:
        A list of processed token strings, in their original order.
    """
    processed = []
    for raw_token in word_tokenize(text):
        # Discard punctuation, numbers and mixed tokens.
        if not raw_token.isalpha():
            continue

        word = raw_token.lower()

        # Stopwords are matched against the lowercased form.
        if word in stop_words:
            continue

        # Lemmatize first, then stem the lemma.
        processed.append(stemmer.stem(lemmatizer.lemmatize(word)))

    return processed

# Preprocess all documents: each word list is joined into one string and
# passed through preprocess_text, keeping the category unchanged.
# NOTE: this rebinds `documents`, so the cell is not idempotent -- running
# it a second time preprocesses the already-preprocessed tokens again.
documents = [(preprocess_text(' '.join(doc)), category) for doc, category in documents]

# Print a sample preprocessed document. Only the first few tokens are shown
# so the cell output stays readable.
sample_tokens, sample_category = documents[0]
print('Sample preprocessed document:', (sample_tokens[:20], sample_category))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Join each document's tokens back into one space-separated string,
# giving one string per document, plus the parallel list of categories.
sentences = [' '.join(doc) for doc, category in documents]
categories = [category for doc, category in documents]

# Map categories to binary labels (1 for 'pos', 0 for everything else)
y = [1 if category == 'pos' else 0 for category in categories]

# Split the raw text BEFORE vectorizing so the vocabulary is learned from
# the training documents only; fitting on the full corpus would leak
# information about the test documents into the features.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.2, random_state=42)

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Learn the vocabulary on the training split, then reuse it for the test split
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Full-corpus matrix in the training vocabulary, kept under its original
# name in case a later cell still refers to `X`.
X = vectorizer.transform(sentences)

# Print the shapes of the training and testing sets
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', len(y_train))
print('y_test shape:', len(y_test))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Build a multinomial Naive Bayes model and fit it on the training split
# (fit returns the estimator itself, so construction and training chain).
classifier = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out split and score them against the truth
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy of the Naive Bayes classifier:', accuracy)