In [None]:
import nltk
# Fetch the NLTK data used further down: the two tokenizer packages first,
# then the stop-word lists.
for tokenizer_package in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_package)
nltk.download('stopwords')


In [None]:
import nltk
from nltk.corpus import movie_reviews
import random
import pandas as pd

nltk.download('movie_reviews')  # raw text of the labelled movie reviews

# Collect every review as a (raw_text, label) pair. The corpus category a file
# belongs to is used as its sentiment label. No numerical conversion happens
# here; this cell only gathers text and labels.
documents = [
    (movie_reviews.raw(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# The corpus is read one category at a time, so mix the rows. A dedicated,
# seeded generator gives the same order on every run; with an unseeded shuffle
# the train/test split would differ between runs even though it pins
# random_state, because the rows it partitions would arrive in a new order.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)

data = pd.DataFrame(documents, columns=["review", "sentiment"])
data.head()


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Unnamed: 0,review,sentiment
0,"while it was part of former yugoslavia , my co...",pos
1,""" desperate measures "" is a generic title for...",neg
2,i guess that if a very wild bachelor party had...,neg
3,notting hill's trailer is awful : a laughless ...,pos
4,"post-chasing amy , a slew of love-triangle mov...",neg


In [None]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the tokenizer data and the stop-word corpus are available locally.
for package in ('punkt', 'stopwords'):
    nltk.download(package)

# Kept as a set so the per-token membership test in preprocess() is cheap.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def preprocess(text):
    """Normalise one review into a space-separated string of content words.

    Steps: lowercase, replace punctuation with spaces, tokenize, drop stop words.

    Punctuation is replaced by a space rather than deleted. Deleting it glues
    word parts together ("love-triangle" -> "lovetriangle") and turns
    contractions into tokens such as "thats" that match no stop word. With a
    space the parts stay separate tokens ("that", "s") and each one is checked
    against the stop-word set on its own.

    Args:
        text: Raw review text (str).

    Returns:
        str: The surviving tokens joined by single spaces; an empty string
        when nothing survives.
    """
    text = text.lower().translate(_PUNCT_TO_SPACE)
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stop_words)

# Run each raw review through preprocess() and store the result next to the
# original text in a new column.
data['clean_review'] = data['review'].map(preprocess)
data.head()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment,clean_review
0,"while it was part of former yugoslavia , my co...",pos,part former yugoslavia country croatia used al...
1,""" desperate measures "" is a generic title for...",neg,desperate measures generic title film thats be...
2,i guess that if a very wild bachelor party had...,neg,guess wild bachelor party gone really bad woul...
3,notting hill's trailer is awful : a laughless ...,pos,notting hills trailer awful laughless schmaltz...
4,"post-chasing amy , a slew of love-triangle mov...",neg,postchasing amy slew lovetriangle movies month...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the cleaned reviews, with the vocabulary capped at
# 5000 terms.
vocabulary_cap = 5000
vectorizer = TfidfVectorizer(max_features=vocabulary_cap)

cleaned_reviews = data['clean_review']
X = vectorizer.fit_transform(cleaned_reviews)  # one row per review
y = data['sentiment']                          # label column, aligned with X's rows

print(X.shape)


(2000, 5000)


In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned *texts* rather than the already-vectorised X. X was
# produced by a vectorizer fitted on every row, so its vocabulary and IDF
# weights include statistics from reviews that end up in the test set.
# Splitting first lets the features be learned from the training reviews only.
text_train, text_test, y_train, y_test = train_test_split(
    data['clean_review'], y, test_size=0.2, random_state=42
)

# Re-fit the vectorizer on the training texts alone, then apply that fitted
# vocabulary and weighting, unchanged, to the held-out texts.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier trained on the training split.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict a label for every held-out review.
y_pred = model.predict(X_test)

# Overall share of correct predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 table.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


Accuracy: 0.8225
              precision    recall  f1-score   support

         neg       0.84      0.82      0.83       207
         pos       0.81      0.83      0.82       193

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400

