In [7]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')                                   #required resources to be downloaded
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
import nltk
from nltk.corpus import movie_reviews
import random
import pandas as pd

nltk.download('movie_reviews')                        #downloads raw text reviews

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.raw(fileid), category))                 #the main process of this block is to convert the english text into numerical representation

random.shuffle(documents)

data = pd.DataFrame(documents, columns=["review", "sentiment"])
data.head()


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Unnamed: 0,review,sentiment
0,what i look for in a movie is not necessarily ...,pos
1,"barb wire , pamela anderson lee's first foray ...",neg
2,"cinematically speaking , gordon parks' origina...",pos
3,"the ads for "" batman and robin "" scream "" the ...",neg
4,the event of events is upon us . \npeople have...,pos


In [9]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])        #this is the major shift in the output,preprocessing data helps to improve accuracy
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

data['clean_review'] = data['review'].apply(preprocess)
data.head()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment,clean_review
0,what i look for in a movie is not necessarily ...,pos,look movie necessarily perfection sometimes mo...
1,"barb wire , pamela anderson lee's first foray ...",neg,barb wire pamela anderson lees first foray fil...
2,"cinematically speaking , gordon parks' origina...",pos,cinematically speaking gordon parks original 1...
3,"the ads for "" batman and robin "" scream "" the ...",neg,ads batman robin scream event summer thats pro...
4,the event of events is upon us . \npeople have...,pos,event events upon us people waited twentytwo y...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer         #TF-IDF VECTORISATION

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['clean_review'])
y = data['sentiment']                                       #converts each review into vector format

print(X.shape)


(2000, 5000)


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [12]:
from sklearn.naive_bayes import MultinomialNB        #naive bayes model aligns well with text classification

model = MultinomialNB()
model.fit(X_train, y_train)


In [13]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))


Accuracy: 0.81
              precision    recall  f1-score   support

         neg       0.77      0.87      0.82       196
         pos       0.86      0.75      0.80       204

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.82      0.81      0.81       400

