In [5]:

import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

#nltk.download('stopwords')


stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

df = pd.read_csv(r"D:\College\FDS Experiments\FDS_Exp_4\IMDB-Dataset.csv")

print("Columns in the dataset:", df.columns)


def preprocess_text_v2(text, stem=False):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphab
    words = text.split()
    if stem:
        
        words = [stemmer.stem(word) for word in words if word not in stop_words]
    else:
        words = [word for word in words if word not in stop_words]
    return " ".join(words)

#preproc review col
df['processed_text'] = df['review'].apply(preprocess_text_v2)

#sentiment labels to 0,1
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})

#Check labels converted or not
print(df[['review', 'processed_text', 'sentiment']].head())

#Vectorize the text data (convert text to TF-IDF features)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_text'])

y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
#Model perf on testdata
y_pred = nb_model.predict(X_test)

#accuarcy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Columns in the dataset: Index(['review', 'sentiment'], dtype='object')


  df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      processed_text  sentiment  
0  one reviewers mentioned watching oz episode yo...          1  
1  wonderful little production br br filming tech...          1  
2  thought wonderful way spend time hot summer we...          1  
3  basically theres family little boy jake thinks...          0  
4  petter matteis love time money visually stunni...          1  
Accuracy: 84.96%

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro

In [6]:
new_review = input("Give a random review about a movie (e.g 'Movie was great'): ")

# Preproc new rev
processed_review = preprocess_text_v2(new_review, stem=False)
#con new rev to tf-idf
X_new = vectorizer.transform([processed_review])
#pred sent of new rev
prediction = nb_model.predict(X_new)
sentiment = "Positive" if prediction == 1 else "Negative"
print(f"Review: {new_review}\nSentiment: {sentiment}")


Give a random review about a movie (e.g 'Movie was great'):  Mvie was bad


Review: Mvie was bad
Sentiment: Negative


In [8]:
processed_review = preprocess_text_v2(new_review)
print("Processed review:", processed_review)


Processed review: mvie bad
