<a href="https://colab.research.google.com/github/humayun-mhk/Internship-AI-NLP-Projects-/blob/main/Movie_Review_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import nltk
import re
import nltk
nltk.download('punkt_tab')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Load dataset
# latin1 decoding plus on_bad_lines="skip" means malformed rows are tolerated
# rather than aborting the load.
df = pd.read_csv("/content/IMDb Movie Reviews Dataset.csv", encoding="latin1", on_bad_lines="skip")

# Encode sentiment labels
# Series.map leaves NaN for every value that is not a key of the mapping, so
# any unexpected or missing label would otherwise flow silently into the
# classifiers as NaN and make fitting fail much later.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
unlabeled_rows = df['sentiment'].isna()
if unlabeled_rows.any():
    print(f"Dropping {int(unlabeled_rows.sum())} rows with missing or unrecognized sentiment labels")
    df = df.loc[~unlabeled_rows].copy()
# With the NaN rows gone the labels can be stored as plain integers (0/1).
df['sentiment'] = df['sentiment'].astype(int)

# Drop unwanted columns if they exist
if "Unnamed: 0" in df.columns:
    df.drop("Unnamed: 0", axis=1, inplace=True)

# Fetch the NLTK resources used below: the tokenizer models, the English
# stop-word list and the WordNet data for the lemmatizer.
for resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(resource)

# Preprocessing steps
# Fill missing reviews BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to replace and
# the bogus token "nan" would end up in the vocabulary.
df["review"] = df["review"].fillna("").astype(str).str.lower()

# Tokenization
df["tokenized_words"] = df["review"].apply(word_tokenize)

# Remove stopwords (and any token that is not purely alphabetic)
stop_words = set(stopwords.words("english"))
df["filtered_review"] = df["tokenized_words"].apply(
    lambda tokens: [tok for tok in tokens if tok.isalpha() and tok not in stop_words]
)

# Stemming
stemmer = PorterStemmer()
df["stemmed_review"] = df["filtered_review"].apply(
    lambda tokens: [stemmer.stem(tok) for tok in tokens]
)

# Lemmatization (applied to the already-stemmed tokens)
lemmatizer = WordNetLemmatizer()
df["lemmatized_review"] = df["stemmed_review"].apply(
    lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens]
)

# Join words back into sentences
df["cleaned_review"] = df["lemmatized_review"].apply(" ".join)

# Feature Extraction
text_data = df["cleaned_review"]

# Bag-of-words counts; this is the matrix the train/test split below uses.
vectorizer = CountVectorizer()
vectorized_data = vectorizer.fit_transform(text_data)

# TF-IDF representation of the same text. It is computed here but is not
# part of the split below.
tfidf_vectorizer = TfidfVectorizer(strip_accents="unicode", stop_words="english")
tfidf_vectorized_data = tfidf_vectorizer.fit_transform(text_data)

# Train-Test Split: 80% train / 20% test, fixed seed for reproducibility.
X, Y = vectorized_data, df["sentiment"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Logistic Regression Model
# max_iter is set to 500 iterations for the solver.
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train, Y_train)

# Score the held-out split.
log_pred = log_model.predict(X_test)
log_accuracy = accuracy_score(Y_test, log_pred)
log_f1 = f1_score(Y_test, log_pred)

print("Logistic Regression - Accuracy:", log_accuracy)
print("Logistic Regression - F1 Score:", log_f1)

# Naive Bayes Model
# Multinomial naive Bayes fitted on the same count features.
nb_model = MultinomialNB()
nb_model.fit(X_train, Y_train)

# Score the held-out split.
nb_prediction = nb_model.predict(X_test)
nb_accuracy = accuracy_score(Y_test, nb_prediction)
nb_f1 = f1_score(Y_test, nb_prediction)

print("Naive Bayes - Accuracy:", nb_accuracy)
print("Naive Bayes - F1 Score:", nb_f1)

# Function to preprocess user input
def clean_text(text):
    """Normalize a raw review the same way the training reviews were cleaned.

    Steps: lowercase, tokenize, keep only alphabetic non-stop-word tokens,
    stem each token, then lemmatize it, and join the result with spaces.

    Args:
        text: The raw review string.

    Returns:
        The cleaned review as a single space-separated string.
    """
    tokens = word_tokenize(text.lower())
    kept = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]
    # The vectorizer vocabulary was built from stemmed-then-lemmatized tokens
    # (e.g. "fantast", "wast"). Skipping the stemmer here would produce tokens
    # such as "fantastic"/"waste" that the vectorizer has never seen and would
    # silently drop, leaving the model with almost no signal.
    return " ".join(lemmatizer.lemmatize(stemmer.stem(tok)) for tok in kept)

# Function to predict sentiment from user input
def predict_sentiment(review):
    """Classify a raw review string with the fitted logistic regression model.

    The review is cleaned with ``clean_text``, turned into count features by
    the fitted ``vectorizer`` and passed to ``log_model``.

    Args:
        review: The raw review text.

    Returns:
        "Positive" when the predicted label is 1, otherwise "Negative".
    """
    features = vectorizer.transform([clean_text(review)])
    label = log_model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

# Smoke-test the prediction helper on two hand-written reviews.
user_review1 = "This movie was absolutely fantastic!"  # Expected: Positive
user_review2 = "I regret watching this film, total waste of time."  # Expected: Negative

for label, sample in (
    ("User Review 1 Sentiment:", user_review1),
    ("User Review 2 Sentiment:", user_review2),
):
    print(label, predict_sentiment(sample))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Logistic Regression - Accuracy: 0.8794
Logistic Regression - F1 Score: 0.8814159292035398
Naive Bayes - Accuracy: 0.8531
Naive Bayes - F1 Score: 0.8518406454866364
User Review 1 Sentiment: Positive
User Review 2 Sentiment: Positive
