<a href="https://colab.research.google.com/github/krish269/movie-review-sentiment-analysis-using-nlp/blob/main/nlppro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import sys

def download_nltk_data():
    """
    Ensure every NLTK data package used by the pipeline is installed.

    Packages that are already up-to-date are skipped by the downloader.
    ``nltk.download`` reports a failed download through its boolean return
    value rather than by raising, so each result is checked explicitly;
    the ``except`` branch only covers unexpected errors.

    Exits the process with status 1 if any package cannot be obtained.
    """
    required_packages = ('punkt', 'stopwords', 'wordnet', 'punkt_tab')

    print("Ensuring NLTK data is available...")
    try:
        failed = [
            package
            for package in required_packages
            if not nltk.download(package, quiet=True)
        ]
    except Exception as e:
        print(f"An error occurred during NLTK data download: {e}", file=sys.stderr)
        print("Please check your internet connection and try again.", file=sys.stderr)
        sys.exit(1)

    if failed:
        # Without this check a failed download would be reported as success
        # and only surface later as a LookupError during preprocessing.
        print(
            f"An error occurred during NLTK data download: could not fetch {', '.join(failed)}",
            file=sys.stderr,
        )
        print("Please check your internet connection and try again.", file=sys.stderr)
        sys.exit(1)

    print("NLTK data is ready.")


def load_data(filepath):
    """
    Read the movie review CSV at ``filepath`` into a DataFrame.

    The file is decoded as UTF-8 and must provide both a 'review' and a
    'sentiment' column and at least one row. On success a preview and the
    frame's info summary are printed and the DataFrame is returned.

    Returns None (after printing the reason to stderr) when the file is
    missing, cannot be read, lacks a required column, or has no rows.
    """
    print(f"\nLoading data from {filepath}...")
    try:
        frame = pd.read_csv(filepath, encoding='utf-8')
        print("Data loaded successfully.")

        required_columns = ['review', 'sentiment']
        absent = [name for name in required_columns if name not in frame.columns]
        if absent:
            print(f"Error: The CSV file must contain the columns: {required_columns}", file=sys.stderr)
            return None

        if frame.empty:
            print("Error: The CSV file is empty.", file=sys.stderr)
            return None

        # Show the caller what was loaded before handing it back.
        print("Dataset preview:")
        print(frame.head())
        print("\nDataset info:")
        frame.info()
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.", file=sys.stderr)
        print("Please make sure the 'IMDB Dataset.csv' file is in the same directory as the script.", file=sys.stderr)
        return None
    except Exception as e:
        print(f"An error occurred while reading the file: {e}", file=sys.stderr)
        return None
    else:
        return frame

def preprocess_text(text):
    """
    Clean a single review and return it as a space-joined token string.

    Steps: strip HTML-like tags, replace every non-letter with a space,
    lowercase, tokenize, drop English stop words and one-character tokens,
    then lemmatize what remains.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example
            the NaN pandas uses for an empty CSV cell) is treated as an
            empty review instead of crashing the regex calls.

    Returns:
        The cleaned review as a single string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    lemmatizer, stop_words = _preprocessing_resources()

    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()

    tokens = nltk.word_tokenize(text)

    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1
    ]

    return " ".join(cleaned_tokens)


def _preprocessing_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    try:
        return _preprocessing_resources.cached
    except AttributeError:
        # Built lazily (not at import time) so the NLTK data download can run
        # first, and only once so it is not recreated for every review.
        _preprocessing_resources.cached = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
        )
        return _preprocessing_resources.cached

def create_tfidf_vectorizer(corpus, max_features=5000, dense=True):
    """
    Create a TF-IDF vectorizer, fit it on ``corpus`` and transform the corpus.

    Args:
        corpus: Iterable of already-preprocessed text documents.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
            Defaults to 5000, the value previously hard-coded here.
        dense: When True (the default, matching the previous behavior) the
            feature matrix is converted with ``.toarray()``. Pass False to
            keep the matrix returned by ``fit_transform`` unconverted,
            which avoids materializing a documents x features dense array.

    Returns:
        A ``(X, vectorizer)`` tuple: the feature matrix and the fitted
        vectorizer.
    """
    print("\nCreating TF-IDF features...")
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(corpus)
    if dense:
        X = X.toarray()
    print("TF-IDF features created.")
    return X, vectorizer

def train_model(X_train, y_train):
    """
    Fit a Logistic Regression classifier on the training split.

    The classifier is created with a fixed random state (42) and an
    iteration limit of 1000, fitted on ``X_train`` / ``y_train`` and
    returned. Progress messages are printed before and after fitting.
    """
    print("\nTraining the model...")

    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(X_train, y_train)

    print("Model training complete.")
    return classifier

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on the test split and print the results.

    Prints, in order: the accuracy (four decimals), a classification
    report labelled 'Negative' / 'Positive', and the confusion matrix.
    Nothing is returned.
    """
    print("\nEvaluating the model...")
    predictions = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")

    print("\nClassification Report:")
    report = classification_report(
        y_test,
        predictions,
        target_names=['Negative', 'Positive'],
    )
    print(report)

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
def predict_sentiment(review, vectorizer, model):
    """
    Predict, print and return the sentiment of a single raw review.

    The review is cleaned with ``preprocess_text``, vectorized with the
    given fitted ``vectorizer`` and classified by ``model``. A predicted
    label of 1 is reported as 'Positive'; any other label as 'Negative'.

    Returns:
        The string 'Positive' or 'Negative'.
    """
    cleaned = preprocess_text(review)
    features = vectorizer.transform([cleaned]).toarray()
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        sentiment = 'Positive'
    else:
        sentiment = 'Negative'

    print(f"Predicted Sentiment: {sentiment.upper()}")
    return sentiment

if __name__ == '__main__':
    # End-to-end run: fetch NLTK data, load and clean the dataset, train and
    # evaluate the classifier, persist it, then offer an interactive prompt.

    download_nltk_data()

    df = load_data('IMDB Dataset.csv')

    # load_data has already printed the reason when it returns None.
    if df is None:
        sys.exit(1)

    print("\nPreprocessing all reviews in the dataset (this may take a while)...")
    df['cleaned_review'] = df['review'].apply(preprocess_text)
    print("Preprocessing complete.")
    print("Dataset with cleaned reviews:")
    print(df.head())

    X_features, tfidf_vectorizer = create_tfidf_vectorizer(df['cleaned_review'])
    # Binary target: exactly 'positive' -> 1, every other value -> 0.
    y_labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

    # Stratify so both splits keep the same class balance as the full data.
    X_train, X_test, y_train, y_test = train_test_split(
        X_features,
        y_labels,
        test_size=0.2,
        random_state=42,
        stratify=y_labels
    )
    print("\nData split into training and testing sets.")
    print(f"Training set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")


    sentiment_model = train_model(X_train, y_train)

    evaluate_model(sentiment_model, X_test, y_test)

    print("\nSaving model and vectorizer to disk...")
    joblib.dump(sentiment_model, 'sentiment_model.pkl')
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    print("Files saved successfully.")

    print("\n--- Interactive Sentiment Prediction ---")
    print("Enter a movie review to predict its sentiment.")
    print("Type 'quit' or 'exit' to stop.")

    while True:
        try:
            user_review = input("\nEnter your review: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C should end the session cleanly rather
            # than with a traceback.
            print("\nExiting interactive prediction.")
            break

        # Normalize once so surrounding whitespace ("quit ") is not mistaken
        # for a review.
        command = user_review.strip().lower()
        if command in ('quit', 'exit'):
            print("Exiting interactive prediction.")
            break

        if not command:
            print("Please enter a review.")
            continue

        predict_sentiment(
            user_review,
            tfidf_vectorizer,
            sentiment_model
        )

Ensuring NLTK data is available...
NLTK data is ready.

Loading data from IMDB Dataset.csv...
Data loaded successfully.
Dataset preview:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Preprocessing all reviews in the dataset (this may take a while)...
Preprocessing complete.
Dataset with cleaned reviews:
                                         

In [None]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import sys

def download_nltk_data():
    """
    Downloads all NLTK data packages needed for preprocessing.

    Includes 'punkt_tab' alongside 'punkt' so that tokenization does not
    depend on another script having fetched it earlier. ``nltk.download``
    reports a failed download by returning False rather than raising, so
    every return value is checked.

    Exits the process with status 1 if any package cannot be obtained.
    """
    required_packages = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')

    print("Ensuring NLTK data is available...")
    try:
        failed = [
            package
            for package in required_packages
            if not nltk.download(package, quiet=True)
        ]
    except Exception as e:
        print(f"An error occurred during NLTK data download: {e}", file=sys.stderr)
        sys.exit(1)

    if failed:
        # A False return would otherwise be reported as success.
        print(
            f"An error occurred during NLTK data download: could not fetch {', '.join(failed)}",
            file=sys.stderr,
        )
        sys.exit(1)

    print("NLTK data is ready.")

def load_data(filepath):
    """
    Loads the movie review dataset from a CSV file.

    Args:
        filepath: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        The loaded DataFrame, or None when the file is missing, cannot be
        read or parsed, lacks the 'review' or 'sentiment' column, or
        contains no rows. The reason is printed to stderr in each case.
    """
    print(f"\nLoading data from {filepath}...")
    try:
        df = pd.read_csv(filepath, encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.", file=sys.stderr)
        return None
    except Exception as e:
        # Boundary handler: an empty, malformed or wrongly-encoded file is
        # reported like any other load failure instead of a raw traceback.
        print(f"An error occurred while reading the file: {e}", file=sys.stderr)
        return None

    print("Data loaded successfully.")
    if 'review' not in df.columns or 'sentiment' not in df.columns:
        print("Error: CSV must contain 'review' and 'sentiment' columns.", file=sys.stderr)
        return None

    if df.empty:
        # A header-only file has nothing to train on.
        print("Error: The CSV file is empty.", file=sys.stderr)
        return None

    return df

def preprocess_text(text):
    """
    Cleans and preprocesses a single piece of text.

    Steps: strip HTML-like tags, replace every non-letter with a space,
    lowercase, tokenize, drop English stop words and one-character tokens,
    then lemmatize the remaining tokens.

    Args:
        text: The raw review. A value that is not a ``str`` (for example
            the NaN pandas uses for an empty CSV cell) is treated as an
            empty review instead of raising inside ``re.sub``.

    Returns:
        The cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    lemmatizer, stop_words = _preprocessing_resources()

    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1
    ]
    return " ".join(cleaned_tokens)


def _preprocessing_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    try:
        return _preprocessing_resources.cached
    except AttributeError:
        # Built lazily so the NLTK download can run first, and only once so
        # the pair is not recreated for every review in the dataset.
        _preprocessing_resources.cached = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
        )
        return _preprocessing_resources.cached

# Training script: load the dataset, clean it, fit and evaluate a Logistic
# Regression sentiment classifier, then save the model and vectorizer.
if __name__ == '__main__':
    download_nltk_data()

    # load_data prints the reason and returns None on failure.
    df = load_data('IMDB Dataset.csv')
    if df is None:
        sys.exit(1)

    print("\nPreprocessing all reviews (this may take a while)...")
    # Cleaned text is stored in a new column; the raw 'review' column is kept.
    df['cleaned_review'] = df['review'].apply(preprocess_text)
    print("Preprocessing complete.")

    print("\nCreating TF-IDF features...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    # NOTE(review): .toarray() materializes a dense documents x 5000 matrix;
    # this may be memory-heavy for large datasets - confirm it is required.
    X_features = tfidf_vectorizer.fit_transform(df['cleaned_review']).toarray()
    # Binary target: exactly 'positive' -> 1, every other value -> 0.
    y_labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
    print("TF-IDF features created.")

    # 80/20 split, stratified on the label with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_features, y_labels, test_size=0.2, random_state=42, stratify=y_labels
    )
    print("\nData split into training and testing sets.")

    print("\nTraining the model...")
    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    print("Model training complete.")

    print("\nEvaluating the model...")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

    # Persist both artifacts: the vectorizer is saved alongside the model so
    # it can be loaded together with it later.
    print("\nSaving model and vectorizer to disk...")
    joblib.dump(model, 'sentiment_model.pkl')
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    print("Files 'sentiment_model.pkl' and 'tfidf_vectorizer.pkl' saved successfully.")
    print("\n--- Training complete. You can now run the prediction script. ---")


Ensuring NLTK data is available...
NLTK data is ready.

Loading data from IMDB Dataset.csv...
Data loaded successfully.

Preprocessing all reviews (this may take a while)...
Preprocessing complete.

Creating TF-IDF features...
TF-IDF features created.

Data split into training and testing sets.

Training the model...
Model training complete.

Evaluating the model...
Model Accuracy: 0.8909

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      5000
    Positive       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Saving model and vectorizer to disk...
Files 'sentiment_model.pkl' and 'tfidf_vectorizer.pkl' saved successfully.

--- Training complete. You can now run the prediction script. ---


In [None]:
# BLOCK 2: Load Model and Predict
# This script loads the pre-trained model and vectorizer and provides an
# interactive prompt to predict the sentiment of new movie reviews.
# Run this script anytime after the training script has been executed successfully.

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib
import sys

# --- 1. NLTK Data Download (needed for preprocessing) ---
def download_nltk_data():
    """
    Ensures NLTK data is available for the preprocessing function.

    Each required resource is looked up individually and only the missing
    packages are downloaded. 'punkt_tab' is included alongside 'punkt' so
    that tokenization does not depend on the training script having
    fetched it earlier.

    A failed download is reported on stderr but does not stop the script,
    so an offline run with the data already installed keeps working.
    """
    required = {
        'tokenizers/punkt': 'punkt',
        'tokenizers/punkt_tab': 'punkt_tab',
        'corpora/stopwords': 'stopwords',
        'corpora/wordnet': 'wordnet',
    }

    missing = []
    for resource_path, package in required.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            missing.append(package)

    if not missing:
        return

    print("Downloading necessary NLTK data...")
    # nltk.download returns False on failure instead of raising.
    failed = [package for package in missing if not nltk.download(package, quiet=True)]
    if failed:
        print(
            f"Warning: could not download NLTK data: {', '.join(failed)}",
            file=sys.stderr,
        )

# --- 2. Text Preprocessing Function ---
# This function MUST be identical to the one used during training.
def preprocess_text(text):
    """
    Cleans and preprocesses a single piece of text.

    Steps: strip HTML-like tags, replace every non-letter with a space,
    lowercase, tokenize, drop English stop words and one-character tokens,
    then lemmatize the remaining tokens. For string input the result is the
    same as before, so it stays aligned with the training preprocessing.

    Args:
        text: The raw review. A value that is not a ``str`` is treated as
            an empty review instead of raising inside ``re.sub``.

    Returns:
        The cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    lemmatizer, stop_words = _preprocessing_resources()

    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1
    ]
    return " ".join(cleaned_tokens)


def _preprocessing_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    try:
        return _preprocessing_resources.cached
    except AttributeError:
        # Built lazily so the NLTK data check can run first, and only once so
        # the pair is not recreated for every prediction.
        _preprocessing_resources.cached = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
        )
        return _preprocessing_resources.cached

# --- 3. Prediction Pipeline ---
def predict_sentiment(review, vectorizer, model):
    """
    Takes a single movie review and predicts its sentiment.

    The review is cleaned with ``preprocess_text``, vectorized with the
    fitted ``vectorizer`` and classified by ``model``. A predicted label of
    1 is reported as 'Positive'; any other label as 'Negative'.

    Args:
        review: Raw review text.
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: Fitted classifier exposing ``predict``.

    Returns:
        The string 'Positive' or 'Negative'. The result is also printed in
        upper case, so callers that ignore the return value still see it.
    """
    processed_review = preprocess_text(review)
    review_vector = vectorizer.transform([processed_review]).toarray()
    prediction = model.predict(review_vector)
    sentiment = 'Positive' if prediction[0] == 1 else 'Negative'
    print(f"Predicted Sentiment: {sentiment.upper()}")
    return sentiment

# --- Main Prediction Execution ---
if __name__ == '__main__':
    # Ensure NLTK data is ready
    download_nltk_data()

    # Load the saved model and vectorizer
    # NOTE(security): joblib.load unpickles the files, which can execute
    # arbitrary code. Only load .pkl files produced by a trusted training run.
    try:
        print("Loading trained model and vectorizer...")
        model = joblib.load('sentiment_model.pkl')
        vectorizer = joblib.load('tfidf_vectorizer.pkl')
        print("Model and vectorizer loaded successfully.")
    except FileNotFoundError:
        print("\nError: Model or vectorizer files not found.", file=sys.stderr)
        print("Please run the 'train_model.py' script first to train and save the model.", file=sys.stderr)
        sys.exit(1)

    # Interactive prediction loop
    print("\n--- Interactive Sentiment Prediction ---")
    print("Enter a movie review to predict its sentiment.")
    print("Type 'quit' or 'exit' to stop.")

    while True:
        try:
            user_review = input("\nEnter your review: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C should end the session cleanly rather
            # than with a traceback.
            print("\nExiting interactive prediction.")
            break

        # Normalize once so surrounding whitespace ("quit ") is not mistaken
        # for a review.
        command = user_review.strip().lower()
        if command in ('quit', 'exit'):
            print("Exiting interactive prediction.")
            break

        if not command:
            print("Please enter a review.")
            continue

        predict_sentiment(user_review, vectorizer, model)


Downloading necessary NLTK data...
Loading trained model and vectorizer...
Model and vectorizer loaded successfully.

--- Interactive Sentiment Prediction ---
Enter a movie review to predict its sentiment.
Type 'quit' or 'exit' to stop.

Enter your review: nice
Predicted Sentiment: POSITIVE

Enter your review: quit
Exiting interactive prediction.
