<a href="https://colab.research.google.com/github/krish269/movie-review-sentiment-analysis-using-nlp/blob/main/nlppro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import sys

def download_nltk_data():
    """
    Downloads all necessary NLTK data packs.
    The downloader will skip any packages that are already up-to-date.
    """
    try:
        print("Ensuring NLTK data is available...")
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        nltk.download('punkt_tab', quiet=True) # Download punkt_tab
        print("NLTK data is ready.")
    except Exception as e:
        print(f"An error occurred during NLTK data download: {e}", file=sys.stderr)
        print("Please check your internet connection and try again.", file=sys.stderr)
        sys.exit(1)


def load_data(filepath):
    """
    Loads the movie review dataset from a CSV file.
    Validates the presence of 'review' and 'sentiment' columns.
    """
    print(f"\nLoading data from {filepath}...")
    try:
        df = pd.read_csv(filepath, encoding='utf-8')
        print("Data loaded successfully.")

        required_columns = ['review', 'sentiment']
        if not all(col in df.columns for col in required_columns):
            print(f"Error: The CSV file must contain the columns: {required_columns}", file=sys.stderr)
            return None

        if df.empty:
            print("Error: The CSV file is empty.", file=sys.stderr)
            return None

        print("Dataset preview:")
        print(df.head())
        print("\nDataset info:")
        df.info()
        return df
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.", file=sys.stderr)
        print("Please make sure the 'IMDB Dataset.csv' file is in the same directory as the script.", file=sys.stderr)
        return None
    except Exception as e:
        print(f"An error occurred while reading the file: {e}", file=sys.stderr)
        return None

def preprocess_text(text):

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    text = re.sub(re.compile('<.*?>'), '', text)

    text = re.sub('[^a-zA-Z]', ' ', text)

    text = text.lower()

    tokens = nltk.word_tokenize(text)

    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]

    return " ".join(cleaned_tokens)

def create_tfidf_vectorizer(corpus):
    """
    Creates and fits a TF-IDF vectorizer on the given text corpus.
    """
    print("\nCreating TF-IDF features...")
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(corpus).toarray()
    print("TF-IDF features created.")
    return X, vectorizer

def train_model(X_train, y_train):
    """
    Trains a Logistic Regression model.
    """
    print("\nTraining the model...")

    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    print("Model training complete.")
    return model

def evaluate_model(model, X_test, y_test):
    """
    Evaluates the model's performance and prints a report.
    """
    print("\nEvaluating the model...")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

def predict_sentiment(review, vectorizer, model):

    processed_review = preprocess_text(review)

    review_vector = vectorizer.transform([processed_review]).toarray()

    prediction = model.predict(review_vector)

    sentiment = 'Positive' if prediction[0] == 1 else 'Negative'

    print(f"Predicted Sentiment: {sentiment.upper()}")
    return sentiment

if __name__ == '__main__':

    download_nltk_data()

    df = load_data('IMDB Dataset.csv')

    if df is None:
        sys.exit(1)

    print("\nPreprocessing all reviews in the dataset (this may take a while)...")
    df['cleaned_review'] = df['review'].apply(preprocess_text)
    print("Preprocessing complete.")
    print("Dataset with cleaned reviews:")
    print(df.head())

    X_features, tfidf_vectorizer = create_tfidf_vectorizer(df['cleaned_review'])
    y_labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

    X_train, X_test, y_train, y_test = train_test_split(
        X_features,
        y_labels,
        test_size=0.2,
        random_state=42,
        stratify=y_labels
    )
    print(f"\nData split into training and testing sets.")
    print(f"Training set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")


    sentiment_model = train_model(X_train, y_train)

    evaluate_model(sentiment_model, X_test, y_test)

    print("\nSaving model and vectorizer to disk...")
    joblib.dump(sentiment_model, 'sentiment_model.pkl')
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    print("Files saved successfully.")

    print("\n--- Interactive Sentiment Prediction ---")
    print("Enter a movie review to predict its sentiment.")
    print("Type 'quit' or 'exit' to stop.")

    while True:
        user_review = input("\nEnter your review: ")
        if user_review.lower() in ['quit', 'exit']:
            print("Exiting interactive prediction.")
            break

        if not user_review.strip():
            print("Please enter a review.")
            continue

        predict_sentiment(
            user_review,
            tfidf_vectorizer,
            sentiment_model
        )

Ensuring NLTK data is available...
NLTK data is ready.

Loading data from IMDB Dataset.csv...
Data loaded successfully.
Dataset preview:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Preprocessing all reviews in the dataset (this may take a while)...
Preprocessing complete.
Dataset with cleaned reviews:
                                         

In [3]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import sys

def download_nltk_data():
    """
    Downloads all necessary NLTK data packs.
    """
    try:
        print("Ensuring NLTK data is available...")
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        print("NLTK data is ready.")
    except Exception as e:
        print(f"An error occurred during NLTK data download: {e}", file=sys.stderr)
        sys.exit(1)

def load_data(filepath):
    """
    Loads the movie review dataset from a CSV file.
    """
    print(f"\nLoading data from {filepath}...")
    try:
        df = pd.read_csv(filepath, encoding='utf-8')
        print("Data loaded successfully.")
        if 'review' not in df.columns or 'sentiment' not in df.columns:
            print("Error: CSV must contain 'review' and 'sentiment' columns.", file=sys.stderr)
            return None
        return df
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.", file=sys.stderr)
        return None

def preprocess_text(text):
    """
    Cleans and preprocesses a single piece of text.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    text = re.sub(re.compile('<.*?>'), '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]
    return " ".join(cleaned_tokens)

if __name__ == '__main__':
    download_nltk_data()

    df = load_data('IMDB Dataset.csv')
    if df is None:
        sys.exit(1)

    print("\nPreprocessing all reviews (this may take a while)...")
    df['cleaned_review'] = df['review'].apply(preprocess_text)
    print("Preprocessing complete.")

    print("\nCreating TF-IDF features...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_features = tfidf_vectorizer.fit_transform(df['cleaned_review']).toarray()
    y_labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
    print("TF-IDF features created.")

    X_train, X_test, y_train, y_test = train_test_split(
        X_features, y_labels, test_size=0.2, random_state=42, stratify=y_labels
    )
    print("\nData split into training and testing sets.")

    print("\nTraining the model...")
    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    print("Model training complete.")

    print("\nEvaluating the model...")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

    print("\nSaving model and vectorizer to disk...")
    joblib.dump(model, 'sentiment_model.pkl')
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    print("Files 'sentiment_model.pkl' and 'tfidf_vectorizer.pkl' saved successfully.")
    print("\n--- Training complete. You can now run the prediction script. ---")


Ensuring NLTK data is available...
NLTK data is ready.

Loading data from IMDB Dataset.csv...
Data loaded successfully.

Preprocessing all reviews (this may take a while)...
Preprocessing complete.

Creating TF-IDF features...
TF-IDF features created.

Data split into training and testing sets.

Training the model...
Model training complete.

Evaluating the model...
Model Accuracy: 0.8909

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      5000
    Positive       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Saving model and vectorizer to disk...
Files 'sentiment_model.pkl' and 'tfidf_vectorizer.pkl' saved successfully.

--- Training complete. You can now run the prediction script. ---


In [5]:
!pip install streamlit pandas nltk scikit-learn joblib pyngrok

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m112.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.4.0-py3-none-any.whl (25 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.4.0 streamlit-1.50.0


In [7]:
%%writefile app.py

# This is the main script for the Streamlit web application.
# To run this app, save the code as 'app.py' and run the following
# command in your terminal (after activating your virtual environment):
# streamlit run app.py

import streamlit as st
import joblib
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import ssl

# --- Page Configuration ---
st.set_page_config(
    page_title="Movie Review Sentiment Analyzer",
    page_icon="🎬",
    layout="centered",
    initial_sidebar_state="auto",
)

# --- NLTK Data Download ---
# This function is cached to run only once.
@st.cache_resource
def download_nltk_data():
    """
    Downloads all necessary NLTK data packs safely.
    """
    try:
        # Workaround for SSL certificate verification issue
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    # Download required packages
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    print("NLTK data downloaded successfully.")

# Call the function to ensure data is available
download_nltk_data()

# --- Load Model and Vectorizer ---
# The models are loaded once and cached for efficiency.
@st.cache_resource
def load_model_and_vectorizer():
    """
    Loads the pre-trained sentiment analysis model and TF-IDF vectorizer.
    Returns the model and vectorizer, or None if files are not found.
    """
    model_path = 'sentiment_model.pkl'
    vectorizer_path = 'tfidf_vectorizer.pkl'

    if not os.path.exists(model_path) or not os.path.exists(vectorizer_path):
        return None, None

    try:
        model = joblib.load(model_path)
        vectorizer = joblib.load(vectorizer_path)
        return model, vectorizer
    except Exception as e:
        st.error(f"Error loading model files: {e}")
        return None, None

model, vectorizer = load_model_and_vectorizer()

# --- Text Preprocessing Function ---
# This function must be identical to the one used during training.
def preprocess_text(text):
    """
    Cleans and preprocesses a single piece of text for prediction.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    text = re.sub(re.compile('<.*?>'), '', text)  # Remove HTML tags
    text = re.sub('[^a-zA-Z]', ' ', text)       # Keep only letters
    text = text.lower()                         # Convert to lowercase
    tokens = nltk.word_tokenize(text)
    # Lemmatize and remove stopwords
    cleaned_tokens = [
        lemmatizer.lemmatize(word) for word in tokens
        if word not in stop_words and len(word) > 1
    ]
    return " ".join(cleaned_tokens)

# --- Streamlit UI ---

# Main Title
st.title("🎬 Movie Review Sentiment Analyzer")
st.markdown("Enter a movie review below to determine if it's positive or negative. The model was trained on the IMDB dataset.")

# Check if model files are loaded
if model is None or vectorizer is None:
    st.error(
        "**Model files not found!** 😟\n\n"
        "Please run the `train_model.py` script first to train the model "
        "and generate the necessary `.pkl` files."
    )
else:
    # User Input Area
    st.subheader("Enter Your Movie Review")
    user_input = st.text_area(
        "Type or paste your review here...",
        height=150,
        placeholder="e.g., 'The movie was absolutely fantastic! The acting was superb and the plot was gripping.'"
    )

    # Analyze Button
    if st.button("Analyze Sentiment", type="primary"):
        if user_input.strip():
            # Preprocess the input
            processed_input = preprocess_text(user_input)

            # Vectorize the input
            input_vector = vectorizer.transform([processed_input]).toarray()

            # Make a prediction
            prediction = model.predict(input_vector)
            probability = model.predict_proba(input_vector)

            sentiment = 'Positive' if prediction[0] == 1 else 'Negative'
            confidence = probability[0][prediction[0]]

            # Display the result
            st.subheader("Analysis Result")
            if sentiment == 'Positive':
                st.success(f"**Sentiment: Positive** 👍 ({confidence:.2%} confidence)")
                st.balloons()
            else:
                st.error(f"**Sentiment: Negative** 👎 ({confidence:.2%} confidence)")
        else:
            st.warning("Please enter a review to analyze.")

Writing app.py


In [9]:
from pyngrok import ngrok

# --- IMPORTANT ---
# Get your authtoken from https://dashboard.ngrok.com/get-started/your-authtoken
# and paste it here:
authtoken = "33eDdoJHJx3L5S5ybXgZGov1478_2DSbx5G87gTHnbusbg1c8"
ngrok.set_auth_token(authtoken)

# Terminate open tunnels if any
ngrok.kill()

# Start streamlit in background
!nohup streamlit run app.py --server.port 8501 &

# Open a tunnel to the streamlit port
public_url = ngrok.connect(8501)
print(f"Click the following URL to view your Streamlit app: {public_url}")

nohup: appending output to 'nohup.out'
Click the following URL to view your Streamlit app: NgrokTunnel: "https://fencelike-ali-untributarily.ngrok-free.dev" -> "http://localhost:8501"
