<a href="https://colab.research.google.com/github/itsarbababdurrab/Movie_Review_Sentiment_Analysis/blob/main/Movie_Review_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import ipywidgets as widgets
from IPython.display import display

from google.colab import drive
# Mount Google Drive into the Colab filesystem at the path given below.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Fetch the NLTK resources this notebook relies on: the stopword lists and the
# tokenizer data (both the 'punkt' and 'punkt_tab' packages are requested).
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
# Load dataset: read the IMDB reviews CSV from the mounted Drive folder.
DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/Random Projects/Movie Review Sentiment Analysis/IMDB Dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [9]:
# Preprocessing function
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize a raw review into a space-separated string of content words.

    Steps: lowercase the text, delete every character that is neither an
    ASCII letter nor whitespace, tokenize with ``word_tokenize``, and drop
    English stopwords.

    Args:
        text: The review as a ``str``.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    text = text.lower()  # Lowercasing
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Removing special characters
    tokens = word_tokenize(text)  # Tokenization

    # Look the stopwords up once per call (and load them once per session)
    # instead of calling stopwords.words('english') for every single token;
    # set membership is also O(1) versus a linear scan of the list.
    stop_words = _english_stopwords()
    return ' '.join(word for word in tokens if word not in stop_words)

In [10]:
# Apply preprocessing
df['review'] = df['review'].apply(preprocess_text)

# Convert labels to binary. Besides 'positive'/'negative', the mapping passes
# 1 and 0 through unchanged so that re-running this cell on labels that were
# already converted does not turn every label into NaN.
LABEL_TO_INT = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
df['sentiment'] = df['sentiment'].map(LABEL_TO_INT)

# Series.map yields NaN for any value missing from the mapping; fail here with
# a clear message rather than carrying NaN labels into the split and training.
if df['sentiment'].isna().any():
    raise ValueError("Unexpected value in 'sentiment'; expected 'positive' or 'negative'")

In [11]:
# Splitting dataset: hold out 20% of the rows for testing, with a fixed seed
# so the split is the same on every run.
review_texts = df['review']
sentiment_labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
)

In [12]:
# TF-IDF Vectorization
MAX_TFIDF_FEATURES = 5000  # upper bound on the vocabulary size kept by the vectorizer

vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# Fit on the training reviews only; the test reviews are transformed with the
# already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [13]:
# Train model: logistic regression with default settings on the TF-IDF
# features of the training reviews.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

In [14]:
# Evaluate model: predict on the held-out test features and report both
# accuracy and F1 against the true labels.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

Accuracy: 0.8884
F1 Score: 0.8908450704225352


In [15]:
# Interactive interface
def predict_sentiment(review):
    """Print the predicted sentiment for a raw review string.

    The review is cleaned with ``preprocess_text``, converted to TF-IDF
    features with the fitted ``vectorizer`` and classified by ``model``.

    Args:
        review: The review text as entered by the user.

    Returns:
        None. The result is printed as ``Sentiment: Positive`` or
        ``Sentiment: Negative``; if the review contains no term from the
        vectorizer's vocabulary, an explanatory message is printed instead.
    """
    processed_review = preprocess_text(review)
    vectorized_review = vectorizer.transform([processed_review])

    # No stored non-zero entries means the input was empty, consisted only of
    # stopwords/punctuation, or used no word from the fitted vocabulary. The
    # classifier would then be deciding on an all-zero feature row, so report
    # that explicitly instead of printing a label unrelated to the text.
    if vectorized_review.nnz == 0:
        print('Sentiment: Unknown (the review contains no words the model recognizes)')
        return

    prediction = model.predict(vectorized_review)[0]
    sentiment = 'Positive' if prediction == 1 else 'Negative'
    print(f'Sentiment: {sentiment}')

In [16]:
# Creating the widgets: a text area for the review, a button that triggers
# the prediction, and an output area that shows the result.
output = widgets.Output()
button = widgets.Button(description='Predict Sentiment')
review_input = widgets.Textarea(placeholder='Enter a movie review...')


def on_button_click(b):
    """Button callback: show the prediction for the text currently entered.

    Args:
        b: The argument passed by the button's click handler (unused).
    """
    review_text = review_input.value
    with output:
        # Remove the previous result before printing the new one.
        output.clear_output()
        predict_sentiment(review_text)


# Register the callback, then render the three widgets in order.
button.on_click(on_button_click)
display(review_input, button, output)

Textarea(value='', placeholder='Enter a movie review...')

Button(description='Predict Sentiment', style=ButtonStyle())

Output()