In [1]:
# Step 1: Import Libraries and Load Data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [2]:
# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ivyajanga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the IMDB movie review dataset
data = pd.read_csv('imdb_dataset.csv')

In [None]:
# Step 2: Data Cleaning and Preprocessing
stopwords = stopwords.words('english')
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stopwords]
    text = ' '.join(text)
    return text

data['review'] = data['review'].apply(clean_text)

In [8]:
# Step 3: Split the dataset into training and testing sets
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Step 4: Vectorize the text
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [10]:
# Step 5: Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [11]:

# Step 6: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8821


In [12]:
# Additional evaluation metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      4961
    positive       0.87      0.90      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [13]:

# Additional Step: Predict Sentiment for User Input
def predict_sentiment(input_text):
    # Clean and preprocess the input text
    cleaned_text = clean_text(input_text)

    # Vectorize the cleaned text
    input_vector = vectorizer.transform([cleaned_text])

    # Predict the sentiment
    sentiment_prediction = model.predict(input_vector)[0]

    # Interpret the prediction
    sentiment_label = "Positive" if sentiment_prediction == 1 else "Negative"

    return sentiment_label

In [None]:
# Example Usage:
user_input_review = input("Enter your movie review: ")
predicted_sentiment = predict_sentiment(user_input_review)

print(f"Predicted Sentiment: {predicted_sentiment}")