In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/content/IMDBDataset.csv')


In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
print(df.shape)

(50000, 2)


**Data PreProcessing**

In [None]:
# cleaning text Lowercasing
df['clean_text']= df['review'].str.lower()

In [None]:
df['clean_text'].head()

Unnamed: 0,clean_text
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. <br /><br />the...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."


In [None]:
# removing the Punctuation

In [None]:
import string
# before removing
print("Before punctuation removal:\n", df["clean_text"].iloc[0][:300])
# After removing
df['clean_text'] = df['clean_text'].apply(lambda x: x.translate
 (str.maketrans("", "", string.punctuation)))
print("\nAfter punctuation removal:\n", df["clean_text"].iloc[0][:300])


Before punctuation removal:
 one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. tru

After punctuation removal:
 one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with mebr br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this i


In [None]:
# removing the numbers

# before removing
print(df['clean_text'].iloc[10][:300])
print('\n')
def remove_number(text):
  result=''
  for ch in text:
    if not ch.isdigit():
      result+=ch
  return result

df['clean_text'] = df['clean_text'].apply(remove_number)
print('\n')
# After removing
print(df['clean_text'].iloc[10][:300])

phil the alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlinesbr br at first it was very odd and pretty funny but as the movie progressed i didnt find the jokes or oddness funny anymorebr br its a low budget film thats never a pro




phil the alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlinesbr br at first it was very odd and pretty funny but as the movie progressed i didnt find the jokes or oddness funny anymorebr br its a low budget film thats never a pro


In [None]:
# removing the links and url
import re
def removing_links(text):
  return re.sub(r'http\S+|www\S+', '',text)

# Apply to columns
df['clean_text'] = df['clean_text'].apply(removing_links)

# print
print('\n', df['clean_text'].iloc[15][:300])


 kind of drawn in by the erotic scenes only to realize this was one of the most amateurish and unbelievable bits of film ive ever seen sort of like a high school film project what was rosanna arquette thinking and what was with all those stock characters in that bizarre supposed midwest town pretty h


In [None]:
import re

def remove_html_tags(text):
    # This will remove anything between < >
    return re.sub(r'<.*?>', '', text)

# Apply to column
df["clean_text"] = df["clean_text"].apply(remove_html_tags)

# Show example
print(df["clean_text"].iloc[20][:300])


after the success of die hard and its sequels its no surprise really that in the s a glut of die hard on a  movies cashed in on the wrong guy wrong place wrong time concept that is what they did with cliffhanger die hard on a mountain just in time to rescue sly stop or my mom will shoot stallones ca


In [None]:
df['clean_text'].iloc[1][:300]

'a wonderful little production br br the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece br br the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voi'

In [None]:
# removing emojies and special Character
def removing_emojies(text):
  return text.encode('ascii', 'ignore').decode('ascii')

  # Apply
  df['clean_text'] = df['clean_text'].apply(removing_emojies)



In [None]:
df['clean_text'].iloc[30][:300]

'taut and organically gripping edward dmytryks crossfire is a distinctive suspense thriller an unlikely message movie using the look and devices of the noir cyclebr br bivouacked in washington dc a company of soldiers cope with their restlessness by hanging out in bars three of them end up at a stran'

In [None]:
# removing the stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  words = text.split()
  filtered = [w for w in words if w.lower() not in stop_words]
  return " ".join(filtered)

  # Apply function
df["clean_text"] = df["clean_text"].apply(remove_stopwords)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df['clean_text'].iloc[1][:300]

'wonderful little production br br filming technique unassuming oldtimebbc fashion gives comforting sometimes discomforting sense realism entire piece br br actors extremely well chosen michael sheen got polari voices pat truly see seamless editing guided references williams diary entries well worth '

In [None]:
df['clean_text'].head()

Unnamed: 0,clean_text
0,one reviewers mentioned watching oz episode yo...
1,wonderful little production br br filming tech...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...


In [None]:
# rename the clean_text to reviews
df.rename(columns={'clean_text':'reviews'}, inplace=True)

In [None]:
df['reviews'].head()

Unnamed: 0,reviews
0,one reviewers mentioned watching oz episode yo...
1,wonderful little production br br filming tech...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...


***VECTORIZATION***

In [None]:
# BOW
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_features = 5000)
X_bow = bow_vectorizer.fit_transform(df['reviews'])
# Check shape
print("BoW feature matrix shape:", X_bow.shape)


BoW feature matrix shape: (50000, 5000)


In [None]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features = 5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['reviews'])

# Check shape
print("TF-IDF feature matrix shape:", X_tfidf.shape)

TF-IDF feature matrix shape: (50000, 5000)


In [None]:
y=df['sentiment']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

**TFDIF**

In [None]:
X=X_tfidf
y=df['sentiment']
# spliting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# 80% Training 20% Testing


In [None]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = lr_model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8866
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.87      0.88      4961
    positive       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Confusion Matrix:
 [[4329  632]
 [ 502 4537]]


**BOW**

In [None]:
X=X_bow
y=df['sentiment']
# spliting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# 80% Training 20% Testing


In [None]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = lr_model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8745
Classification Report:
               precision    recall  f1-score   support

    negative       0.88      0.87      0.87      4961
    positive       0.87      0.88      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Confusion Matrix:
 [[4300  661]
 [ 594 4445]]


***Accuracy from TF-IDF vectorizer is high***

In [None]:
from sklearn.naive_bayes import MultinomialNB
X = X_tfidf
y = df['sentiment']

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize the model
nb_model = MultinomialNB()

# Train
nb_model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = nb_model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed metrics
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8495
Classification Report:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
 [[4197  764]
 [ 741 4298]]


In [None]:
from sklearn.svm import LinearSVC

# Using TF-IDF features (X_tfidf) and target y
X = X_tfidf
y = df['sentiment']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize Linear SVM
svm_model = LinearSVC(max_iter=10000)  # increase max_iter if needed

# Train
svm_model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = svm_model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8805
Classification Report:
               precision    recall  f1-score   support

    negative       0.89      0.87      0.88      4961
    positive       0.87      0.89      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

Confusion Matrix:
 [[4313  648]
 [ 547 4492]]


***HyperParameterTunning***

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l2'],
    'max_iter': [1000]
}


In [None]:
# Initialize Logistic Regression
lr = LogisticRegression()

# Grid search with 5-fold cross-validation
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)


In [None]:
grid.fit(X_train, y_train)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
# Best hyperparameters
print("Best Hyperparameters:", grid.best_params_)

# Predictions
y_pred = grid.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed metrics
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Best Hyperparameters: {'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 0.8866
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.87      0.88      4961
    positive       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Confusion Matrix:
 [[4329  632]
 [ 502 4537]]


In [None]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=15000,    # Top 15k frequent words
    ngram_range=(1,2),     # Unigrams + bigrams
    min_df=2,              # Ignore words appearing <2 times
    max_df=0.9             # Ignore very common words (>90% of docs)
)


In [None]:
lr_model = LogisticRegression(
    C=1.0,                 # Regularization strength
    solver='lbfgs',     # Efficient for smaller/medium datasets
    penalty='l2',
    max_iter=1000
)


In [None]:
X_tfidf = tfidf_vectorizer.fit_transform(df['reviews'])
y = df['sentiment']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


In [None]:
lr_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = lr_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8973
Classification Report:
               precision    recall  f1-score   support

    negative       0.91      0.88      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
 [[4385  576]
 [ 451 4588]]


In [None]:
import joblib

# Save TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save trained Logistic Regression model
joblib.dump(lr_model, 'sentiment_model.pkl')


['sentiment_model.pkl']