<a href="https://colab.research.google.com/github/i-ganza007/Sentiment_Analysis_Formative/blob/main/Logistic_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook relies on (no-op when already cached).
nltk.download('punkt')
# NLTK >= 3.9 makes word_tokenize load the pickle-free 'punkt_tab' models instead of
# 'punkt'; without this download tokenization raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    LSTM, Bidirectional, Dense, Dropout, Masking
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# CSV with the IMDB movie reviews. The path follows the /kaggle/input layout, so it
# presumably only resolves inside a Kaggle kernel -- adjust it when running elsewhere.
path = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
# Read the whole file into a DataFrame.
df = pd.read_csv(path)

# Preview the first rows as a quick sanity check.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# NLTK's English stop-word list, stored as a set so the per-token membership test
# inside preprocess_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

def preprocess_text(text, stopword_set=None):
    """Normalize a raw review into a space-separated string of content words.

    Steps: lowercase, blank out HTML tags and URLs, blank out every character
    that is not a-z or whitespace, split on whitespace, and drop stop words.

    Markup and punctuation are replaced with a space instead of being deleted,
    so the words on either side stay separate: "much.<br /><br />It" becomes
    "much it" rather than the fused token "muchit". It also means contractions
    such as "don't" split into "don" / "t", which can then be matched against
    stop-word entries instead of surviving as "dont".

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            ``None`` or a pandas NaN) is treated as an empty review.
        stopword_set: Optional container of lowercase words to discard.
            Defaults to the module-level ``stop_words``.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    text = re.sub(r'<.*?>', ' ', text)           # HTML tags such as <br />
    text = re.sub(r'http\S+|www\S+', ' ', text)  # URLs
    text = re.sub(r'[^a-z\s]', ' ', text)        # digits, punctuation, non-ASCII
    # str.split() with no argument already collapses whitespace runs and trims
    # the ends, so no separate whitespace-normalization pass is needed.
    return " ".join(word for word in text.split() if word not in stopword_set)

# Run the cleaning function over every raw review and keep the result in a new
# 'clean_review' column next to the original text.
df['clean_review'] = df['review'].apply(preprocess_text)

# Show raw and cleaned text side by side for the first rows.
df[['review', 'clean_review']].head()


Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


In [None]:
# Encode the string labels as integers: negative -> 0, positive -> 1.
label_map = {'negative': 0, 'positive': 1}

# Series.map turns every value missing from the mapping into NaN, so mapping a column
# that is already 0/1 would wipe all labels. Skip the mapping when the column is
# already encoded so that re-running this cell is a no-op.
if not df['sentiment'].isin(list(label_map.values())).all():
    df['sentiment'] = df['sentiment'].map(label_map)

# Fail fast on labels outside the mapping instead of carrying NaN into the split/model.
assert df['sentiment'].notna().all(), "sentiment contains values other than 'negative'/'positive'"

In [None]:
# Stage 1: hold out 20% of all reviews as the test set, stratified on the label.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

# Stage 2: carve 20% of the remainder off as the validation set, again stratified.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=42,
    stratify=train_val_df['sentiment'],
)

# Report the size of each split.
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


Train size: 32000
Validation size: 8000
Test size: 10000


In [None]:
# Tokenize the cleaned text of every split with NLTK and store the token lists
# in a new 'tokens' column on each frame.
for split_df in (train_df, val_df, test_df):
    split_df['tokens'] = split_df['clean_review'].apply(word_tokenize)

# Compare the cleaned text with its token list for the first training rows.
train_df[['clean_review', 'tokens']].head()


Unnamed: 0,clean_review,tokens
26680,oh yes agree others describe appalling acting ...,"[oh, yes, agree, others, describe, appalling, ..."
16648,basic hook lincoln slow slowness represents th...,"[basic, hook, lincoln, slow, slowness, represe..."
29967,utter trash im huge fan cusacks sole reason wa...,"[utter, trash, im, huge, fan, cusacks, sole, r..."
34122,meet cosmo jason priestley nerdy young bookie ...,"[meet, cosmo, jason, priestley, nerdy, young, ..."
823,dont know people criticise show muchit great f...,"[dont, know, people, criticise, show, muchit, ..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer consumes plain strings, so glue each token list back together.
train_texts = train_df['tokens'].apply(' '.join)
val_texts = val_df['tokens'].apply(' '.join)
test_texts = test_df['tokens'].apply(' '.join)

# Cap the vocabulary at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)

# Fit vocabulary and IDF weights on the training split only, then reuse the fitted
# vectorizer for validation and test so no held-out text influences the features.
X_train_tfidf = tfidf.fit_transform(train_texts)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (val_texts, test_texts)
)

In [None]:
# L2-regularized logistic regression on the TF-IDF features; fit() returns the
# estimator itself, so construction and training can be chained.
log_reg = LogisticRegression(max_iter=250, C=0.1, penalty='l2').fit(
    X_train_tfidf, train_df['sentiment']
)

# Hard class predictions on the data the model has seen and on the validation split.
log_train_preds = log_reg.predict(X_train_tfidf)
log_val_preds = log_reg.predict(X_val_tfidf)

# A large gap between these two accuracies would point to overfitting.
train_accuracy = accuracy_score(train_df['sentiment'], log_train_preds)
val_accuracy = accuracy_score(val_df['sentiment'], log_val_preds)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

Train Accuracy: 0.8768
Validation Accuracy: 0.8698


In [None]:
# Score the held-out test split with the fitted model.
# predict_proba returns one column per class; column 1 is kept here, which should be
# the probability of label 1 (positive) given the 0/1 label encoding used above.
logistic_probs = log_reg.predict_proba(X_test_tfidf)[:, 1]
# Hard 0/1 class predictions for the same rows.
logistic_preds = log_reg.predict(X_test_tfidf)

In [None]:
# Final evaluation of the logistic-regression baseline on the held-out test split.
# All metric helpers used here (including roc_auc_score and classification_report)
# come from sklearn.metrics and are imported in the notebook's import cell.
y_true = test_df['sentiment']

# Metrics computed from the hard 0/1 predictions.
acc  = accuracy_score(y_true, logistic_preds)
prec = precision_score(y_true, logistic_preds)
rec  = recall_score(y_true, logistic_preds)
f1   = f1_score(y_true, logistic_preds)
# AUC ranks examples by score, so it takes the predicted probabilities, not the labels.
auc  = roc_auc_score(y_true, logistic_probs)
# Rows are true classes and columns are predicted classes, in sorted label order.
cm   = confusion_matrix(y_true, logistic_preds)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"AUC:       {auc:.4f}")

print("Confusion Matrix:")
print(cm)

# Per-class breakdown; target_names are given in label order (0, 1).
print("Classification Report:")
print(classification_report(y_true, logistic_preds, target_names=['Negative','Positive']))

Accuracy:  0.8692
Precision: 0.8560
Recall:    0.8878
F1-score:  0.8716
AUC:       0.9438
Confusion Matrix:
[[4253  747]
 [ 561 4439]]
Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.85      0.87      5000
    Positive       0.86      0.89      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

