In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory on sys.path so that the
# `src` package imported further down resolves.
# NOTE(review): assumes the notebook is launched from a direct sub-folder of
# the repository root — confirm.
project_root = Path().resolve().parent
# Guard the append: re-running this cell must not stack duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
# Fetch each NLTK resource into the per-user ~/nltk_data directory
# (same resources, same order, same target directory as before).
nltk_data_dir = os.path.expanduser('~/nltk_data')
for resource in ('punkt', 'stopwords', 'vader_lexicon', 'punkt_tab'):
    nltk.download(resource, download_dir=nltk_data_dir)

from src.data_loader import find_data_dir, load_data, clean_html

[nltk_data] Downloading package punkt to C:\Users\jacka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jacka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jacka/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jacka/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Load data

In [None]:
# Read the raw IMDB reviews CSV from <data dir>/raw into a DataFrame.
data_dir = find_data_dir()
raw_dir = os.path.join(data_dir, "raw")
csv_path = os.path.join(raw_dir, "IMDB Dataset.csv")
df = pd.read_csv(csv_path)

: 

## Exploratory data analysis

#### Look at the dataset structure

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Number of reviews per sentiment label (class balance check).
df['sentiment'].value_counts()

In [None]:
# Column dtypes, non-null counts and memory footprint.
df.info()

In [None]:
# Count of missing (NaN/None) entries in each column
df.isna().sum()


In [None]:
# Flag reviews that are empty once surrounding whitespace is stripped,
# store the flag on the frame, then report how many were found.
is_blank = df['review'].str.strip() == ''
df['empty_review'] = is_blank
df['empty_review'].sum()


In [None]:
# Number of reviews whose text repeats an earlier row (first occurrence kept)
df['review'].duplicated().sum()


Observation: the dataset is balanced, with 50K rows split evenly between positive and negative reviews (25K each). There are no null values or empty reviews, but there are ~420 duplicate reviews.

In [None]:
# Per-review length features: characters and whitespace-separated words.
review_text = df['review']
df['char_count'] = review_text.str.len()
df['word_count'] = [len(text.split()) for text in review_text]

length_columns = ['char_count', 'word_count']
df[length_columns].describe()

In [None]:
# Summary statistics of both length features, split by sentiment label;
# transposed so the statistics run down the rows.
length_stats = df.groupby('sentiment')[['char_count', 'word_count']].describe()
length_stats.T

In [None]:
# Box plot comparing review word counts across the two sentiment classes.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='sentiment', y='word_count', ax=ax)
ax.set_title("Word Count by Sentiment")
plt.show()


In [None]:
# Step histogram (50 bins) of review word counts, one series per sentiment,
# with the two series drawn side by side within each bin.
fig, ax = plt.subplots()
sns.histplot(
    data=df,
    x='word_count',
    hue='sentiment',
    bins=50,
    element="step",
    multiple="dodge",
    ax=ax,
)
ax.set_title("Word Count Distribution")
plt.show()


In [None]:
# Apply the project's clean_html helper (src.data_loader) to every review and
# keep the result in a new column; the raw text in `review` is left untouched.
# NOTE(review): presumably strips HTML markup such as <br /> tags — confirm
# against the helper's implementation.
df['clean_review'] = df['review'].apply(clean_html)

In [None]:
# stop_words = set(stopwords.words('english'))

# def get_tokens(text):
#     return word_tokenize(text.lower())

# # tokens = df['clean_review'].apply(get_tokens)


In [None]:
# Re-inspect the frame now that the derived columns (empty_review,
# char_count, word_count, clean_review) have been added.
df.head()

In [None]:
# N-gram analysis: count two-word phrases (English stop words removed) that
# appear in at least 5 reviews.
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=5, stop_words='english')
X_bigrams = vectorizer.fit_transform(df['clean_review'])

# Summing the document-term matrix over rows gives one corpus-wide total per
# bigram; the vocabulary array is in the same column order.
bigram_freq = np.asarray(X_bigrams.sum(axis=0)).ravel()
bigrams = vectorizer.get_feature_names_out()

# Order (bigram, count) pairs from most to least frequent and keep the top 20.
ranked_bigrams = list(zip(bigrams, bigram_freq))
ranked_bigrams.sort(key=lambda pair: pair[1], reverse=True)
top_bigrams = ranked_bigrams[:20]

top_bigrams


In [None]:
# Exploratory TF-IDF over the whole corpus: unigrams and bigrams, English stop
# words removed, ignoring terms seen in fewer than 5 reviews or in more than
# 90% of them.
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

X_tfidf = tfidf.fit_transform(df['clean_review'])

feature_names = tfidf.get_feature_names_out()
# IDF alone only measures rarity; kept available for inspection.
idf_scores = tfidf.idf_

# Rank terms by their mean TF-IDF weight across all reviews. Ranking by
# `idf_` (as was done previously) just lists the rarest n-grams, which are all
# tied at the min_df floor, so the "top 20" was an arbitrary slice of that tie.
mean_tfidf = np.asarray(X_tfidf.mean(axis=0)).ravel()

top_tfidf = sorted(
    zip(feature_names, mean_tfidf),
    key=lambda x: x[1],
    reverse=True
)[:20]

top_tfidf


In [None]:
# Train/test split (80/20, stratified on the label). No separate validation
# set is created here; model selection below relies on cross-validation.

from sklearn.model_selection import train_test_split

# Drop repeated reviews before splitting: the duplicate check above found
# several hundred identical texts, and leaving them in lets the same review
# land in both train and test, which leaks test data into training.
df_unique = df.drop_duplicates(subset="clean_review")

X = df_unique["clean_review"]
# Binary target: 1 = positive review, 0 = negative review.
y = (df_unique["sentiment"]=="positive").astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
import joblib

data_dir = find_data_dir()
processed_path = os.path.join(data_dir, "processed")

# Create the output folder if it doesn't exist; joblib.dump does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(processed_path, exist_ok=True)

# Save the train/test splits using joblib
joblib.dump(X_train, os.path.join(processed_path, "X_train.pkl"))
joblib.dump(X_test, os.path.join(processed_path, "X_test.pkl"))
joblib.dump(y_train, os.path.join(processed_path, "y_train.pkl"))
joblib.dump(y_test, os.path.join(processed_path, "y_test.pkl"))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature extractor for the linear model: unigrams + bigrams, English stop
# words removed, terms in fewer than 5 training reviews dropped, vocabulary
# capped at 20,000 entries.
tfidf_params = dict(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=5,
    stop_words="english",
)
tfidf = TfidfVectorizer(**tfidf_params)

# Vocabulary and IDF weights are learned from the training split only; the
# test split is projected onto that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

train_shape = X_train_tfidf.shape
test_shape = X_test_tfidf.shape
print(f"TF-IDF Train shape: {train_shape}")
print(f"TF-IDF Test shape: {test_shape}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the TF-IDF features; max_iter raised to 1000.
# fit() returns the estimator itself, so `lr` is the fitted model and the
# trailing expression shows its repr as the cell output.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
lr


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the baseline logistic regression on the held-out test split.
y_pred = lr.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(test_report)


In [None]:
import seaborn as sns

# Confusion matrix of the baseline predictions, drawn as an annotated heatmap
# (rows: actual label, columns: predicted label).
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Pair every vocabulary term with its logistic-regression weight.
feature_names = tfidf.get_feature_names_out()
coef = lr.coef_[0]

# A single ascending sort serves both ends: the head holds the 20 most
# negative weights, the reversed tail the 20 most positive ones.
ranked_terms = sorted(zip(coef, feature_names))
top_neg = ranked_terms[:20]
top_pos = ranked_terms[::-1][:20]

top_pos, top_neg


In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the inverse regularisation strength C (L2 penalty only) with 3-fold
# cross-validation on the training features, selecting by F1.
param_grid = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
}

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train_tfidf, y_train)
print("Best parameters:", grid.best_params_)

# Keep the refitted winner and show its accuracy on the held-out test split.
best_lr = grid.best_estimator_
y_pred_best = best_lr.predict(X_test_tfidf)
accuracy_score(y_test, y_pred_best)



## Tree-based models

In [None]:
# Separate vectorizer for the tree-based models: same n-gram range and
# vocabulary cap as the linear model's, but with a stricter min_df of 10.
tree_tfidf_params = dict(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=10,
    stop_words="english",
)
tfidf_tree = TfidfVectorizer(**tree_tfidf_params)

# Fit on the training split only, then project the test split.
X_train_tree = tfidf_tree.fit_transform(X_train)
X_test_tree = tfidf_tree.transform(X_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 200 trees capped at depth 20, fixed seed, using all available cores.
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_tree, y_train)

# Accuracy on the held-out test split.
y_pred_rf = rf.predict(X_test_tree)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Test Accuracy:", rf_accuracy)

In [None]:
from xgboost import XGBClassifier

# Untuned gradient-boosting baseline on the tree TF-IDF features, with row
# and column subsampling of 0.8 per tree.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train_tree, y_train)

# Accuracy on the held-out test split.
y_pred_xgb = xgb.predict(X_test_tree)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Test Accuracy:", xgb_accuracy)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Smaller feature space for the XGBoost search: unigrams only, at most 3,000
# terms, each present in at least 10 training reviews.
xgb_tfidf_params = dict(
    max_features=3000,
    ngram_range=(1, 1),
    min_df=10,
    stop_words="english",
)
tfidf_xgb = TfidfVectorizer(**xgb_tfidf_params)

# Fit on the training split only, then project the test split.
X_train_xgb = tfidf_xgb.fit_transform(X_train)
X_test_xgb = tfidf_xgb.transform(X_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimator for the search. It is kept single-threaded on purpose: the
# search below already runs its fits in parallel (n_jobs=-1), and letting
# every fit also use all cores oversubscribes the CPU and slows both down.
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=1,
    tree_method='hist'
)

# Candidate values sampled by the randomized search.
param_dist = {
    "n_estimators": [100, 150, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 3, 5],
    "reg_alpha": [0, 0.1, 1],
    "reg_lambda": [1, 5, 10]
}

# 15 sampled configurations x 3 CV folds, scored by F1; the seed fixes which
# configurations are drawn.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=15,
    scoring="f1",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_xgb, y_train)

In [None]:
# Test-set accuracy of the best configuration found by the randomized search
# (predict() delegates to the refitted best estimator).
y_pred_xgb_rs = random_search.predict(X_test_xgb)
xgb_rs_accuracy = accuracy_score(y_test, y_pred_xgb_rs)
print("XGBoost Test Accuracy:", xgb_rs_accuracy)

In [None]:
import os
import pickle
import joblib  


repo_root = os.path.dirname(find_data_dir())  # go up from 'data' to repo root
models_dir = os.path.join(repo_root, "models")

# Create models folder if it doesn't exist
os.makedirs(models_dir, exist_ok=True)

# Save models using joblib
joblib.dump(best_lr, os.path.join(models_dir, "logistic_regression.pkl"))
# The classifier only accepts features produced by the TF-IDF vectorizer it
# was trained with (`tfidf`), so persist that fitted vectorizer alongside it;
# without it the saved model cannot score new text.
joblib.dump(tfidf, os.path.join(models_dir, "tfidf_vectorizer.pkl"))