# Sentiment classifier: TF-IDF + Logistic Regression
This notebook trains a TF-IDF + Logistic Regression baseline on the IMDB dataset, tunes it with GridSearchCV, evaluates the tuned model on the test set, inspects the top features, and includes an optional DistilBERT fine-tune (GPU recommended).

In [None]:
# Install required packages (Colab or local)
!pip install -q datasets scikit-learn transformers torch sentencepiece matplotlib seaborn


In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


In [None]:
# Load IMDB dataset using Hugging Face datasets
SEED = 42            # seed shared by the shuffles and the train/validation split
VAL_FRACTION = 0.2   # share of the training split held out for validation
# Optional: cap the number of examples for quick runs (None = use everything),
# e.g. N_TRAIN_SAMPLES = 5000 and N_TEST_SAMPLES = 2000.
N_TRAIN_SAMPLES = None
N_TEST_SAMPLES = None

dataset = load_dataset("imdb")
train_ds = dataset["train"].shuffle(seed=SEED)
test_ds = dataset["test"].shuffle(seed=SEED)

# Subsetting happens after the shuffle, so a capped run keeps a random sample.
if N_TRAIN_SAMPLES is not None:
    train_ds = train_ds.select(range(min(N_TRAIN_SAMPLES, len(train_ds))))
if N_TEST_SAMPLES is not None:
    test_ds = test_ds.select(range(min(N_TEST_SAMPLES, len(test_ds))))

# list(...) hands pandas plain Python lists regardless of the container type
# the installed datasets version returns for a column.
df_train = pd.DataFrame({"text": list(train_ds["text"]), "label": list(train_ds["label"])})
df_test  = pd.DataFrame({"text": list(test_ds["text"]),  "label": list(test_ds["label"])})

# Stratify on the label so both parts keep the same class balance.
X_train, X_val, y_train, y_val = train_test_split(
    df_train["text"],
    df_train["label"],
    test_size=VAL_FRACTION,
    random_state=SEED,
    stratify=df_train["label"],
)
assert len(X_train) + len(X_val) == len(df_train), "split lost or duplicated rows"

print("Training examples:", len(X_train))
print("Validation examples:", len(X_val))
print("Test examples:", len(df_test))


In [None]:
# Baseline pipeline: TF-IDF + Logistic Regression
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        strip_accents='unicode',
        stop_words='english',
        ngram_range=(1,2),   # unigrams and bigrams
        min_df=5,            # ignore terms found in fewer than 5 documents
        max_df=0.9           # ignore terms found in more than 90% of documents
    )),
    # The saga solver shuffles the data while optimizing, so random_state is fixed
    # to make the fitted coefficients reproducible across runs.
    # n_jobs is omitted: it only parallelizes over classes, which does nothing
    # for this two-class problem.
    ("clf", LogisticRegression(solver="saga", penalty="l2", max_iter=2000, random_state=42))
])

pipeline.fit(X_train, y_train)
y_val_pred = pipeline.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=4))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_val, y_val_pred)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Validation confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()


In [None]:
# Hyperparameter tuning with GridSearchCV (short grid for speed)
# Two values per parameter -> 16 candidate settings, each scored by 3-fold CV
# on the training split only (the validation split stays untouched).
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__min_df": [2, 5],
    "tfidf__max_df": [0.85, 0.95],
    "clf__C": [0.1, 1.0],
}
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
gs.fit(X_train, y_train)

print("Best CV F1 score:", gs.best_score_)
print("Best params:", gs.best_params_)

# Check the winning configuration against the held-out validation split.
best_model = gs.best_estimator_
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred, digits=4)
print("Validation Accuracy (best):", val_accuracy)
print(val_report)


In [None]:
# Final test evaluation
# Score the tuned pipeline once on the test split.
test_texts, test_labels = df_test["text"], df_test["label"]
y_test_pred = best_model.predict(test_texts)
test_accuracy = accuracy_score(test_labels, y_test_pred)
print("Test Accuracy:", test_accuracy)
print(classification_report(test_labels, y_test_pred, digits=4))


In [None]:
# Inspect top features per class
fitted_steps = best_model.named_steps
vec, clf = fitted_steps["tfidf"], fitted_steps["clf"]
feature_names = np.asarray(vec.get_feature_names_out())
topn = 25
coef = clf.coef_[0]  # first (only) row of learned weights, one per vocabulary term

# Sort once, ascending by weight: the head holds the most negative
# coefficients, the tail the most positive ones.
ranking = np.argsort(coef)
top_pos = ranking[-topn:][::-1]  # largest weights first
top_neg = ranking[:topn]         # smallest weights first

for heading, token_idx in (
    ("Top positive tokens:", top_pos),
    ("Top negative tokens:", top_neg),
):
    print(heading, ", ".join(feature_names[token_idx]))


## Optional: DistilBERT fine-tune (GPU recommended)
Run this section only if you have a GPU available. For speed, it fine-tunes DistilBERT on a subset of the data (8,000 training reviews) and evaluates on 2,000 test reviews.

In [None]:
import inspect

import torch

# The whole fine-tune is skipped on CPU-only machines; see the else branch.
if torch.cuda.is_available():
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(batch):
        """Tokenize the "text" field of a batch, truncating/padding each entry to 256 tokens."""
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

    # Shuffled subsets keep the run short: 8000 training and 2000 test reviews.
    hf_train = dataset["train"].shuffle(seed=42).select(range(8000)).map(tokenize, batched=True)
    hf_test  = dataset["test"].shuffle(seed=42).select(range(2000)).map(tokenize, batched=True)

    # Expose only the columns passed to the model, as torch tensors.
    tensor_columns = ["input_ids", "attention_mask", "label"]
    hf_train.set_format(type="torch", columns=tensor_columns)
    hf_test.set_format(type="torch", columns=tensor_columns)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
    # and Trainer's `tokenizer` argument to `processing_class`. The install is
    # unpinned, so use whichever keyword the installed release accepts.
    training_args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in training_args_params else "evaluation_strategy"
    )
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    training_args = TrainingArguments(
        output_dir="./distilbert-sentiment",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=100,
        save_strategy="no",  # no checkpoints are written
        fp16=True,           # this branch only runs when CUDA is available
        **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
    )

    def compute_metrics(eval_pred):
        """Return accuracy for one evaluation pass.

        `eval_pred` unpacks into (model outputs, labels); the predicted class is
        the argmax over axis 1 of the outputs (presumably per-class logits).
        """
        preds, labels = eval_pred
        preds = np.argmax(preds, axis=1)
        return {"accuracy": (preds == labels).mean()}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train,
        eval_dataset=hf_test,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(eval_results)
else:
    print("GPU not available: skip DistilBERT fine-tune")


## Notes and suggestions
- For quick iteration, reduce the dataset sizes in the data-loading cell (see the optional subsetting step there).
- Tune TF-IDF and classifier hyperparameters based on validation performance.
- Use cross-validation for robust estimates when dataset size permits.
- DistilBERT training requires a GPU for reasonable speed.