In [17]:
# shared_setup.py
import numpy as np
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# --------- 1. Load IMDB train/test ----------
train_path = "aclImdb/train"
test_path  = "aclImdb/test"

train_data = load_files(
    train_path,
    categories=['pos', 'neg'],
    encoding='utf-8'
)
test_data = load_files(
    test_path,
    categories=['pos', 'neg'],
    encoding='utf-8'
)

X_all = train_data.data
y_all = train_data.target
X_test_text = test_data.data
y_test = test_data.target

print("Train samples:", len(X_all))
print("Test samples :", len(X_test_text))

# --------- 2. Split into train / validation ----------
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_all, y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all
)

print("Train:", len(X_train_text))
print("Val  :", len(X_val_text))
print("Test :", len(X_test_text))

def report_errors(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    err = 1.0 - acc
    print(f"{name} accuracy: {acc:.4f} | error rate: {err:.4f}")


Train samples: 25000
Test samples : 25000
Train: 20000
Val  : 5000
Test : 25000


In [19]:
# logistic_tfidf_grid.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# from shared_setup import X_train_text, X_val_text, X_test_text, y_train, y_val, y_test, report_errors

# ---------- Pipeline: TF–IDF + LogisticRegression ----------
logreg_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=1000, solver="liblinear"))
])

# ---------- Grid of hyperparameters ----------
param_grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],      # unigrams vs uni+bi
    "tfidf__max_features": [10000, 20000],
    "clf__C": [0.1, 1.0, 10.0],                # regularization strength
}

grid = GridSearchCV(
    logreg_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring="accuracy",
    verbose=1
)

# ---------- Fit grid search only on TRAIN ----------
grid.fit(X_train_text, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

best_logreg = grid.best_estimator_

# ---------- Evaluate on TRAIN / VAL / TEST ----------
y_train_pred = best_logreg.predict(X_train_text)
y_val_pred   = best_logreg.predict(X_val_text)
y_test_pred  = best_logreg.predict(X_test_text)

print("\n=== Logistic Regression with TF–IDF + n-grams ===")
report_errors("Train", y_train, y_train_pred)
report_errors("Val  ", y_val,   y_val_pred)
report_errors("Test ", y_test,  y_test_pred)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best params: {'clf__C': 10.0, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 2)}
Best CV accuracy: 0.8916998119490964

=== Logistic Regression with TF–IDF + n-grams ===
Train accuracy: 0.9916 | error rate: 0.0084
Val   accuracy: 0.9022 | error rate: 0.0978
Test  accuracy: 0.8940 | error rate: 0.1060


In [None]:
# svm_tfidf_grid.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# from shared_setup import X_train_text, X_val_text, X_test_text, y_train, y_val, y_test, report_errors

svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LinearSVC())   # linear SVM for text
])

param_grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__max_features": [10000, 20000],
    "clf__C": [0.01, 0.1, 1.0, 10.0],
}

grid = GridSearchCV(
    svm_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring="accuracy",
    verbose=1
)

grid.fit(X_train_text, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

best_svm = grid.best_estimator_

y_train_pred = best_svm.predict(X_train_text)
y_val_pred   = best_svm.predict(X_val_text)
y_test_pred  = best_svm.predict(X_test_text)

print("\n=== Linear SVM with TF–IDF ===")
report_errors("Train", y_train, y_train_pred)
report_errors("Val  ", y_val,   y_val_pred)
report_errors("Test ", y_test,  y_test_pred)


In [None]:
# bilstm_imdb_grid.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import ParameterGrid
from collections import Counter
import re

# from shared_setup import X_train_text, X_val_text, X_test_text, y_train, y_val, y_test, report_errors

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------- 1. Simple tokenizer ----------
def simple_tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9]+", " ", text)
    return text.strip().split()

# ---------- 2. Build vocabulary ----------
def build_vocab(texts, min_freq=5, max_size=20000):
    counter = Counter()
    for t in texts:
        counter.update(simple_tokenize(t))
    # Reserve 0 for PAD, 1 for UNK
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, freq in counter.most_common():
        if freq < min_freq:
            break
        if len(vocab) >= max_size:
            break
        vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(X_train_text)
vocab_size = len(vocab)
print("Vocab size:", vocab_size)

def encode(text, vocab, max_len=300):
    tokens = simple_tokenize(text)
    ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens][:max_len]
    if len(ids) < max_len:
        ids += [vocab["<PAD>"]] * (max_len - len(ids))
    return ids

# ---------- 3. Dataset ----------
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=300):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        ids = torch.tensor(
            encode(self.texts[idx], self.vocab, self.max_len),
            dtype=torch.long
        )
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return ids, label

train_dataset = TextDataset(X_train_text, y_train, vocab)
val_dataset   = TextDataset(X_val_text,   y_val,   vocab)
test_dataset  = TextDataset(X_test_text,  y_test,  vocab)

# ---------- 4. BiLSTM model ----------
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        emb = self.embedding(x)          # (B, T, E)
        out, (h_n, c_n) = self.lstm(emb) # h_n: (num_layers*2, B, H)
        # Concatenate last forward and backward hidden states
        h_forward = h_n[-2,:,:]
        h_backward = h_n[-1,:,:]
        h = torch.cat([h_forward, h_backward], dim=1)  # (B, 2H)
        h = self.dropout(h)
        logits = self.fc(h)
        return logits

# ---------- 5. Training / eval loops ----------
def train_one_model(params):
    print("\nTraining BiLSTM with params:", params)
    model = BiLSTMClassifier(
        vocab_size=vocab_size,
        embed_dim=params["embed_dim"],
        hidden_dim=params["hidden_dim"],
        num_layers=params["num_layers"],
        dropout=params["dropout"],
        num_classes=2
    ).to(device)

    train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=params["batch_size"])
    test_loader  = DataLoader(test_dataset,  batch_size=params["batch_size"])

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])

    best_val_acc = 0.0
    best_state = None

    for epoch in range(params["epochs"]):
        model.train()
        total_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * X_batch.size(0)

        # validation
        model.eval()
        y_val_all, y_val_pred_all = [], []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                logits = model(X_batch)
                preds = torch.argmax(logits, dim=1)
                y_val_all.append(y_batch.cpu())
                y_val_pred_all.append(preds.cpu())
        y_val_all = torch.cat(y_val_all).numpy()
        y_val_pred_all = torch.cat(y_val_pred_all).numpy()
        val_acc = accuracy_score(y_val_all, y_val_pred_all)
        print(f"Epoch {epoch+1}: val_acc={val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = model.state_dict()

    # load best state
    model.load_state_dict(best_state)

    # compute train/val/test errors
    def eval_loader(loader, y_true_full):
        model.eval()
        y_pred_all = []
        with torch.no_grad():
            for X_batch, _ in loader:
                X_batch = X_batch.to(device)
                logits = model(X_batch)
                preds = torch.argmax(logits, dim=1)
                y_pred_all.append(preds.cpu())
        y_pred_all = torch.cat(y_pred_all).numpy()
        return y_pred_all

    train_loader_eval = DataLoader(train_dataset, batch_size=params["batch_size"])
    val_loader_eval   = DataLoader(val_dataset,   batch_size=params["batch_size"])
    test_loader_eval  = DataLoader(test_dataset,  batch_size=params["batch_size"])

    y_train_pred = eval_loader(train_loader_eval, y_train)
    y_val_pred   = eval_loader(val_loader_eval,   y_val)
    y_test_pred  = eval_loader(test_loader_eval,  y_test)

    print("\n=== BiLSTM Results ===")
    report_errors("Train", y_train, y_train_pred)
    report_errors("Val  ", y_val,   y_val_pred)
    report_errors("Test ", y_test,  y_test_pred)

    return best_val_acc

# ---------- 6. Manual "grid search" ----------
param_grid = {
    "embed_dim":  [100],
    "hidden_dim": [128, 256],
    "num_layers": [1, 2],
    "dropout":    [0.3, 0.5],
    "batch_size": [64],
    "lr":         [1e-3],
    "epochs":     [5],   # bump if you have time/GPU
}

best_overall_val = 0.0
best_params = None

for params in ParameterGrid(param_grid):
    val_acc = train_one_model(params)
    if val_acc > best_overall_val:
        best_overall_val = val_acc
        best_params = params

print("\nBest BiLSTM config:", best_params)
print("Best BiLSTM val acc:", best_overall_val)


In [None]:
# distilbert_imdb_grid.py
import torch
from torch.utils.data import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score

# from shared_setup import X_train_text, X_val_text, X_test_text, y_train, y_val, y_test, report_errors

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

max_length = 256

class IMDBDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length
        )
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = IMDBDataset(X_train_text, y_train, tokenizer, max_length)
val_dataset   = IMDBDataset(X_val_text,   y_val,   tokenizer, max_length)
test_dataset  = IMDBDataset(X_test_text,  y_test,  tokenizer, max_length)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

def run_one_config(params):
    print("\nFine-tuning DistilBERT with params:", params)
    model = DistilBertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )

    training_args = TrainingArguments(
        output_dir="./distilbert_imdb",
        num_train_epochs=params["epochs"],
        per_device_train_batch_size=params["batch_size"],
        per_device_eval_batch_size=params["batch_size"],
        learning_rate=params["lr"],
        evaluation_strategy="epoch",
        save_strategy="no",
        logging_steps=100,
        load_best_model_at_end=False,
        report_to=[],
        no_cuda=(not torch.cuda.is_available()),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Eval on train/val/test
    train_metrics = trainer.evaluate(train_dataset)
    val_metrics   = trainer.evaluate(val_dataset)
    test_metrics  = trainer.evaluate(test_dataset)

    print("\n=== DistilBERT Results ===")
    report_errors("Train", y_train, trainer.predict(train_dataset).predictions.argmax(axis=-1))
    report_errors("Val  ", y_val,   trainer.predict(val_dataset).predictions.argmax(axis=-1))
    report_errors("Test ", y_test,  trainer.predict(test_dataset).predictions.argmax(axis=-1))

    return val_metrics["eval_accuracy"]

param_grid = {
    "lr":      [2e-5, 5e-5],
    "batch_size": [16],
    "epochs":  [2, 3],
}

best_val = 0.0
best_params = None

for params in ParameterGrid(param_grid):
    val_acc = run_one_config(params)
    if val_acc > best_val:
        best_val = val_acc
        best_params = params

print("\nBest DistilBERT config:", best_params)
print("Best DistilBERT val acc:", best_val)


In [16]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# ---------------------------------------------------------------
# 1. Load TRAIN and TEST sets
# ---------------------------------------------------------------
personal_path = "/Users/jensonphan/git-test"
train_path = f"{personal_path}/aclImdb/train"
test_path  = f"{personal_path}/aclImdb/test"

train_data = load_files( train_path, categories=['pos', 'neg'], encoding='utf-8' )
test_data = load_files( test_path, categories=['pos', 'neg'], encoding='utf-8' )
# Train
X_text_all = train_data.data
y_all       = train_data.target
# Test
X_test_text = test_data.data
y_test      = test_data.target

# ---------------------------------------------------------------
# 2. Split TRAIN into train/validation for model selection
# ---------------------------------------------------------------
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_text_all, y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all
)

print("Training samples:", len(X_train_text))
print("Validation samples:", len(X_val_text))

# ---------------------------------------------------------------
# 3. Vectorizer (fit ONLY on training part during model selection)
# ---------------------------------------------------------------
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 1)  # unigrams only; you can change this later
)

X_train = vectorizer.fit_transform(X_train_text)
X_val   = vectorizer.transform(X_val_text)

print("Train matrix shape:", X_train.shape)

# ---------------------------------------------------------------
# 4. Model selection phase: train + validate
#    (Here we just use a single config; you could GridSearch instead)
# ---------------------------------------------------------------
model = LogisticRegression(
    max_iter=1000,
    C=1.0,
    solver='liblinear'
)

model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print("\n=== Validation Performance (for model selection) ===")
print("Validation Accuracy:", round(val_acc * 100, 2), "%")
print(classification_report(y_val, y_val_pred, target_names=['neg', 'pos']))

# ----------------------------------------------------------------
# >>> At this point, you would normally:
#     - compare different C, ngram_range, etc.
#     - pick the best config based on validation scores.
#     We’ll assume the current config is the chosen one.
# ----------------------------------------------------------------

# ---------------------------------------------------------------
# 5. FINAL TRAINING: retrain on FULL TRAINING DATA (train + val)
# ---------------------------------------------------------------

# Combine train + val text and labels
X_full_text = X_train_text + X_val_text   # lists of strings can be concatenated
y_full      = np.concatenate([y_train, y_val])

print("\nRetraining final model on full training data (train + val)...")
print("Full training samples:", len(X_full_text))

# Refit vectorizer on ALL training text
# (this lets it learn from all available labeled data)
vectorizer_final = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 1)
)
X_full = vectorizer_final.fit_transform(X_full_text)

# New model instance (same hyperparameters as chosen above)
final_model = LogisticRegression(
    max_iter=1000,
    C=1.0,
    solver='liblinear'
)

final_model.fit(X_full, y_full)

# ---------------------------------------------------------------
# 6. FINAL EVALUATION ON HELD-OUT TEST SET
# ---------------------------------------------------------------
# Transform test data with the FINAL vectorizer
X_test = vectorizer_final.transform(X_test_text)

y_test_pred = final_model.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

print("\n=== FINAL TEST PERFORMANCE (unseen data) ===")
print("Test Accuracy:", round(test_acc * 100, 2), "%")
print(classification_report(y_test, y_test_pred, target_names=['neg', 'pos']))


Loaded train samples: 25000
Loaded test samples : 25000
Training samples: 20000
Validation samples: 5000
Train matrix shape: (20000, 20000)

=== Validation Performance (for model selection) ===
Validation Accuracy: 89.12 %
              precision    recall  f1-score   support

         neg       0.89      0.89      0.89      2500
         pos       0.89      0.90      0.89      2500

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000


Retraining final model on full training data (train + val)...
Full training samples: 25000

=== FINAL TEST PERFORMANCE (unseen data) ===
Test Accuracy: 88.37 %
              precision    recall  f1-score   support

         neg       0.88      0.88      0.88     12500
         pos       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg     

In [15]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

dataset_path_train = "/Users/jensonphan/git-test/aclImdb/train"  # adjust for your environment
data_train = load_files(dataset_path_train, categories=['pos', 'neg'], encoding='utf-8')
X_text_train = data.data; y_train = data_train.target
print("Loaded", len(X_text_train), "training samples.")

dataset_path_test = "/Users/jensonphan/git-test/aclImdb/test"  # adjust for your environment
data_test = load_files(dataset_path_test, categories=['pos', 'neg'], encoding='utf-8')
X_text_test = data.data; y_test = data.target
print("Loaded", len(X_text_test), "testing samples.")

vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,1))
X_train_vec = vectorizer.fit_transform(X_text_train)
X_test_vec = vectorizer.transform(X_text_test)
model = LogisticRegression( max_iter=1000, C=1.0, solver='liblinear')
model.fit(X_train_vec, y_train)
y_test_pred = model.predict(X_test_vec)
test_acc = accuracy_score(y_test, y_test_pred)

print("\Testing Accuracy:", round(test_acc * 100, 2), "%")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

y_train_pred = model.predict(X_train_vec)
training_acc = accuracy_score(y_train, y_train_pred)
print("\Training Accuracy:", round(training_acc * 100, 2), "%")
print(classification_report(y_train, y_train_pred))


Loaded 25000 training samples.
Loaded 25000 testing samples.
\Testing Accuracy: 92.89 %

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93     12500
           1       0.93      0.93      0.93     12500

    accuracy                           0.93     25000
   macro avg       0.93      0.93      0.93     25000
weighted avg       0.93      0.93      0.93     25000

\Training Accuracy: 92.89 %
              precision    recall  f1-score   support

           0       0.93      0.92      0.93     12500
           1       0.93      0.93      0.93     12500

    accuracy                           0.93     25000
   macro avg       0.93      0.93      0.93     25000
weighted avg       0.93      0.93      0.93     25000



In [5]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# ---------------------------------------------------------------
# 1. Load IMDB dataset (assumes local folder with pos/neg subfolders)
# ---------------------------------------------------------------
dataset_path = "/Users/jensonphan/git-test/aclImdb/train"  # adjust for your environment
data = load_files(dataset_path, categories=['pos', 'neg'], encoding='utf-8')

X_text = data.data; y = data.target
print("Loaded", len(X_text), "training samples.")

# ---------------------------------------------------------------
# 2. Train/Validation Split
# ---------------------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))

# ---------------------------------------------------------------
# 3. Vectorize with Bag-of-Words or TF–IDF
# ---------------------------------------------------------------
# Option 1: CountVectorizer (Bag of Words)
# vectorizer = CountVectorizer(max_features=20001, ngram_range=(1,1))

# Option 2: TF–IDF (Recommended)
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,1))
X_train_vec = vectorizer.fit_transform(X_train)
#X_test_vec = vectorizer.fit_transform(X_)
X_val_vec = vectorizer.transform(X_val)

print("Vectorized shape:", X_train_vec.shape)

# ---------------------------------------------------------------
# 4. Train Logistic Regression
# ---------------------------------------------------------------
model = LogisticRegression(
    max_iter=1000,
    C=1.0,              # regularization
    solver='liblinear'  # good for sparse text data
)

model.fit(X_train_vec, y_train)

# ---------------------------------------------------------------
# 5. Validation Performance
# ---------------------------------------------------------------
y_val_pred = model.predict(X_val_vec)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("\nValidation Accuracy:", round(val_accuracy * 100, 2), "%")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Loaded 25000 training samples.
Training samples: 20000
Validation samples: 5000
Vectorized shape: (20000, 20000)

Validation Accuracy: 89.12 %

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      2500
           1       0.89      0.90      0.89      2500

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



In [9]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# ---------------------------------------------------------------
# 1. Load IMDB dataset (assumes local folder with pos/neg subfolders)
# ---------------------------------------------------------------
dataset_path = "/Users/jensonphan/git-test/aclImdb/train"  # adjust for your environment
data = load_files(dataset_path, categories=['pos', 'neg'], encoding='utf-8')

X_text = data.data; y = data.target

print("Loaded", len(X_text), "training samples.")

# ---------------------------------------------------------------
# 2. Train/Validation Split
# ---------------------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))

# ---------------------------------------------------------------
# 3. Vectorize with Bag-of-Words or TF–IDF
# ---------------------------------------------------------------
# Option 1: CountVectorizer (Bag of Words)
vectorizer = CountVectorizer(max_features=90000, ngram_range=(1,1))

# Option 2: TF–IDF (Recommended)
#vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,1))
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

print("Vectorized shape:", X_train_vec.shape)

# ---------------------------------------------------------------
# 4. Train Logistic Regression
# ---------------------------------------------------------------
model = LogisticRegression(
    max_iter=1000,
    C=1.0,              # regularization
    solver='liblinear'  # good for sparse text data
)

model.fit(X_train_vec, y_train)

# ---------------------------------------------------------------
# 5. Validation Performance
# ---------------------------------------------------------------
y_val_pred = model.predict(X_val_vec)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("\nValidation Accuracy:", round(val_accuracy * 100, 2), "%")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))


Loaded 25000 training samples.
Training samples: 20000
Validation samples: 5000
Vectorized shape: (20000, 68439)

Validation Accuracy: 87.86 %

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      2500
           1       0.88      0.88      0.88      2500

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [6]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# ---------------------------------------------------------------
# 1. Load IMDB dataset (train portion)
# ---------------------------------------------------------------
dataset_path = "aclImdb/train"  # adjust if needed

data = load_files(
    dataset_path,
    categories=['pos', 'neg'],
    encoding='utf-8'
)

X_text = data.data       # list of review strings
y = data.target          # 0/1 labels: e.g. 0=neg, 1=pos

print("Loaded", len(X_text), "training samples.")

# ---------------------------------------------------------------
# 2. Train/Validation Split
# ---------------------------------------------------------------
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", len(X_train_text))
print("Validation samples:", len(X_val_text))

# ---------------------------------------------------------------
# 3. Vectorize with Bag-of-Words (word counts)
# ---------------------------------------------------------------
vectorizer = CountVectorizer(
    max_features=20000,
    ngram_range=(1, 1),  # unigrams only for now
)

X_train = vectorizer.fit_transform(X_train_text)
X_val = vectorizer.transform(X_val_text)

print("BoW matrix shape:", X_train.shape)  # (n_samples, vocab_size)

# ---------------------------------------------------------------
# 4. Baseline 1: Majority Class Classifier
# ---------------------------------------------------------------
majority_clf = DummyClassifier(strategy='most_frequent')

majority_clf.fit(X_train, y_train)

y_val_pred_majority = majority_clf.predict(X_val)
val_acc_majority = accuracy_score(y_val, y_val_pred_majority)

print("\n=== Majority Class Classifier ===")
print("Validation Accuracy:", round(val_acc_majority * 100, 2), "%")
print(classification_report(y_val, y_val_pred_majority, target_names=['neg', 'pos']))

# ---------------------------------------------------------------
# 5. Baseline 2: Multinomial Naive Bayes
# ---------------------------------------------------------------
nb_clf = MultinomialNB(alpha=1.0)  # Laplace smoothing

nb_clf.fit(X_train, y_train)

y_val_pred_nb = nb_clf.predict(X_val)
val_acc_nb = accuracy_score(y_val, y_val_pred_nb)

print("\n=== Multinomial Naive Bayes ===")
print("Validation Accuracy:", round(val_acc_nb * 100, 2), "%")
print(classification_report(y_val, y_val_pred_nb, target_names=['neg', 'pos']))



Loaded 25000 training samples.
Training samples: 20000
Validation samples: 5000
BoW matrix shape: (20000, 20000)

=== Majority Class Classifier ===
Validation Accuracy: 50.0 %
              precision    recall  f1-score   support

         neg       0.50      1.00      0.67      2500
         pos       0.00      0.00      0.00      2500

    accuracy                           0.50      5000
   macro avg       0.25      0.50      0.33      5000
weighted avg       0.25      0.50      0.33      5000


=== Multinomial Naive Bayes ===
Validation Accuracy: 84.38 %
              precision    recall  f1-score   support

         neg       0.82      0.88      0.85      2500
         pos       0.87      0.81      0.84      2500

    accuracy                           0.84      5000
   macro avg       0.85      0.84      0.84      5000
weighted avg       0.85      0.84      0.84      5000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
