In [7]:
# Cache switch read by the data-loading cell: when True that cell calls dl.load_data on the
# raw CSV and rewrites the gzip-pickle cache; when False it loads the existing cache instead.
REGENERATE_DATA = False

In [8]:
from utils import dataloader as dl
import pickle
import gzip
from pathlib import Path

# Raw CSV handed to dl.load_data when the cache is rebuilt.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable data dir.
raw_csv_path = "/Users/ifigeneiastathaki/Desktop/projects/LDA&regression/data/raw_data/electronics_small.csv"

# A Path (not a plain str) so that .resolve() in the status message works;
# gzip.open accepts path-like objects, so both branches can use it directly.
pkl_path = Path("amazon_reviews_preproc.pkl.gz")

if REGENERATE_DATA:
    # Rebuild the four splits from the raw CSV and cache them for later sessions.
    X_train, y_train, X_test, y_test = dl.load_data(raw_csv_path)
    with gzip.open(pkl_path, "wb") as f:
        pickle.dump(
            {
                "X_train": X_train,
                "y_train": y_train,
                "X_test":  X_test,
                "y_test":  y_test,
            },
            f,
            protocol=pickle.HIGHEST_PROTOCOL,
        )

    print(f"Saved preprocessed data to {pkl_path.resolve()}")

else:
    # Fail early with an actionable message instead of a bare error from gzip.open.
    if not pkl_path.exists():
        raise FileNotFoundError(
            f"Cache {pkl_path.resolve()} not found; set REGENERATE_DATA = True to build it."
        )

    # SECURITY: pickle.load can execute arbitrary code — only load a cache file
    # that was produced by this notebook.
    with gzip.open(pkl_path, "rb") as f:
        data = pickle.load(f)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_test  = data["X_test"]
    y_test  = data["y_test"]

In [9]:
# Bag-of-words for the training documents plus the id2word mapping returned alongside it.
bow_corpus_train, id2word = dl.bagofwords(X_train)

# Encode every test document with that same id2word mapping.
bow_corpus_test = list(map(id2word.doc2bow, X_test))

BoW: 100%|██████████| 2399630/2399630 [04:22<00:00, 9155.63it/s]  


In [18]:
from pathlib import Path
import numpy as np, matplotlib.pyplot as plt, pickle, gzip
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from tqdm.auto import tqdm

# Reload utils.model BEFORE pulling names out of it: names bound by `from ... import`
# are not rebound by importlib.reload, so reloading afterwards would leave
# train_lda / corpus_to_dense / train_logistic_regression pointing at the stale code.
import importlib
import utils.model as ml

ml = importlib.reload(ml)
from utils.model import train_lda, corpus_to_dense, train_logistic_regression

# PARAMETERS
K_grid         = [20, 40, 60, 80, 100, 120, 140, 160]
sample_frac    = 0.10         # stratified share of the training set used for the K sweep
val_frac       = 0.10         # share of that sample held out for validation
final_frac     = 0.25         # stratified share used to retrain the winning K
lda_passes     = 3            # single source of truth for the sweep, the retrain
lda_iterations = 25           #   and the metadata written next to the θ-vectors
random_state   = 42
out_dir        = Path("lda_grid")
out_dir.mkdir(exist_ok=True)
fig_path       = out_dir / "val_f1_vs_K.png"
theta_path     = Path("theta_vectors_bestK.pkl.gz")

# Labels as a NumPy array so that every lookup below is positional, whatever
# container y_train currently is (other cells also coerce it with np.asarray).
y_train_arr = np.asarray(y_train)

# STRATIFIED sample for the sweep
sss = StratifiedShuffleSplit(n_splits=1, train_size=sample_frac,
                             random_state=random_state)
sample_idx, _ = next(sss.split(np.zeros(len(y_train_arr)), y_train_arr))

bow_sample = [bow_corpus_train[i] for i in sample_idx]
# Kept aligned with bow_sample; not used further in this cell.
# NOTE(review): X_train[i] is positional only if X_train is a list/array — confirm.
tok_sample = [X_train[i]         for i in sample_idx]
y_sample   = y_train_arr[sample_idx]

# remove empties (documents whose bag-of-words is empty)
mask       = [bool(doc) for doc in bow_sample]
bow_sample = [d for d, ok in zip(bow_sample, mask) if ok]
tok_sample = [t for t, ok in zip(tok_sample, mask) if ok]
y_sample   = y_sample[np.asarray(mask, dtype=bool)]

# Train/validation split INSIDE sample (stratified on the labels)
train_idx, val_idx = train_test_split(
    np.arange(len(bow_sample)),
    test_size=val_frac,
    stratify=y_sample,
    random_state=random_state,
)
bow_train_sub  = [bow_sample[i] for i in train_idx]
bow_val_sub    = [bow_sample[i] for i in val_idx]
y_train_sub    = y_sample[train_idx]
y_val_sub      = y_sample[val_idx]

# Grid search over K: one LDA per K, scored by the validation macro-F1 of a
# logistic regression fitted on the resulting document vectors.
scores = []
for K in tqdm(K_grid, desc="LDA grid"):
    lda = train_lda(
        bow_train_sub, id2word,
        k=K, passes=lda_passes, iterations=lda_iterations,
        workers=None
    )

    # infer θ
    X_train_theta = corpus_to_dense(lda, bow_train_sub)
    X_val_theta   = corpus_to_dense(lda, bow_val_sub)

    # Same classifier settings for every K so that only the topic count varies.
    clf = train_logistic_regression(
        X_train_theta, y_train_sub,
        balance=False,
        class_weight = True,         # no undersample, we rely on class_weight
        C=1.0,                  # keep logistic params constant
        max_iter=2000
    )

    y_pred_val = clf.predict(X_val_theta)
    f1 = f1_score(y_val_sub, y_pred_val, average="macro")
    scores.append((K, f1))
    print(f"K={K}: macro-F1={f1:.4f}")

# Choose best K (highest validation macro-F1; the first one wins on ties)
best_K, best_f1 = max(scores, key=lambda t: t[1])
print(f"Best K = {best_K} (val macro-F1 = {best_f1:.4f})")

# plot curve — explicit Figure/Axes instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(*zip(*scores), marker="o")
ax.set_title("Validation macro-F1 vs K")
ax.set_xlabel("Num. topics (K)")
ax.set_ylabel("Macro F1")
ax.grid(True)
fig.savefig(fig_path, dpi=150, bbox_inches="tight")
plt.close(fig)
print("Saved figure →", fig_path)

# Retrain BEST K on a larger stratified sample, skipping empty documents
sss2 = StratifiedShuffleSplit(n_splits=1, train_size=final_frac,
                              random_state=random_state)
idx2, _ = next(sss2.split(np.zeros(len(y_train_arr)), y_train_arr))
bow_train_big = [bow_corpus_train[i] for i in idx2 if bow_corpus_train[i]]

lda_best = train_lda(
    bow_train_big, id2word,
    k=best_K, passes=lda_passes, iterations=lda_iterations,
    workers=None
)

# θ-vectors for ALL docs, save to disk
X_train_vec = corpus_to_dense(lda_best, bow_corpus_train)
X_test_vec  = corpus_to_dense(lda_best, bow_corpus_test)

# The metadata reuses the constants passed to train_lda, so it cannot drift
# from the settings that actually produced the vectors.
with gzip.open(theta_path, "wb") as f:
    pickle.dump({"train": X_train_vec, "test": X_test_vec,
                 "K": best_K, "passes": lda_passes, "iterations": lda_iterations}, f,
                protocol=pickle.HIGHEST_PROTOCOL)
print(f"Saved θ-vectors → {theta_path}")


LDA grid:   0%|          | 0/2 [00:00<?, ?it/s]

Training LDA: K=1, passes=3, iterations=25, workers=7
LDA training complete


LDA grid:  50%|█████     | 1/2 [00:27<00:27, 27.61s/it]

K=1: macro-F1=0.0248
Training LDA: K=3, passes=3, iterations=25, workers=7
LDA training complete


LDA grid: 100%|██████████| 2/2 [01:20<00:00, 40.40s/it]

K=3: macro-F1=0.1762
Best K = 3 (val macro-F1 = 0.1762)





Saved figure → lda_grid/val_f1_vs_K.png
Training LDA: K=3, passes=3, iterations=25, workers=7
LDA training complete
Saved θ-vectors → theta_vectors_bestK.pkl.gz


In [19]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import defaultdict

# Dense θ matrices produced by the LDA cell. They get their own names so that
# X_train / X_test (the documents consumed by the bag-of-words and grid-search
# cells) are not overwritten and those cells remain re-runnable.
theta_train = np.asarray(X_train_vec)
theta_test  = np.asarray(X_test_vec)
y_train     = np.asarray(y_train)

# Class imbalance is handled by class_weight="balanced" on the full training
# set, so no manual undersampling is performed in this cell.
logreg = LogisticRegression(
    penalty="l2",
    C=1.0,
    solver="lbfgs",
    max_iter=1000,
    n_jobs=-1,
    multi_class="multinomial",
    class_weight = 'balanced'
)

logreg.fit(theta_train, y_train)

# Evaluate on the full test set.
y_pred = logreg.predict(theta_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))



Accuracy: 0.3074638111177047
              precision    recall  f1-score   support

           1      0.108     0.482     0.176     41155
           2      0.070     0.035     0.047     27232
           3      0.092     0.269     0.137     45648
           4      0.163     0.105     0.128    104056
           5      0.701     0.368     0.483    381817

    accuracy                          0.307    599908
   macro avg      0.227     0.252     0.194    599908
weighted avg      0.492     0.307     0.354    599908



In [6]:
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim import matutils


vocab_size = len(id2word)


def _doc_term_matrix(corpus):
    """Return `corpus` as a sparse matrix with one row per document and one column per term."""
    term_by_doc = matutils.corpus2csc(corpus, num_terms=vocab_size, dtype=float)
    return term_by_doc.T


X_train_bow = _doc_term_matrix(bow_corpus_train)
X_test_bow  = _doc_term_matrix(bow_corpus_test)

y_train = np.asarray(y_train)
y_test  = np.asarray(y_test)

rng = np.random.default_rng(42)

# Group training row positions by label, in order of first appearance.
indices_by_class = defaultdict(list)
for row, cls in enumerate(y_train):
    indices_by_class[cls].append(row)

# Undersample every class down to the size of the rarest one.
min_size = min(map(len, indices_by_class.values()))

per_class_draws = [
    rng.choice(rows, size=min_size, replace=False)
    for rows in indices_by_class.values()
]
balanced_idx = np.concatenate(per_class_draws)

X_balanced = X_train_bow[balanced_idx]
y_balanced = y_train[balanced_idx]

print("Balanced each class to", min_size, "samples")

# Multinomial logistic regression fitted on the balanced subset.
logreg = LogisticRegression(
    solver="saga",
    penalty="l2",
    C=1.0,
    multi_class="multinomial",
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)
logreg.fit(X_balanced, y_balanced)

# Score against the full (unbalanced) test set.
y_pred = logreg.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

Balanced each class to 108929 samples




Accuracy: 0.5222484114230849
              precision    recall  f1-score   support

           1      0.249     0.705     0.369     41155
           2      0.179     0.254     0.210     27232
           3      0.215     0.254     0.233     45648
           4      0.305     0.314     0.309    104056
           5      0.821     0.611     0.700    381817

    accuracy                          0.522    599908
   macro avg      0.354     0.427     0.364    599908
weighted avg      0.617     0.522     0.552    599908

