In [41]:
import os
import pandas as pd
import numpy as np
import json
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.utils.class_weight import compute_class_weight
from scikeras.wrappers import KerasClassifier
from tensorflow import keras

### Load and explore data

In [7]:
# Load data
X_train = pd.read_csv("../data/X_train.csv")
X_test = pd.read_csv("../data/X_test.csv")
y_train = pd.read_csv("../data/y_train.csv").squeeze()
y_test = pd.read_csv("../data/y_test.csv").squeeze()

# Print exploratory dataset information
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("\nLabel distribution in training set:")
print(y_train.value_counts(normalize=True))

Train shape: (796, 595)
Test shape: (199, 595)

Label distribution in training set:
y
2    0.201005
6    0.154523
1    0.153266
0    0.141960
4    0.136935
3    0.115578
5    0.096734
Name: proportion, dtype: float64


### RF with PCA

In [32]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipe_rf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("clf", RandomForestClassifier(class_weight="balanced_subsample", random_state=42))
])

param_grid = {
    "pca__n_components": [32, 64, 128],
    "clf__n_estimators": [500, 600, 700],
    "clf__max_depth": [None, 20],
    "clf__min_samples_leaf": [1, 2]
}

gs = GridSearchCV(pipe_rf, param_grid, scoring="f1_macro", cv=cv, n_jobs=-1, verbose=0)
gs.fit(X_train, y_train) # Use selected features

In [33]:
best = gs.best_estimator_
print("\n Best params:", gs.best_params_)
print("Best CV Macro-F1:", round(gs.best_score_, 3))

y_pred = best.predict(X_test)
print("\n RandomForest + PCA")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Macro-F1:",round(f1_score(y_test, y_pred, average='macro'), 3))
print(classification_report(y_test, y_pred, zero_division=0))


 Best params: {'clf__max_depth': 20, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 600, 'pca__n_components': 64}
Best CV Macro-F1: 0.258

 RandomForest + PCA
Accuracy: 0.276
Macro-F1: 0.23
              precision    recall  f1-score   support

           0       0.19      0.14      0.16        28
           1       0.13      0.07      0.09        30
           2       0.35      0.70      0.47        40
           3       0.31      0.22      0.26        23
           4       0.38      0.33      0.35        27
           5       0.14      0.10      0.12        20
           6       0.17      0.16      0.17        31

    accuracy                           0.28       199
   macro avg       0.24      0.25      0.23       199
weighted avg       0.25      0.28      0.25       199



### DenseNet with PCA

In [51]:
# Compute class weights for future data balancing
classes = np.unique(y_train)
cw = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, cw))

# Constants for model building
n_classes = len(np.unique(y_train))
input_dim = X_train_f.shape[1]

def build_ds_ffnn(meta, hidden_units, hidden_layers, dropout, lr):
    input_dim = meta["n_features_in_"]
    n_classes = meta["n_classes_"]
    
    inputs = keras.Input(shape=(input_dim,))
    x = keras.layers.Dense(hidden_units, activation="relu")(inputs)
    skips = [x]

    for _ in range(hidden_layers - 1):
        y = keras.layers.Concatenate()(skips)
        y = keras.layers.Dense(hidden_units, activation="relu")(y)
        y = keras.layers.Dropout(dropout)(y)
        skips.append(y)

    x = keras.layers.Concatenate()(skips)
    outputs = keras.layers.Dense(n_classes, activation="softmax")(x)

    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

pipe_ds_ffnn = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("clf", KerasClassifier(
        model=build_ds_ffnn,
        epochs=80,
        batch_size=64,
        verbose=0,
        validation_split = 0.15,
        callbacks=[keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=10, restore_best_weights=True
        )]
    ))
])

param_grid_ds_ffnn = {
    "pca__n_components": [32, 64, 128],
    "clf__model__hidden_units": [64, 128, 256],
    "clf__model__hidden_layers": [2, 3, 4],
    "clf__model__dropout": [0.1, 0.2],
    "clf__model__lr": [1e-3, 1e-4],
}

gs_ds_ffnn = GridSearchCV(
    pipe_ds_ffnn, param_grid_ds_ffnn,
    scoring="f1_macro", cv=cv, n_jobs=-1, verbose=0
)

In [52]:
gs_ds_ffnn.fit(X_train, y_train, clf__class_weight=class_weight)



In [54]:
best = gs_ds_ffnn.best_estimator_
print("\n Best params:", gs_ds_ffnn.best_params_)
print("Best CV Macro-F1:", round(gs_ds_ffnn.best_score_, 3))

y_pred = best.predict(X_test)
print("\n DenseNet + PCA")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Macro-F1:",round(f1_score(y_test, y_pred, average='macro'), 3))
print(classification_report(y_test, y_pred, zero_division=0))


 Best params: {'clf__model__dropout': 0.2, 'clf__model__hidden_layers': 4, 'clf__model__hidden_units': 64, 'clf__model__lr': 0.001, 'pca__n_components': 64}
Best CV Macro-F1: 0.254

 DenseNet + PCA
Accuracy: 0.276
Macro-F1: 0.271
              precision    recall  f1-score   support

           0       0.14      0.14      0.14        28
           1       0.25      0.33      0.29        30
           2       0.44      0.30      0.36        40
           3       0.32      0.30      0.31        23
           4       0.33      0.44      0.38        27
           5       0.23      0.30      0.26        20
           6       0.20      0.13      0.16        31

    accuracy                           0.28       199
   macro avg       0.27      0.28      0.27       199
weighted avg       0.28      0.28      0.27       199



### OpenL3 Models

In [None]:
# Keep ONLY L3 embeddings (e0...e511)
def keep_l3_embeddings(df):
    def is_l3(col):
        if not col.startswith("e"): 
            return False
        suf = col[1:]
        if not suf.isdigit(): 
            return False
        idx = int(suf)
        return 0 <= idx <= 511
    l3_cols = [c for c in df.columns if is_l3(c)]
    return df[l3_cols]

X_train_f = keep_l3_embeddings(X_train)
X_test_f  = keep_l3_embeddings(X_test)

print("Shapes (features):", X_train_f.shape, X_test_f.shape)

In [None]:
# Evaluate function
def evaluate(name, est):
    est.fit(X_train_f, y_train)
    y_pred = est.predict(X_test_f)
    acc = accuracy_score(y_test, y_pred)
    mf1 = f1_score(y_test, y_pred, average="macro")
    print(f"\n=== {name} ===")
    print("Accuracy:", acc)
    print("Macro-F1:", mf1)
    print(classification_report(y_test, y_pred, zero_division=0)) 
    return name, acc, mf1

# CV folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#### Non-deep Learning Methods

In [36]:
# MODEL 1: LOGISTIC REGRESSION
pipe_lr = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(max_iter=4000, class_weight="balanced", solver="lbfgs", random_state=42)),
])
gs_lr = GridSearchCV(pipe_lr, {"clf__C":[0.3, 1.0, 3.0]}, scoring="f1_macro", cv=cv, n_jobs=6)
gs_lr.fit(X_train_f, y_train)

# MODEL 2: LINEAR SVM
pipe_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LinearSVC(C=1.0, class_weight="balanced", random_state=42)),
])
gs_svm = GridSearchCV(pipe_svm, {"clf__C":[0.3, 1.0, 3.0]}, scoring="f1_macro", cv=cv, n_jobs=6)
gs_svm.fit(X_train_f, y_train)

# MODEL 3: RANDOM FOREST
rf = RandomForestClassifier(
    n_estimators=500, max_depth=None, min_samples_leaf=1,
    class_weight="balanced_subsample", random_state=42
)



In [13]:
# Collects summaries to print
rows = []
rows.append(evaluate("LogReg (scaled, balanced)", gs_lr.best_estimator_))
rows.append(evaluate("LinearSVC (scaled, balanced)", gs_svm.best_estimator_))
rows.append(evaluate("RandomForest (balanced_subsample)", rf))

# Prints summaries, also outputs best parameters for function
summary = pd.DataFrame(rows, columns=["model","test_accuracy","test_macro_f1"]).sort_values("test_macro_f1", ascending=False)
print("Summaries (sorted on macro_f1)")
print(summary)
print("\nBest params:")
print("- LogReg:", gs_lr.best_params_)
print("- LinearSVC:", gs_svm.best_params_)


=== LogReg (scaled, balanced) ===
Accuracy: 0.17587939698492464
Macro-F1: 0.1689376750574134
              precision    recall  f1-score   support

           0       0.08      0.07      0.08        28
           1       0.09      0.10      0.10        30
           2       0.28      0.25      0.26        40
           3       0.20      0.22      0.21        23
           4       0.26      0.30      0.28        27
           5       0.11      0.10      0.11        20
           6       0.15      0.16      0.16        31

    accuracy                           0.18       199
   macro avg       0.17      0.17      0.17       199
weighted avg       0.17      0.18      0.17       199


=== LinearSVC (scaled, balanced) ===
Accuracy: 0.21105527638190955
Macro-F1: 0.1998066323303515
              precision    recall  f1-score   support

           0       0.10      0.07      0.08        28
           1       0.19      0.20      0.19        30
           2       0.30      0.30      0.30      

#### Deep Learning Methods

In [39]:
# Compute class weights for future data balancing
classes = np.unique(y_train)
cw = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, cw))

# Constants for model building
n_classes = len(np.unique(y_train))
input_dim = X_train_f.shape[1]

In [46]:
def build_ds_ffnn(meta, hidden_units, hidden_layers, dropout, lr):
    input_dim = meta["n_features_in_"]
    n_classes = meta["n_classes_"]
    
    inputs = keras.Input(shape=(input_dim,))
    x = keras.layers.Dense(hidden_units, activation="relu")(inputs)
    skips = [x]

    for _ in range(hidden_layers - 1):
        y = keras.layers.Concatenate()(skips)
        y = keras.layers.Dense(hidden_units, activation="relu")(y)
        y = keras.layers.Dropout(dropout)(y)
        skips.append(y)

    x = keras.layers.Concatenate()(skips)
    outputs = keras.layers.Dense(n_classes, activation="softmax")(x)

    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

pipe_ds_ffnn = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", KerasClassifier(
        model=build_ds_ffnn,
        epochs=80,
        batch_size=64,
        verbose=0,
        validation_split = 0.15,
        callbacks=[keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=10, restore_best_weights=True
        )]
    ))
])

param_grid_ds_ffnn = {
    "clf__model__hidden_units": [64, 128, 256],
    "clf__model__hidden_layers": [2, 3, 4],
    "clf__model__dropout": [0, 0.1, 0.2],
    "clf__model__lr": [1e-3, 1e-4, 1e-5],
}

gs_ds_ffnn = GridSearchCV(
    pipe_ds_ffnn, param_grid_ds_ffnn,
    scoring="f1_macro", cv=cv, n_jobs=-1, verbose=0
)

gs_ds_ffnn.fit(X_train_f, y_train, clf__class_weight=class_weight)

In [48]:
# Print evaluation and best parameters
print(evaluate("Open-L3 DenseNet", gs_ds_ffnn.best_estimator_))
print("\nBest params:")
print(gs_ds_ffnn.best_params_)


=== Open-L3 DenseNet ===
Accuracy: 0.24623115577889448
Macro-F1: 0.22158765444470285
              precision    recall  f1-score   support

           0       0.17      0.18      0.18        28
           1       0.22      0.17      0.19        30
           2       0.35      0.47      0.40        40
           3       0.22      0.17      0.20        23
           4       0.21      0.26      0.23        27
           5       0.18      0.10      0.13        20
           6       0.24      0.23      0.23        31

    accuracy                           0.25       199
   macro avg       0.23      0.23      0.22       199
weighted avg       0.24      0.25      0.24       199

('Open-L3 DenseNet', 0.24623115577889448, 0.22158765444470285)

Best params:
{'clf__model__dropout': 0.2, 'clf__model__hidden_layers': 2, 'clf__model__hidden_units': 128, 'clf__model__lr': 0.0001}
