**Set working directory to folder with dataset**

In [1]:
import os

# Folder that holds X_train.csv, X_test.csv, y_train.csv and y_test.csv.
# Set the DATASET_DIR environment variable to point at your own folder (@Linka);
# the original author's path is only used as a fallback when it is not set.
DATASET_DIR = os.environ.get(
    "DATASET_DIR",
    'C:\\Users\\timk\\OneDrive\\Desktop\\full_dataset_folder',
)
os.chdir(DATASET_DIR)

os.getcwd()

'C:\\Users\\timk\\OneDrive\\Desktop\\full_dataset_folder'

**Load and explore data files**

In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.dummy import DummyClassifier

# Load data: feature matrices stay DataFrames, label files are squeezed
# (squeeze() collapses a single-column frame into a Series)
X_train, X_test = (pd.read_csv(path) for path in ("X_train.csv", "X_test.csv"))
y_train, y_test = (pd.read_csv(path).squeeze() for path in ("y_train.csv", "y_test.csv"))

# Print exploratory dataset information
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# Relative class frequencies (normalize=True gives proportions, not counts)
train_label_share = y_train.value_counts(normalize=True)
print("\nLabel distribution in training set:")
print(train_label_share)


Train shape: (796, 595)
Test shape: (199, 595)

Label distribution in training set:
y
2    0.201005
6    0.154523
1    0.153266
0    0.141960
4    0.136935
3    0.115578
5    0.096734
Name: proportion, dtype: float64


**Train two baselines: 1) majority and 2) random classifier**

In [15]:
# BASELINE 1: Majority classifier (strategy="most_frequent")
majority_baseline = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_pred_majority = majority_baseline.predict(X_test)
majority_accuracy = accuracy_score(y_test, y_pred_majority)
majority_macro_f1 = f1_score(y_test, y_pred_majority, average="macro")

print("MAJORITY BASELINE")
print("Accuracy:", majority_accuracy)
print("Macro-F1:", majority_macro_f1)
# zero_division=0: classes that are never predicted have an undefined precision;
# report it as 0 instead of emitting a warning.
print(classification_report(y_test, y_pred_majority, zero_division=0))

# BASELINE 2: Random classifier (strategy="uniform", seeded for reproducibility)
random_baseline = DummyClassifier(strategy="uniform", random_state=42).fit(X_train, y_train)
y_pred_random = random_baseline.predict(X_test)
random_accuracy = accuracy_score(y_test, y_pred_random)
random_macro_f1 = f1_score(y_test, y_pred_random, average="macro")

print("RANDOM BASELINE")
print("Accuracy:", random_accuracy)
print("Macro-F1:", random_macro_f1)
# Same zero_division handling as for the majority baseline.
print(classification_report(y_test, y_pred_random, zero_division=0))

MAJORITY BASELINE
Accuracy: 0.20100502512562815
Macro-F1: 0.04781829049611476
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.00      0.00      0.00        30
           2       0.20      1.00      0.33        40
           3       0.00      0.00      0.00        23
           4       0.00      0.00      0.00        27
           5       0.00      0.00      0.00        20
           6       0.00      0.00      0.00        31

    accuracy                           0.20       199
   macro avg       0.03      0.14      0.05       199
weighted avg       0.04      0.20      0.07       199

RANDOM BASELINE
Accuracy: 0.1708542713567839
Macro-F1: 0.166605585836632
              precision    recall  f1-score   support

           0       0.21      0.18      0.19        28
           1       0.19      0.17      0.18        30
           2       0.23      0.15      0.18        40
           3       0.06      0.09  

**Train and evaluate three models: LogReg, LinearSVM, RF (DROP OPENL3 FEATURES, ONLY ESSENTIA FEATURES)**

In [30]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Drop L3 embeddings (columns named e0 ... e511), keep every other feature
def drop_l3_embeddings(df):
    """Return a new DataFrame without the L3 embedding columns ``e0`` ... ``e511``.

    A column counts as an L3 embedding when its label is a string made of the
    letter ``e`` followed only by decimal digits whose integer value lies in
    ``[0, 511]``. All other columns are kept in their original order and the
    input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to filter.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the matching columns removed.
    """
    def is_l3(col):
        # Non-string labels (e.g. integer column names) cannot be embeddings.
        if not isinstance(col, str) or not col.startswith("e"):
            return False
        suf = col[1:]
        # isdecimal() only accepts characters that int() can parse; isdigit()
        # also accepts e.g. superscripts ("e²"), which made int() raise ValueError.
        if not suf.isdecimal():
            return False
        return 0 <= int(suf) <= 511

    cols_to_drop = [c for c in df.columns if is_l3(c)]
    return df.drop(columns=cols_to_drop, errors="ignore")

# Both splits with the L3 embedding columns removed
X_train_f, X_test_f = map(drop_l3_embeddings, (X_train, X_test))

print("Shapes (features):", X_train_f.shape, X_test_f.shape)

# Stratified 5-fold splitter, shuffled with a fixed seed
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Evaluate function
def evaluate(name, est):
    """Fit ``est`` on the training split, score it on the test split and print a report.

    Uses the notebook globals ``X_train_f``, ``X_test_f``, ``y_train`` and
    ``y_test`` as they are at call time.

    Parameters
    ----------
    name : label printed in the report header and returned with the scores.
    est : estimator with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple ``(name, accuracy, macro_f1)`` measured on the test split.
    """
    est.fit(X_train_f, y_train)
    predictions = est.predict(X_test_f)

    accuracy = accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average="macro")

    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy)
    print("Macro-F1:", macro_f1)
    # zero_division=0: a metric whose denominator is zero (e.g. precision of a
    # class that is never predicted) is reported as 0 instead of raising a warning.
    print(classification_report(y_test, predictions, zero_division=0))
    return name, accuracy, macro_f1

# MODEL 1: LOGISTIC REGRESSION
# Standardised features; C is chosen by grid search on cross-validated macro-F1.
pipe_lr = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        solver="lbfgs", max_iter=4000, class_weight="balanced", random_state=42,
    )),
])
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_lr.fit(X_train_f, y_train)

# MODEL 2: LINEAR SVM
# Same scaling and C grid as the logistic regression; the C=1.0 given here is
# overridden by the grid during the search.
pipe_svm = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LinearSVC(C=1.0, class_weight="balanced", random_state=42)),
])
gs_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_svm.fit(X_train_f, y_train)

# MODEL 3: RANDOM FOREST
# Fixed hyper-parameters (no grid search) and no scaling step.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    class_weight="balanced_subsample",
    random_state=42,
)

# Collects summaries to print: one (name, accuracy, macro-F1) tuple per model
rows = [
    evaluate("LogReg (scaled, balanced)", gs_lr.best_estimator_),
    evaluate("LinearSVC (scaled, balanced)", gs_svm.best_estimator_),
    evaluate("RandomForest (balanced_subsample)", rf),
]

# Prints summaries (best model first), then the C selected for each linear model
summary = pd.DataFrame(rows, columns=["model","test_accuracy","test_macro_f1"])
summary = summary.sort_values("test_macro_f1", ascending=False)
print("Summaries (sorted on macro_f1)")
print(summary)
print("\nBest params:")
print("- LogReg:", gs_lr.best_params_)
print("- LinearSVC:", gs_svm.best_params_)


Shapes (features): (796, 83) (199, 83)

=== LogReg (scaled, balanced) ===
Accuracy: 0.22110552763819097
Macro-F1: 0.21230367287598054
              precision    recall  f1-score   support

           0       0.15      0.14      0.15        28
           1       0.22      0.17      0.19        30
           2       0.36      0.35      0.35        40
           3       0.19      0.17      0.18        23
           4       0.24      0.30      0.26        27
           5       0.21      0.30      0.25        20
           6       0.11      0.10      0.10        31

    accuracy                           0.22       199
   macro avg       0.21      0.22      0.21       199
weighted avg       0.22      0.22      0.22       199


=== LinearSVC (scaled, balanced) ===
Accuracy: 0.20603015075376885
Macro-F1: 0.19239616485518127
              precision    recall  f1-score   support

           0       0.12      0.11      0.12        28
           1       0.20      0.13      0.16        30
        

**Train and evaluate three models: LogReg, LinearSVM, RF (ONLY OPENL3 FEATURES, DROP ESSENTIA FEATURES)**


Takes a couple of minutes to run; mostly informative as a comparison with the previous models.
The code is the same as in the previous section except for which features are kept; the rest is copied and pasted.

In [41]:
# Keep ONLY L3 embeddings (e0...e511)
def keep_l3_embeddings(df):
    """Return a DataFrame holding only the L3 embedding columns ``e0`` ... ``e511``.

    A column counts as an L3 embedding when its label is a string made of the
    letter ``e`` followed only by decimal digits whose integer value lies in
    ``[0, 511]``. The selected columns keep their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to filter.

    Returns
    -------
    pandas.DataFrame
        The matching columns of ``df`` (no columns if nothing matches).
    """
    def is_l3(col):
        # Non-string labels (e.g. integer column names) cannot be embeddings.
        if not isinstance(col, str) or not col.startswith("e"):
            return False
        suf = col[1:]
        # isdecimal() only accepts characters that int() can parse; isdigit()
        # also accepts e.g. superscripts ("e²"), which made int() raise ValueError.
        if not suf.isdecimal():
            return False
        return 0 <= int(suf) <= 511

    l3_cols = [c for c in df.columns if is_l3(c)]
    return df[l3_cols]

# Both splits restricted to the L3 embedding columns
X_train_f, X_test_f = map(keep_l3_embeddings, (X_train, X_test))

print("Shapes (features):", X_train_f.shape, X_test_f.shape)

# Stratified 5-fold splitter, shuffled with a fixed seed
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Evaluate function
def evaluate(name, est):
    """Fit ``est`` on the training split, score it on the test split and print a report.

    Uses the notebook globals ``X_train_f``, ``X_test_f``, ``y_train`` and
    ``y_test`` as they are at call time.

    Parameters
    ----------
    name : label printed in the report header and returned with the scores.
    est : estimator with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple ``(name, accuracy, macro_f1)`` measured on the test split.
    """
    est.fit(X_train_f, y_train)
    predictions = est.predict(X_test_f)

    accuracy = accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average="macro")

    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy)
    print("Macro-F1:", macro_f1)
    # zero_division=0: a metric whose denominator is zero is reported as 0
    # instead of raising a warning.
    print(classification_report(y_test, predictions, zero_division=0))
    return name, accuracy, macro_f1

# MODEL 1: LOGISTIC REGRESSION
# Standardised features; C is chosen by grid search on cross-validated macro-F1.
pipe_lr = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        solver="lbfgs", max_iter=4000, class_weight="balanced", random_state=42,
    )),
])
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_lr.fit(X_train_f, y_train)

# MODEL 2: LINEAR SVM
# Same scaling and C grid as the logistic regression; the C=1.0 given here is
# overridden by the grid during the search.
pipe_svm = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LinearSVC(C=1.0, class_weight="balanced", random_state=42)),
])
gs_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_svm.fit(X_train_f, y_train)

# MODEL 3: RANDOM FOREST
# Fixed hyper-parameters (no grid search) and no scaling step.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    class_weight="balanced_subsample",
    random_state=42,
)

# Collects summaries to print: one (name, accuracy, macro-F1) tuple per model
rows = [
    evaluate("LogReg (scaled, balanced)", gs_lr.best_estimator_),
    evaluate("LinearSVC (scaled, balanced)", gs_svm.best_estimator_),
    evaluate("RandomForest (balanced_subsample)", rf),
]

# Prints summaries (best model first), then the C selected for each linear model
summary = pd.DataFrame(rows, columns=["model","test_accuracy","test_macro_f1"])
summary = summary.sort_values("test_macro_f1", ascending=False)
print("Summaries (sorted on macro_f1)")
print(summary)
print("\nBest params:")
print("- LogReg:", gs_lr.best_params_)
print("- LinearSVC:", gs_svm.best_params_)


Shapes (features): (796, 512) (199, 512)

=== LogReg (scaled, balanced) ===
Accuracy: 0.17587939698492464
Macro-F1: 0.1689376750574134
              precision    recall  f1-score   support

           0       0.08      0.07      0.08        28
           1       0.09      0.10      0.10        30
           2       0.28      0.25      0.26        40
           3       0.20      0.22      0.21        23
           4       0.26      0.30      0.28        27
           5       0.11      0.10      0.11        20
           6       0.15      0.16      0.16        31

    accuracy                           0.18       199
   macro avg       0.17      0.17      0.17       199
weighted avg       0.17      0.18      0.17       199


=== LinearSVC (scaled, balanced) ===
Accuracy: 0.21608040201005024
Macro-F1: 0.20347968696657567
              precision    recall  f1-score   support

           0       0.12      0.07      0.09        28
           1       0.23      0.27      0.25        30
       

**Train and evaluate three models: LogReg, LinearSVM, RF (ALL FEATURES L3 AND ESSENTIA)**


Uses ALL the features that are in the dataset, without PCA or feature selection.

In [43]:
# No feature filtering: the models are given every column of both splits
X_train_f, X_test_f = X_train, X_test

print("Shapes (features):", X_train_f.shape, X_test_f.shape)

# Stratified 5-fold splitter, shuffled with a fixed seed
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Evaluate function
def evaluate(name, est):
    """Fit ``est`` on the training split, score it on the test split and print a report.

    Uses the notebook globals ``X_train_f``, ``X_test_f``, ``y_train`` and
    ``y_test`` as they are at call time.

    Parameters
    ----------
    name : label printed in the report header and returned with the scores.
    est : estimator with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple ``(name, accuracy, macro_f1)`` measured on the test split.
    """
    est.fit(X_train_f, y_train)
    predictions = est.predict(X_test_f)

    accuracy = accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average="macro")

    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy)
    print("Macro-F1:", macro_f1)
    # zero_division=0: a metric whose denominator is zero is reported as 0
    # instead of raising a warning.
    print(classification_report(y_test, predictions, zero_division=0))
    return name, accuracy, macro_f1

# MODEL 1: LOGISTIC REGRESSION
# Standardised features; C is chosen by grid search on cross-validated macro-F1.
pipe_lr = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        solver="lbfgs", max_iter=4000, class_weight="balanced", random_state=42,
    )),
])
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_lr.fit(X_train_f, y_train)

# MODEL 2: LINEAR SVM
# Same scaling and C grid as the logistic regression; the C=1.0 given here is
# overridden by the grid during the search.
pipe_svm = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LinearSVC(C=1.0, class_weight="balanced", random_state=42)),
])
gs_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid={"clf__C": [0.3, 1.0, 3.0]},
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
gs_svm.fit(X_train_f, y_train)

# MODEL 3: RANDOM FOREST
# Fixed hyper-parameters (no grid search) and no scaling step.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    class_weight="balanced_subsample",
    random_state=42,
)

# Collects summaries to print: one (name, accuracy, macro-F1) tuple per model
rows = [
    evaluate("LogReg (scaled, balanced)", gs_lr.best_estimator_),
    evaluate("LinearSVC (scaled, balanced)", gs_svm.best_estimator_),
    evaluate("RandomForest (balanced_subsample)", rf),
]

# Prints summaries (best model first), then the C selected for each linear model
summary = pd.DataFrame(rows, columns=["model","test_accuracy","test_macro_f1"])
summary = summary.sort_values("test_macro_f1", ascending=False)
print("Summaries (sorted on macro_f1)")
print(summary)
print("\nBest params:")
print("- LogReg:", gs_lr.best_params_)
print("- LinearSVC:", gs_svm.best_params_)


Shapes (features): (796, 595) (199, 595)

=== LogReg (scaled, balanced) ===
Accuracy: 0.20100502512562815
Macro-F1: 0.19173698241494855
              precision    recall  f1-score   support

           0       0.11      0.11      0.11        28
           1       0.12      0.10      0.11        30
           2       0.35      0.35      0.35        40
           3       0.18      0.17      0.18        23
           4       0.21      0.26      0.23        27
           5       0.21      0.25      0.23        20
           6       0.14      0.13      0.14        31

    accuracy                           0.20       199
   macro avg       0.19      0.20      0.19       199
weighted avg       0.20      0.20      0.20       199


=== LinearSVC (scaled, balanced) ===
Accuracy: 0.19597989949748743
Macro-F1: 0.183676715309054
              precision    recall  f1-score   support

           0       0.14      0.14      0.14        28
           1       0.19      0.20      0.19        30
        

**RANDOM FOREST WITH PCA**


RF is clearly the most successful option, so I decided to continue with it and try PCA. (I also tried other options, but I did not paste them here because they were really unsuccessful.) Again, I used 5-fold CV. PCA doesn't really seem to make a difference.

In [46]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Use every feature column; dimensionality reduction is left to the PCA step.
X_train_f = X_train
X_test_f  = X_test
print("Shapes (features):", X_train_f.shape, X_test_f.shape)

# Stratified 5-fold splitter, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random Forest with PCA
# Scaling and PCA live inside the pipeline, so they are re-fitted on the
# training part of every CV fold.
pipe_rf = Pipeline([
    ("scaler", StandardScaler()),
    # random_state pins the randomized SVD solver that PCA's default "auto"
    # policy can select; without it repeated runs may give different scores.
    ("pca", PCA(random_state=42)),
    ("clf", RandomForestClassifier(class_weight="balanced_subsample", random_state=42))
])

# 3 PCA sizes x 2 depths x 2 leaf sizes = 12 candidates (n_estimators is fixed)
param_grid = {
    "pca__n_components": [32, 64, 128],
    "clf__n_estimators": [400],
    "clf__max_depth": [None, 20],
    "clf__min_samples_leaf": [1, 2]
}

# Candidates are ranked by cross-validated macro-F1
gs = GridSearchCV(pipe_rf, param_grid, scoring="f1_macro", cv=cv, n_jobs=-1, verbose=0)
gs.fit(X_train_f, y_train)

# best_estimator_ is the winning pipeline refitted on the full training split
best = gs.best_estimator_
print("\n Best params:", gs.best_params_)
print("Best CV Macro-F1:", round(gs.best_score_, 3))

# Held-out test performance of the selected pipeline
y_pred = best.predict(X_test_f)
print("\n RandomForest + PCA")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Macro-F1:", round(f1_score(y_test, y_pred, average='macro'), 3))
# zero_division=0: a metric whose denominator is zero is reported as 0
# instead of raising a warning.
print(classification_report(y_test, y_pred, zero_division=0))


Shapes (features): (796, 595) (199, 595)

Best params: {'clf__max_depth': 20, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 400, 'pca__n_components': 64}
Best CV Macro-F1: 0.244

 RandomForest + PCA
Accuracy: 0.291
Macro-F1: 0.243
              precision    recall  f1-score   support

           0       0.21      0.21      0.21        28
           1       0.07      0.03      0.05        30
           2       0.38      0.72      0.50        40
           3       0.33      0.17      0.23        23
           4       0.43      0.37      0.40        27
           5       0.22      0.10      0.14        20
           6       0.16      0.19      0.18        31

    accuracy                           0.29       199
   macro avg       0.26      0.26      0.24       199
weighted avg       0.26      0.29      0.26       199

