In [2]:
import pandas as pd
import numpy as np
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [3]:
from pathlib import Path
# Folder holding the pre-split CSV files read by the loading cell below.
# The path is relative, so it resolves against the kernel's current working directory.
DATA_DIR = Path("../../data/clean_data_splits")

In [4]:
# Read the PCA-reduced feature matrices, one CSV per split (train, val, test).
X_train, X_val, X_test = (
    pd.read_csv(DATA_DIR / fname)
    for fname in ("X_train_pca.csv", "X_val_pca.csv", "X_test_pca.csv")
)

# Read the matching target tables; in y_train a value of -1 marks an unlabeled row.
y_train, y_val, y_test = (
    pd.read_csv(DATA_DIR / fname)
    for fname in ("y_train.csv", "y_val.csv", "y_test.csv")
)

# Quick sanity check on the training shapes.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")


X_train: (1611, 20), y_train: (1611, 3)


In [5]:
# Semi-supervised estimators to compare, keyed by the name used in logs and output file names.
# Both use an RBF kernel with gamma=20 and up to 1000 iterations; LabelSpreading also sets alpha=0.2.
ssl_methods = dict(
    [
        ("LabelPropagation", LabelPropagation(kernel='rbf', gamma=20, max_iter=1000)),
        ("LabelSpreading", LabelSpreading(kernel='rbf', gamma=20, alpha=0.2, max_iter=1000)),
    ]
)

In [None]:
# Fit every SSL method once per target column and collect its predicted labels
# for the train, validation and test splits.
y_train_pred = {}  # method name -> DataFrame of predictions on the train split
y_val_pred   = {}  # method name -> DataFrame of predictions on the val split
y_test_pred  = {}  # method name -> DataFrame of predictions on the test split

for method_name, model in ssl_methods.items():
    print(f"\n===== Training with {method_name} =====\n")

    # One output frame per split, mirroring the index and columns of the matching targets
    y_train_pred[method_name] = pd.DataFrame(index=y_train.index, columns=y_train.columns)
    y_val_pred[method_name]   = pd.DataFrame(index=y_val.index, columns=y_val.columns)
    y_test_pred[method_name]  = pd.DataFrame(index=y_test.index, columns=y_test.columns)

    for target in y_train.columns:
        print(f"Training target: {target}")

        # Target vector for this column: -1 for unlabeled, valid labels for labeled
        y_target = y_train[target].values

        # The same estimator object is re-fit for every target column
        model.fit(X_train.values, y_target)

        # Predict for train (filled labels), val, test.
        # NOTE(review): predict() is applied to every train row, so rows that already
        # had a label also receive the model's output — confirm this is intended.
        y_train_pred[method_name][target] = model.predict(X_train.values)
        y_val_pred[method_name][target]   = model.predict(X_val.values)
        y_test_pred[method_name][target]  = model.predict(X_test.values)

    print(f"Finished training {method_name}.\n")

# -------------------------------
# Save SSL-predicted datasets
# -------------------------------
# to_csv does not create missing folders, so make sure the output directory exists
# before writing; otherwise the cell fails only after all training has finished.
# NOTE(review): inputs are read from DATA_DIR but outputs go to ./data_splits
# (relative to the working directory) — confirm this location is intended.
output_dir = Path("data_splits")
output_dir.mkdir(parents=True, exist_ok=True)

for method_name in ssl_methods:
    y_train_pred[method_name].to_csv(output_dir / f"y_train_{method_name}.csv", index=False)
    y_val_pred[method_name].to_csv(output_dir / f"y_val_{method_name}.csv", index=False)
    y_test_pred[method_name].to_csv(output_dir / f"y_test_{method_name}.csv", index=False)

print("All SSL-predicted datasets saved successfully.")
