In [1]:
from mmit_functions import mmit
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import os

In [2]:
# Root folder of the benchmark data; the name of every directory directly
# inside it is collected as a dataset identifier.
folder_path = '../../data'
datasets = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(folder_path, entry)),
        os.listdir(folder_path),
    )
)

In [None]:
# Sequential benchmark run. For every dataset and every predefined test fold:
#   1. split the training rows with an inner 3-fold cross-validation,
#   2. for each inner split keep the tree depth with the lowest validation hinge error,
#   3. average the test predictions of the selected trees and save them as CSV.
output_dir = 'predictions'
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folder exists before any (expensive) training starts.
os.makedirs(output_dir, exist_ok=True)

# Loop invariants. An integer random_state makes every KFold.split() call
# reproducible, so a single splitter can be shared by all folds and datasets.
max_depths = [5, 10, 15, 20, 25, 30]
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for dataset in datasets:
    # Read from the same root folder that `datasets` was listed from.
    dataset_dir = os.path.join(folder_path, dataset)
    folds_df    = pd.read_csv(os.path.join(dataset_dir, 'folds.csv'))
    features_df = pd.read_csv(os.path.join(dataset_dir, 'features.csv')).astype(np.float32)
    target_df   = pd.read_csv(os.path.join(dataset_dir, 'targets.csv')).astype(np.float32)

    # Outer loop: each distinct value of the 'fold' column is held out once as test set.
    for test_fold in sorted(np.unique(folds_df['fold'])):
        train_indices = folds_df[folds_df['fold'] != test_fold].index
        test_indices = folds_df[folds_df['fold'] == test_fold].index

        # Select rows by index label; this relies on the three CSV files
        # sharing the same default row index.
        X_train = features_df.loc[train_indices].values
        X_test = features_df.loc[test_indices].values
        y_train = target_df.loc[train_indices].values

        # Inner loop: one selected model per subtrain/validation split.
        best_models = []
        for train_idx, val_idx in kf.split(X_train):
            X_subtrain, X_val = X_train[train_idx], X_train[val_idx]
            y_subtrain, y_val = y_train[train_idx], y_train[val_idx]

            best_model = None
            best_hinge_error = float('inf')

            for max_depth in max_depths:
                tree = mmit(max_depth=max_depth)
                tree.fit(X_subtrain, y_subtrain)

                y_val_pred = tree.predict(X_val)

                # Shift both interval limits inwards by the model's margin before scoring.
                # Presumably column 0 holds the lower and column 1 the upper
                # limit of the target interval -- confirm against targets.csv.
                y_val_low = y_val[:, 0] + tree.margin_length
                y_val_up = y_val[:, 1] - tree.margin_length
                hinge_error = np.sum(tree.hinge_error(y_val_pred, y_val_low, y_val_up))

                # Strict '<' keeps the shallowest depth when several depths tie.
                if hinge_error < best_hinge_error:
                    best_hinge_error = hinge_error
                    best_model = tree

            if best_model is None:
                # No candidate scored below +inf (for example every sum was NaN).
                # Fail here with context instead of later on `None.predict`.
                raise RuntimeError(
                    f"No model could be selected for dataset {dataset!r}, "
                    f"test fold {test_fold!r}: all validation hinge errors were NaN or infinite."
                )
            best_models.append(best_model)

        # Ensemble: element-wise mean of the test predictions of the selected models.
        target_mat_pred = np.mean([model.predict(X_test) for model in best_models], axis=0)
        prediction = pd.DataFrame({'pred': target_mat_pred})
        prediction.to_csv(os.path.join(output_dir, f"{dataset}.{test_fold}.csv"), index=False)

In [None]:
from mmit_functions import mmit
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import os
from joblib import Parallel, delayed  # For parallel processing

# Data root; only entries that are directories are kept as dataset names.
folder_path = '../../data'
entries = os.listdir(folder_path)
datasets = [entry for entry in entries if os.path.isdir(os.path.join(folder_path, entry))]

def _select_best_tree(X_subtrain, y_subtrain, X_val, y_val, max_depths):
    """Fit one tree per candidate depth and return the one with the lowest validation hinge error.

    Parameters
    ----------
    X_subtrain, y_subtrain : arrays used to fit every candidate tree.
    X_val, y_val : arrays used to score every candidate tree. `y_val` is
        indexed as `y_val[:, 0]` and `y_val[:, 1]`; presumably these are the
        lower and upper limits of the target interval -- confirm against targets.csv.
    max_depths : iterable of values passed as `max_depth` to `mmit`.

    Returns
    -------
    The fitted tree with the smallest summed hinge error. Ties keep the
    candidate that comes first in `max_depths`.

    Raises
    ------
    RuntimeError
        If no candidate has a summed hinge error below +inf (e.g. all NaN).
    """
    best_model = None
    best_hinge_error = float('inf')

    for max_depth in max_depths:
        tree = mmit(max_depth=max_depth)
        tree.fit(X_subtrain, y_subtrain)

        y_val_pred = tree.predict(X_val)

        # Shift both interval limits inwards by the model's margin before scoring.
        y_val_low = y_val[:, 0] + tree.margin_length
        y_val_up = y_val[:, 1] - tree.margin_length
        hinge_error = np.sum(tree.hinge_error(y_val_pred, y_val_low, y_val_up))

        if hinge_error < best_hinge_error:
            best_hinge_error = hinge_error
            best_model = tree

    if best_model is None:
        # Fail with context here instead of later on `None.predict`.
        raise RuntimeError(
            "No model could be selected: all validation hinge errors were NaN or infinite."
        )
    return best_model


def process_dataset(dataset, data_dir='../../data', output_dir='predictions'):
    """Run the nested cross-validation for one dataset and write its test predictions.

    For every distinct value of the 'fold' column in folds.csv, the rows of
    that fold are held out as test set. The remaining rows are split by a
    3-fold CV; for each split the best tree depth is selected on the
    validation part. The test predictions of the three selected trees are
    averaged and written to `<output_dir>/<dataset>.<fold>.csv`.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-folder inside `data_dir`.
    data_dir : str, optional
        Root folder containing one sub-folder per dataset with folds.csv,
        features.csv and targets.csv.
    output_dir : str, optional
        Folder receiving the prediction files; created if it does not exist.

    Returns
    -------
    None
    """
    # DataFrame.to_csv does not create missing parent folders. exist_ok=True
    # also makes this safe when several datasets are processed concurrently.
    os.makedirs(output_dir, exist_ok=True)

    dataset_dir = os.path.join(data_dir, dataset)
    folds_df = pd.read_csv(os.path.join(dataset_dir, 'folds.csv'))
    features_df = pd.read_csv(os.path.join(dataset_dir, 'features.csv')).astype(np.float32)
    target_df = pd.read_csv(os.path.join(dataset_dir, 'targets.csv')).astype(np.float32)

    max_depths = [5, 10, 15, 20, 25, 30]
    # Integer random_state: every split() call is reproducible, so the
    # splitter can be shared by all test folds.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)

    for test_fold in sorted(np.unique(folds_df['fold'])):
        train_indices = folds_df[folds_df['fold'] != test_fold].index
        test_indices = folds_df[folds_df['fold'] == test_fold].index

        # Select rows by index label; this relies on the three CSV files
        # sharing the same default row index.
        X_train = features_df.loc[train_indices].values
        X_test = features_df.loc[test_indices].values
        y_train = target_df.loc[train_indices].values

        # One model-selection job per inner split, run in parallel.
        best_models = Parallel(n_jobs=-1)(
            delayed(_select_best_tree)(
                X_train[train_idx], y_train[train_idx],
                X_train[val_idx], y_train[val_idx],
                max_depths,
            )
            for train_idx, val_idx in kf.split(X_train)
        )

        # Ensemble: element-wise mean of the test predictions of the selected models.
        target_mat_pred = np.mean([model.predict(X_test) for model in best_models], axis=0)
        prediction = pd.DataFrame({'pred': target_mat_pred})
        prediction.to_csv(os.path.join(output_dir, f"{dataset}.{test_fold}.csv"), index=False)

# Build one deferred process_dataset call per dataset and hand them all to
# joblib (n_jobs=-1).
dataset_jobs = (delayed(process_dataset)(dataset) for dataset in datasets)
Parallel(n_jobs=-1)(dataset_jobs)