In [None]:
# Models to try based on the results
# 1. Random Forest Classifier
# 2. LightGBM Classifier
# 3. XGBoost Classifier
# 4. LinearDiscriminantAnalysis
# 5. KNN?

import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, make_scorer, precision_score, recall_score, f1_score

# Models
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score, f1_score


In [None]:
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted, validate_data

class BaselineModel(BaseEstimator):
    """Majority-class baseline that always predicts the most frequent training label."""

    def fit(self, X, y):
        """Remember the most frequent label in ``y`` (``X`` is not used) and return ``self``."""
        # Flatten first so column vectors and single-column frames are accepted
        flat_labels = pd.Series(np.ravel(y))
        # mode() may return several values on a tie; the first one is kept
        self._majority_class = flat_labels.mode()[0]
        self._is_fitted = True
        return self

    def predict(self, X):
        """Return the stored majority label once for every row of ``X``."""
        check_is_fitted(self, '_is_fitted')
        n_rows = X.shape[0]
        return np.full(shape=(n_rows,), fill_value=self._majority_class)
   

In [None]:


def _build_classifier_search_spaces(random_state):
    """
    Build the candidate estimators and their hyperparameter search spaces.

    Returns a dict mapping a display name to
    {"model": unfitted estimator, "params": value lists sampled by RandomizedSearchCV}.
    """
    return {
        "RandomForest": {
            "model": RandomForestClassifier(random_state=random_state),
            "params": {
                'n_estimators': [100, 200, 500],
                'max_depth': [None, 3, 5, 7],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2],
            }
        },
        "GradientBoosting": {
            "model": GradientBoostingClassifier(random_state=random_state),
            "params": {
                'n_estimators': [100, 200, 500],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
            }
        },
        "XGBoost": {
            "model": XGBClassifier(
                random_state=random_state,
                eval_metric='mlogloss',
            ),
            "params": {
                'max_depth': [3, 5, 7],
                'learning_rate': [0.1, 0.2],
                'n_estimators': [100, 200, 500],
            }
        },
        "$k$NN": {
            "model": KNeighborsClassifier(),
            "params": {
                'n_neighbors': [3, 5, 10, 20],
                'weights': ['uniform', 'distance'],
            }
        },
        "LGBM": {
            "model": LGBMClassifier(random_state=random_state, verbose=-1),
            "params": {
                'num_leaves': [30, 50, 100, 150],
                # `reg_alpha` and `lambda_l1` are aliases of the same LightGBM
                # L1-regularisation parameter; tuning both made one of them a
                # no-op, so their value lists are merged under a single key.
                'reg_alpha': [0, 0.1, 0.5, 1],
            }
        },
        "Decision Tree": {
            "model": DecisionTreeClassifier(random_state=random_state),
            "params": {
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5],
                'criterion': ['gini', 'entropy'],
            }
        },
        "MLPClassifier": {
            "model": MLPClassifier(random_state=random_state, max_iter=1000),
            "params": {
                'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
                'activation': ['relu', 'tanh', 'logistic'],
                'alpha': [0.0001, 0.001, 0.01],
                'learning_rate': ['constant', 'adaptive'],
            }
        },
        "Baseline": {
            "model": BaselineModel(),
            "params": {}
        },
    }


def _select_cv_splitter(X_train, y_train, random_state, loo_threshold=500):
    """Pick leave-one-out for small training sets, otherwise stratified K-fold (2 to 5 splits)."""
    if X_train.shape[0] < loo_threshold:
        return LeaveOneOut()  # Use LOO for very small datasets

    # Count samples in the smallest class; works for a Series or a single-column DataFrame
    min_class_samples = int(pd.Series(np.ravel(y_train)).value_counts().min())

    # At most 5 splits, no more than the smallest class allows, and never fewer
    # than the 2 splits StratifiedKFold requires.
    n_splits = max(2, min(5, min_class_samples))
    print(f"Using {n_splits} splits due to small class size ({min_class_samples} samples in smallest class)")
    return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)


def train_evaluate_classifiers(X_train, y_train, X_test, y_test, random_state=42, n_iter=10):
    """
    Tune and evaluate multiple classifiers with a cross-validated randomized search.

    Every classifier is tuned with RandomizedSearchCV, refit on the full training
    data using the candidate with the best cross-validated macro F1, and then
    scored on the test set. Cross-validation is leave-one-out when the training
    set has fewer than 500 rows, otherwise shuffled stratified K-fold with up to
    5 splits.

    Parameters:
    -----------
    X_train : pd.DataFrame
        Training features
    y_train : pd.Series or single-column pd.DataFrame
        Training labels (flattened to 1D internally)
    X_test : pd.DataFrame
        Test features
    y_test : pd.Series or single-column pd.DataFrame
        Test labels (flattened to 1D internally)
    random_state : int
        Random seed for reproducibility
    n_iter : int
        Number of hyperparameter candidates sampled per classifier

    Returns:
    --------
    dict : Dictionary keyed by classifier name. Each value holds the CV metrics
        of the selected candidate (cv_accuracy, cv_accuracy_std, cv_precision,
        cv_recall, cv_f1), the test metrics (test_accuracy, test_precision,
        test_recall, test_f1), training_time in seconds, the refit fitted_model
        and its best_params.
    """
    classifiers = _build_classifier_search_spaces(random_state)

    # Macro-averaged scorers; zero_division=0 scores a class that is never
    # predicted as 0 instead of emitting an undefined-metric warning per fold.
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
        'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
        'f1_macro': make_scorer(f1_score, average='macro', zero_division=0)
    }

    # Flatten the labels once so every metric sees the same 1D arrays
    y_train_1d = np.ravel(y_train)
    y_test_1d = np.ravel(y_test)

    # The splitter depends only on the data, so it is built once and shared
    cv = _select_cv_splitter(X_train, y_train, random_state)

    results = {}

    for name, clf_dict in classifiers.items():
        print(f"Training {name} classifier...")
        # Start timing
        start_time = time()

        # Perform cross-validation with random search
        search = RandomizedSearchCV(clf_dict['model'],
                                    clf_dict['params'],
                                    cv=cv,
                                    scoring=scoring,
                                    refit='f1_macro',
                                    n_iter=n_iter,
                                    n_jobs=-1,
                                    random_state=random_state,
                                    verbose=4)
        search.fit(X_train, y_train_1d)

        # best_estimator_ has already been refit on the full training data
        best_clf = search.best_estimator_
        test_pred = best_clf.predict(X_test)

        test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
            y_test_1d, test_pred, average='macro', zero_division=0
        )
        test_accuracy = accuracy_score(y_test_1d, test_pred)

        # Report the CV metrics of the selected candidate, not the average over
        # every sampled candidate, so they describe the model in `fitted_model`.
        best = search.best_index_
        cv_table = search.cv_results_

        # Store results
        results[name] = {
            'cv_accuracy': cv_table['mean_test_accuracy'][best],
            'cv_accuracy_std': cv_table['std_test_accuracy'][best],
            'cv_precision': cv_table['mean_test_precision_macro'][best],
            'cv_recall': cv_table['mean_test_recall_macro'][best],
            'cv_f1': cv_table['mean_test_f1_macro'][best],
            'test_accuracy': test_accuracy,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'test_f1': test_f1,
            'training_time': time() - start_time,
            'fitted_model': best_clf,
            'best_params': search.best_params_
        }

        print(f"DONE: {results[name]}")

    return results

# Collected results of every run, keyed by "<approach>_<dataset>"
RESULTS = {}

# Preprocessing approaches and the dataset variants stored under each of them
approaches = {
    'Initial': ['real_data', 'synth_data'],
    'Extra': ['real_data', 'real_pseudoreal_data', 'real_pseudoreal_synth_data'],
}

for approach, datasets in approaches.items():
    for dataset in datasets:
        # The four preprocessed CSV splits share one directory per dataset
        data_dir = f"./datasets/preprocessed/{approach}/{dataset}"
        X_train = pd.read_csv(data_dir + "/X_train.csv")
        y_train = pd.read_csv(data_dir + "/y_train.csv")
        X_test = pd.read_csv(data_dir + "/X_test.csv")
        y_test = pd.read_csv(data_dir + "/y_test.csv")

        print(f"\nEvaluating on {approach}_{dataset} dataset:")

        # Tune and evaluate every classifier, then keep the outcome for later cells
        results = train_evaluate_classifiers(X_train, y_train, X_test, y_test)
        RESULTS[f"{approach}_{dataset}"] = results

        # Short per-classifier summary
        for clf_name, metrics in results.items():
            print(
                f"\n{clf_name}:",
                f"CV Accuracy: {metrics['cv_accuracy']:.3f} (±{metrics['cv_accuracy_std']:.3f})",
                f"Test Accuracy: {metrics['test_accuracy']:.3f}",
                f"Test F1-Score: {metrics['test_f1']:.3f}",
                f"Training Time: {metrics['training_time']:.2f} seconds",
                sep="\n",
            )

        # One row per classifier, one column per stored metric
        results_df = pd.DataFrame.from_dict(results, orient='index')
        print(results_df)



In [None]:
# Persist the collected results to disk as a pickle file
import pickle

with open("tuned-results.pkl", "wb") as results_file:
    pickle.dump(RESULTS, results_file)

In [None]:
# Print the full metric summary of every classifier, grouped by dataset
for dataset_name, dataset_results in RESULTS.items():
    print(f"Results for {dataset_name.capitalize()} Dataset:\n")
    for clf_name, metrics in dataset_results.items():
        summary_lines = (
            f"Classifier: {clf_name}",
            f"  CV Accuracy: {metrics['cv_accuracy']:.3f} (±{metrics['cv_accuracy_std']:.3f})",
            f"  CV Precision: {metrics['cv_precision']:.3f}",
            f"  CV Recall: {metrics['cv_recall']:.3f}",
            f"  CV F1-Score: {metrics['cv_f1']:.3f}",
            f"  Test Accuracy: {metrics['test_accuracy']:.3f}",
            f"  Test F1-Score: {metrics['test_f1']:.3f}",
            f"  Training Time: {metrics['training_time']:.2f} seconds",
            f"  Best Parameters: {metrics['best_params']}",
        )
        # The doubled newline at the end leaves a blank line between classifiers
        print(*summary_lines, sep="\n", end="\n\n")

In [None]:
# Build a Markdown report with one results table per dataset.
# The pieces are collected in a list and joined once at the end.
report_parts = [
    "# Hyperparameter-Tuned Classifier Evaluation Results\n\n",
    # The introduction describes the tuned runs reported below; the previous
    # wording ("baseline classifiers ... default parameters") contradicted the title.
    """
> The following tables show the results of the hyperparameter-tuned classifiers on each dataset.
> Hyperparameters were selected with a cross-validated randomized search.
> The results are displayed in descending order of test F1-Score.

""",
]

for dataset_name, dataset_results in RESULTS.items():
    report_parts.append(f"## {dataset_name.capitalize()} Dataset\n\n")

    # Create table header
    report_parts.append("| Classifier | CV Accuracy | Test Accuracy | Test F1-Score | Training Time (s) | Efficiency (F1/s) |\n")
    report_parts.append("|------------|-------------|---------------|---------------|------------------| ------------------ |\n")

    # Best test F1-score first
    ranked = sorted(dataset_results.items(), key=lambda item: item[1]['test_f1'], reverse=True)

    # Add each classifier's results as a row
    for clf_name, metrics in ranked:
        elapsed = metrics['training_time']
        # Efficiency is F1 per second; a zero elapsed time yields NaN rather than a division error
        f1_per_second = metrics['test_f1'] / elapsed if elapsed > 0 else float('nan')
        report_parts.append(
            f"| {clf_name} | {metrics['cv_accuracy']:.3f} (±{metrics['cv_accuracy_std']:.3f}) | "
            f"{metrics['test_accuracy']:.3f} | {metrics['test_f1']:.3f} | {elapsed:.2f} | {f1_per_second:.2f} |\n"
        )
    report_parts.append("\n")  # Add space between dataset tables

markdown_text = "".join(report_parts)

# Write to file; UTF-8 is set explicitly because the table contains the non-ASCII "±" sign
with open('tuning_results.md', 'w', encoding='utf-8') as f:
    f.write(markdown_text)

print("Results have been written to tuning_results.md")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the test labels
# NOTE(review): DATASETS is not defined in any cell visible here — confirm it exists before this cell runs
y_test = DATASETS["real+synthetic"]["y"]["test"]

# Initialise the LabelEncoder and fit it on the target class names
# NOTE(review): LabelEncoder keeps its classes in sorted order, so the codes do not follow
# the order listed here — presumably y_test was encoded the same way; TODO confirm
class_names = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]
label_encoder = LabelEncoder().fit(class_names)

# Decode the target classes of the test set back to their names
decoded_labels = label_encoder.inverse_transform(y_test)
y_test_named = pd.Series(decoded_labels, name="Klasse")

# Plot the class counts, most frequent class first
class_order = y_test_named.value_counts().index
plt.figure(figsize=(10, 5))
sns.countplot(x=y_test_named, order=class_order)
plt.title("Verteilung der Zielklassen im Testset")
plt.xlabel("Zielklasse")
plt.ylabel("Anzahl")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


