In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from imblearn import FunctionSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, cohen_kappa_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [None]:
# Read the study data from CSV into a DataFrame
df = pd.read_csv('ASD_Traits_Study_Data.csv')

# The 'ASD_traits' column is the target (y); every remaining column is a feature (X)
y = df['ASD_traits']
X = df.drop('ASD_traits', axis=1)

In [None]:
# Names of the object / category dtype columns; these are later handed to
# CatBoost as its categorical features
category_cols = list(X.select_dtypes(include=[object, 'category']).columns)

# Hold out 30% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Hyperparameter search space for CatBoost (2 x 3 x 5 = 30 combinations)
grid = dict(
    learning_rate=[0.03, 0.1],
    depth=[4, 6, 10],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)

In [None]:
# Function to handle data imbalance
def handle_imbalance(X, y, technique="none"):
    """
    Handle data imbalance using the specified technique.

    :param X: Feature matrix
    :param y: Target variable
    :param technique: Resampling technique ('oversample', 'undersample', 'smote', 'borderline-smote',
                      'adasyn', 'near-miss', 'tomek', 'smoteenn', 'smotetomek', or 'none')
    :return: Tuple ``(X_resampled, y_resampled)``. For 'none' (or ``None``) the
             inputs are returned unchanged. An unrecognised technique also
             returns the inputs unchanged, but emits a ``UserWarning`` so that
             a misspelt name cannot silently pass as a resampling method.
    """
    import warnings  # local import: keeps the function self-contained

    # Factories (not instances) so only the requested resampler is constructed.
    # Every resampler that accepts a seed is seeded for reproducibility.
    resampler_factories = {
        "oversample": lambda: RandomOverSampler(random_state=42),
        "undersample": lambda: RandomUnderSampler(random_state=42),
        "smote": lambda: SMOTE(random_state=42),
        "borderline-smote": lambda: BorderlineSMOTE(random_state=42),
        "adasyn": lambda: ADASYN(random_state=42),
        "near-miss": lambda: NearMiss(),
        "tomek": lambda: TomekLinks(),
        "smoteenn": lambda: SMOTEENN(random_state=42),
        "smotetomek": lambda: SMOTETomek(random_state=42),
    }

    # Non-string values can never match a key; skip the lookup so unhashable
    # arguments fall through to the no-resampling path instead of raising.
    factory = resampler_factories.get(technique) if isinstance(technique, str) else None

    if factory is None:
        # 'none' / None are the explicit "no resampling" requests; anything
        # else is most likely a typo, so surface it instead of hiding it.
        if technique is not None and technique != "none":
            warnings.warn(
                f"Unknown imbalance technique {technique!r}; returning data without "
                f"resampling. Valid options: {sorted(resampler_factories)} or 'none'.",
                UserWarning,
                stacklevel=2,
            )
        return X, y

    X_resampled, y_resampled = factory().fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
# Resampling strategies to compare, in the order they will be run.
# Each name is a `technique` value understood by handle_imbalance().
imbalance_methods = [
    "none",
    "oversample",
    "undersample",
    "smote",
    "borderline-smote",
    "adasyn",
    "near-miss",
    "tomek",
    "smoteenn",
    "smotetomek",
]

# One metrics dict per method is appended here by the grid-search loop
cv_results = list()

In [None]:
print("Starting grid search with all imbalance handling techniques...")

# The classifier lives in the pipeline step named "clf", so every
# hyperparameter in `grid` has to be addressed as "clf__<name>".
pipeline_grid = {f"clf__{name}": values for name, values in grid.items()}

for method in imbalance_methods:
    print(f"\nHandling imbalance using: {method}")

    # Step 1: Wrap the resampling in a sampler step instead of resampling the
    # whole training set up front. Resampling before cross-validation lets
    # duplicated / synthetic minority samples land in the validation folds,
    # which inflates the CV score and biases the hyperparameter choice.
    # Inside an imblearn Pipeline the sampler runs only when fitting, i.e. on
    # each training fold, and is skipped at prediction time.
    # validate=False hands the DataFrame through untouched so CatBoost can
    # still resolve `category_cols` by column name.
    # NOTE(review): the distance-based samplers presumably need numeric
    # features - confirm the data has no object columns or encode them first.
    sampler = FunctionSampler(func=handle_imbalance, kw_args={"technique": method}, validate=False)

    # Step 2: Initialize CatBoostClassifier and chain it after the sampler
    model = CatBoostClassifier(cat_features=category_cols, random_state=42, verbose=0)
    pipeline = Pipeline(steps=[("resample", sampler), ("clf", model)])

    # Step 3: Perform grid search; each fold resamples its own training part only
    grid_search = GridSearchCV(estimator=pipeline, param_grid=pipeline_grid, scoring='f1', cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Step 4: Evaluate the best model on the (never resampled) test set
    best_pipeline = grid_search.best_estimator_
    best_model = best_pipeline.named_steps["clf"]  # fitted CatBoostClassifier
    y_test_pred = best_pipeline.predict(X_test)
    y_test_prob = best_pipeline.predict_proba(X_test)[:, 1]

    # Report hyperparameters under their plain CatBoost names (strip "clf__")
    reported_params = {name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()}

    # Step 5: Calculate performance metrics
    test_metrics = {
        'Imbalance Method': method,
        'Best Parameters': reported_params,
        'Accuracy': accuracy_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred),
        'Recall': recall_score(y_test, y_test_pred),
        'F1-Score': f1_score(y_test, y_test_pred),
        'ROC-AUC': roc_auc_score(y_test, y_test_prob),
        'MCC': matthews_corrcoef(y_test, y_test_pred),
        'Cohen Kappa': cohen_kappa_score(y_test, y_test_pred),
    }
    cv_results.append(test_metrics)
    print(f"Metrics for {method}: {test_metrics}")
    print("-" * 40)

In [None]:
# Tabulate the per-method metrics collected in `cv_results` (one row per
# imbalance method) and write them to disk without the index column
cv_results_path = 'imbalance_cv_results.csv'
cv_results_df = pd.DataFrame(cv_results)
cv_results_df.to_csv(cv_results_path, index=False)
print(f"\nCross-Validation Results Saved to '{cv_results_path}'.")

In [None]:
# Evaluate the best model for each method on the test set

# Tuned hyperparameters per method, as recorded by the grid-search loop.
# Looking them up here (rather than reusing whatever `best_model` the last
# grid-search iteration left behind) ensures every method is evaluated with
# its own best configuration.
best_params_by_method = {row['Imbalance Method']: row['Best Parameters'] for row in cv_results}

missing_methods = [m for m in imbalance_methods if m not in best_params_by_method]
if missing_methods:
    raise RuntimeError(
        f"No grid-search results for: {missing_methods}. Run the grid-search cell first."
    )

test_results = []
for method in imbalance_methods:
    print(f"\nEvaluating the best model from {method} on the test set...")
    best_params = best_params_by_method[method]
    X_train_resampled, y_train_resampled = handle_imbalance(X_train, y_train, technique=method)

    # Train a fresh model with this method's best hyperparameters on the
    # entire resampled training data; the test set itself is never resampled
    best_model = CatBoostClassifier(cat_features=category_cols, random_state=42, verbose=0, **best_params)
    best_model.fit(X_train_resampled, y_train_resampled)
    y_test_pred = best_model.predict(X_test)
    y_test_prob = best_model.predict_proba(X_test)[:, 1]

    # Calculate test metrics
    test_metrics = {
        'Imbalance Method': method,
        'Best Parameters': best_params,
        'Accuracy': accuracy_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred),
        'Recall': recall_score(y_test, y_test_pred),
        'F1-Score': f1_score(y_test, y_test_pred),
        'ROC-AUC': roc_auc_score(y_test, y_test_prob),
        'MCC': matthews_corrcoef(y_test, y_test_pred),
        'Cohen Kappa': cohen_kappa_score(y_test, y_test_pred),
    }
    test_results.append(test_metrics)

In [None]:
# Tabulate the per-method test-set metrics from `test_results` and write them
# to disk without the index column
test_results_path = 'imbalance_test_results.csv'
test_results_df = pd.DataFrame(test_results)
test_results_df.to_csv(test_results_path, index=False)
print(f"\nTest Set Results Saved to '{test_results_path}'.")