In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
# Define paths
BASE_PATH = "widsdatathon2025-university"
TRAIN_PATH = os.path.join(BASE_PATH, "train_tsv/train_tsv")
TEST_PATH = os.path.join(BASE_PATH, "test_tsv/test_tsv")
METADATA_PATH = os.path.join(BASE_PATH, "metadata")
DATA_PATH = "data"  # New folder for processed data

# Create data directory if it doesn't exist
os.makedirs(DATA_PATH, exist_ok=True)

In [2]:
def extract_subject_id(filename):
    """
    Extract subject ID from the complex filename format
    Example: sub-NDARAA075AMK_ses-HBNsiteSI_task-rest_acq-VARIANTObliquity_atlas-Schaefer2018p200n17_space-MNI152NLin6ASym_reg-36Parameter_desc-PearsonNilearn_correlations.tsv
    """
    # Extract the subject ID (everything between 'sub-' and the first '_')
    subject_id = filename.split('sub-')[1].split('_')[0]
    return subject_id

In [3]:
def preprocess_data(data, encoders=None, is_training=True):
    """
    Comprehensive preprocessing including categorical variables
    """
    if encoders is None:
        encoders = {}
    
    # Handle categorical columns
    categorical_cols = ['sex', 'study_site', 'ethnicity', 'race', 'handedness', 
                       'parent_1_education', 'parent_2_education']
    
    for col in categorical_cols:
        if col in data.columns:
            if is_training:
                # Create new encoder for training data
                encoders[col] = LabelEncoder()
                data[col] = encoders[col].fit_transform(data[col].fillna('missing'))
            else:
                # Use existing encoder for test data
                data[col] = encoders[col].transform(data[col].fillna('missing'))
    
    # Handle numerical missing values
    numerical_cols = ['bmi', 'p_factor_fs', 'internalizing_fs', 'externalizing_fs', 'attention_fs']
    for col in numerical_cols:
        if col in data.columns:
            data[col] = data[col].fillna(data[col].mean())
    
    return data, encoders

In [4]:
def load_and_process_connectome(file_path):
    """Load and process a single connectome matrix"""
    matrix = pd.read_csv(file_path, sep='\t', header=None)
    upper_tri = matrix.values[np.triu_indices(200, k=1)]
    return upper_tri

In [5]:
def process_dataset(folder_path, metadata_path=None):
    """Process dataset and save to file"""
    features = []
    subject_ids = []
    
    print("Processing connectome matrices...")
    for file in os.listdir(folder_path):
        if file.endswith('.tsv'):
            subject_id = extract_subject_id(file)
            file_path = os.path.join(folder_path, file)
            try:
                connectome_features = load_and_process_connectome(file_path)
                features.append(connectome_features)
                subject_ids.append(subject_id)
            except Exception as e:
                print(f"Error processing {file}: {str(e)}")
    
    # Create DataFrame
    feature_names = [f'feature_{i}' for i in range(len(features[0]))]
    df = pd.DataFrame(features, columns=feature_names)
    df['participant_id'] = subject_ids
    
    # Merge with metadata if provided
    if metadata_path:
        metadata = pd.read_csv(metadata_path)
        df = pd.merge(df, metadata, on='participant_id', how='left')
    
    return df

In [6]:

def train_multiple_models(X_train, X_val, y_train, y_val):
    """Train and evaluate multiple models"""
    models = {
        # 'RandomForest': {
        #     'model': RandomForestRegressor(random_state=42),
        #     'params': {
        #         'n_estimators': [1000, 200],
        #         'max_depth': [None, 10, 20],
        #         'min_samples_split': [2, 5]
        #     }
        # },
        # 'GradientBoosting': {
        #     'model': GradientBoostingRegressor(random_state=42),
        #     'params': {
        #         'n_estimators': [1000, 200],
        #         'learning_rate': [0.01, 0.1],
        #         'max_depth': [3, 5]
        #     }
        # },
        # 'XGBoost': {
        #     'model': XGBRegressor(random_state=42),
        #     'params': {
        #         'n_estimators': [1000, 200],
        #         'learning_rate': [0.01, 0.1],
        #         'max_depth': [3, 5]
        #     }
        # },
        'Lasso': {
            'model': LassoCV(random_state=42),
            'params': {}
        },
        'Ridge': {
            'model': RidgeCV(),
            'params': {}
        }
    }
    
    results = {}
    
    for name, model_info in models.items():
        print(f"\nTraining {name}...")
        model = model_info['model']
        params = model_info['params']
        
        if params:  # If there are parameters to tune
            grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            print(f"Best parameters for {name}: {grid_search.best_params_}")
        else:
            best_model = model
            best_model.fit(X_train, y_train)
        
        # Make predictions
        train_pred = best_model.predict(X_train)
        val_pred = best_model.predict(X_val)
        
        # Calculate metrics
        results[name] = {
            'model': best_model,
            'train_r2': r2_score(y_train, train_pred),
            'val_r2': r2_score(y_val, val_pred),
            'train_mse': mean_squared_error(y_train, train_pred),
            'val_mse': mean_squared_error(y_val, val_pred),
            'train_mae': mean_absolute_error(y_train, train_pred),
            'val_mae': mean_absolute_error(y_val, val_pred)
        }
        
        print(f"{name} Results:")
        print(f"Validation R²: {results[name]['val_r2']:.4f}")
        print(f"Validation MSE: {results[name]['val_mse']:.4f}")
        print(f"Validation MAE: {results[name]['val_mae']:.4f}")
    
    return results

In [7]:
def plot_model_comparison(results):
    """Plot comparison of model performances"""
    metrics_df = pd.DataFrame({
        'Model': list(results.keys()),
        'Validation R²': [results[model]['val_r2'] for model in results],
        'Validation MSE': [results[model]['val_mse'] for model in results]
    })
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    sns.barplot(data=metrics_df, x='Model', y='Validation R²', ax=ax1)
    ax1.set_title('Model Comparison - R²')
    ax1.tick_params(axis='x', rotation=45)
    
    sns.barplot(data=metrics_df, x='Model', y='Validation MSE', ax=ax2)
    ax2.set_title('Model Comparison - MSE')
    ax2.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.savefig(os.path.join(DATA_PATH, 'model_comparison.png'))
    plt.close()

In [8]:
if __name__ == "__main__":
    # #TRAIN DATA
    # Process training data
    print("Processing training data...")
    train_metadata_path = os.path.join(METADATA_PATH, "training_metadata.csv")
    train_data = process_dataset(TRAIN_PATH, train_metadata_path)
    
    # Save processed data
    train_data.to_csv(os.path.join(DATA_PATH, 'processed_train_data.csv'), index=False)
    print(f"Processed data saved to {DATA_PATH}")
    
    # #TEST DATA
    # Process testing data
    test_metadata_path = os.path.join(METADATA_PATH, "test_metadata.csv")
    test_data = process_dataset(TEST_PATH, test_metadata_path)
    
    # Save processed data
    test_data.to_csv(os.path.join(DATA_PATH, 'processed_test_data.csv'), index=False)
    print(f"Processed data saved to {DATA_PATH}")
    
    # Prepare data for modeling
    feature_cols = [col for col in train_data.columns if col not in ['participant_id', 'age']]
    X = train_data[feature_cols]
    y = train_data['age']
    
    # Split data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Preprocess data
    X_train, encoders = preprocess_data(X_train, is_training=True)
    X_val, _ = preprocess_data(X_val, encoders=encoders, is_training=False)
    
    # Train and evaluate multiple models
    results = train_multiple_models(X_train, X_val, y_train, y_val)
    
    # Plot model comparison
    plot_model_comparison(results)
    
    # Save best model
    best_model_name = max(results.keys(), key=lambda k: results[k]['val_r2'])
    best_model = results[best_model_name]['model']
    joblib.dump(best_model, os.path.join(DATA_PATH, 'best_model.joblib'))
    joblib.dump(encoders, os.path.join(DATA_PATH, 'encoders.joblib'))
    
    print(f"\nBest performing model: {best_model_name}")
    print(f"Best model saved to {DATA_PATH}/best_model.joblib")
    
    # Feature importance for tree-based models
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': feature_cols,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        plt.figure(figsize=(12, 6))
        sns.barplot(data=importance_df.head(20), x='importance', y='feature')
        plt.title('Top 20 Most Important Features')
        plt.tight_layout()
        plt.savefig(os.path.join(DATA_PATH, 'feature_importance.png'))
        plt.close()

Processing training data...
Processing connectome matrices...
Processed data saved to data
Processing connectome matrices...
Processed data saved to data

Training RandomForest...
Best parameters for RandomForest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 1000}
RandomForest Results:
Validation R²: 0.4806
Validation MSE: 4.9538
Validation MAE: 1.8486

Training GradientBoosting...
Best parameters for GradientBoosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}
GradientBoosting Results:
Validation R²: 0.5507
Validation MSE: 4.2853
Validation MAE: 1.7142

Training XGBoost...


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [9]:
print(results)

NameError: name 'results' is not defined