In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# Define base paths
BASE_PATH = "widsdatathon2025-university"
TRAIN_PATH = os.path.join(BASE_PATH, "train_tsv/train_tsv")
TEST_PATH = os.path.join(BASE_PATH, "test_tsv/test_tsv")
METADATA_PATH = os.path.join(BASE_PATH, "metadata")

def load_and_process_connectome(file_path):
    """
    Load and process a single connectome matrix from TSV file
    Returns the upper triangular elements as a flat array
    """
    # Read the 200x200 correlation matrix
    matrix = pd.read_csv(file_path, sep='\t', header=None)
    
    # Get upper triangular elements (excluding diagonal)
    upper_tri = matrix.values[np.triu_indices(200, k=1)]
    
    return upper_tri

In [2]:
def extract_subject_id(filename):
    """
    Extract subject ID from the complex filename format
    Example: sub-NDARAA075AMK_ses-HBNsiteSI_task-rest_acq-VARIANTObliquity_atlas-Schaefer2018p200n17_space-MNI152NLin6ASym_reg-36Parameter_desc-PearsonNilearn_correlations.tsv
    """
    # Extract the subject ID (everything between 'sub-' and the first '_')
    subject_id = filename.split('sub-')[1].split('_')[0]
    return subject_id

In [24]:
def process_all_data(train_folder, test_folder, metadata_path):
    """
    Process all connectome matrices and merge with metadata
    """
    # Load metadata
    metadata = pd.read_csv(metadata_path)
    
    # Process training data
    train_features = []
    train_ids = []
    
    print("Processing training data...")
    for file in os.listdir(train_folder):
        if file.endswith('.tsv'):
            participant_id = extract_subject_id(file)
            file_path = os.path.join(train_folder, file)
            try:
                features = load_and_process_connectome(file_path)
                train_features.append(features)
                train_ids.append(participant_id)
            except Exception as e:
                print(f"Error processing file {file}: {str(e)}")
    
    # Convert to DataFrame with feature names
    feature_names = [f'feature_{i}' for i in range(len(train_features[0]))]
    train_df = pd.DataFrame(train_features, columns=feature_names)
    train_df['participant_id'] = train_ids
    
    # Merge with metadata
    train_final = pd.merge(train_df, metadata, on='participant_id', how='left')
    
    # Check for any missing data after merge
    missing_subjects = train_df[~train_df['participant_id'].isin(metadata['participant_id'])]
    if not missing_subjects.empty:
        print(f"Warning: {len(missing_subjects)} subjects in training data not found in metadata")
    
    return train_final, train_df

In [26]:
def train_model(train_data):
    """
    Train a Random Forest model
    """
    # Separate features and target
    feature_cols = [col for col in train_data.columns if col.startswith('feature_')]
    
    X = train_data[feature_cols]
    y = train_data['age']
    
    print(X)
    
    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(X_train)
    print(X_val)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    print(X_train_scaled)
    print(X_val_scaled)
    
    # # Train model
    # print("Training Random Forest model...")
    # rf_model1 = RandomForestRegressor(n_estimators=100, random_state=42)
    # rf_model1.fit(X_train_scaled, y_train)
    
    # # Make validation predictions
    # val_predictions = rf_model1.predict(X_val_scaled)
    
    # # Calculate metrics
    # mse = mean_squared_error(y_val, val_predictions)
    # r2 = r2_score(y_val, val_predictions)
    
    # print(f"\nValidation Metrics:")
    # print(f"Mean Squared Error: {mse:.2f}")
    # print(f"R² Score: {r2:.2f}")
    
    # return rf_model1, scaler

In [27]:
if __name__ == "__main__":
    # Process training data
    train_metadata_path = os.path.join(METADATA_PATH, "training_metadata.csv")
    print(f"Loading data from:\nTrain: {TRAIN_PATH}\nMetadata: {train_metadata_path}")
    
    # Process data
    train_data, dummy_variable = process_all_data(TRAIN_PATH, TEST_PATH, train_metadata_path)
    
    # Print sample of processed data
    print("\nSample of processed training data:")
    print(train_data.head())
    print("\nDataset shape:", train_data.shape)
    
    # Train model
    model, scaler = train_model(train_data)
    
    # Basic visualization of age distribution
    # plt.figure(figsize=(10, 6))
    # sns.histplot(data=train_data, x='age', bins=20)
    # plt.title('Distribution of Ages in Training Data')
    # plt.xlabel('Age')
    # plt.ylabel('Count')
    # plt.show()
    
    # # Print feature importance summary
    # feature_importance = pd.DataFrame({
    #     'feature': [f'feature_{i}' for i in range(len(model.feature_importances_))],
    #     'importance': model.feature_importances_
    # }).sort_values('importance', ascending=False)
    
    # print("\nTop 10 most important features:")
    # print(feature_importance.head(10))

Loading data from:
Train: widsdatathon2025-university\train_tsv/train_tsv
Metadata: widsdatathon2025-university\metadata\training_metadata.csv
Processing training data...

Sample of processed training data:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.542132   0.524147   0.446293   0.356702   0.739596   0.417589   
1   0.144605   0.680939   0.742041   0.499410   0.659166   0.602953   
2   0.081803   0.489407   0.254730   0.672230   0.676404   0.568564   
3   0.269198   0.575656   0.116223   0.243510   0.469461   0.493588   
4   0.280142   0.617829   0.715562   0.532753   0.671096   0.708990   

   feature_6  feature_7  feature_8  feature_9  ...    bmi  \
0   0.463915  -0.059056   0.144390   0.092703  ...  30.72   
1   0.500101   0.169859   0.350741   0.137694  ...  16.51   
2   0.490022   0.011483   0.494971   0.282925  ...  21.33   
3   0.704042   0.393960   0.549350   0.362526  ...  26.90   
4   0.543851   0.258549   0.255543   0.116425  ...  15.77   


TypeError: cannot unpack non-iterable NoneType object

In [11]:
print(feature_importance.head(1000))

             feature  importance
12830  feature_12830    0.029647
12951  feature_12951    0.014389
13638  feature_13638    0.013442
18868  feature_18868    0.009424
8456    feature_8456    0.007965
...              ...         ...
19816  feature_19816    0.000217
302      feature_302    0.000217
1508    feature_1508    0.000217
6987    feature_6987    0.000217
19527  feature_19527    0.000216

[1000 rows x 2 columns]


In [30]:
test_metadata_path = os.path.join(METADATA_PATH, "test_metadata.csv")
print(f"Loading data from:\nTest: {TEST_PATH}\nMetadata: {test_metadata_path}")

# Process data
test_data, test_without_metadata = process_all_data(TEST_PATH, TEST_PATH, test_metadata_path)
feature_cols = [col for col in test_without_metadata.columns if col.startswith('feature_')]
test_data_final = test_without_metadata[feature_cols]

# Print sample of processed data
print("\nSample of processed test data:")
print(test_without_metadata.head())
print("\nDataset shape:", test_data_final.shape)
print("\nDataset shape:", test_without_metadata.shape)

Loading data from:
Test: widsdatathon2025-university\test_tsv/test_tsv
Metadata: widsdatathon2025-university\metadata\test_metadata.csv
Processing training data...

Sample of processed test data:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.524855   0.484972   0.492336   0.517045   0.468494   0.553764   
1  -0.303114   0.760065   0.444488   0.705790   0.827377   0.739177   
2   0.228731   0.704443   0.517845   0.388577   0.495863   0.561603   
3   0.168970   0.709427   0.633803   0.533424   0.449741   0.528832   
4   0.071224   0.697654   0.685984   0.521219   0.621353   0.546660   

   feature_6  feature_7  feature_8  feature_9  ...  feature_19891  \
0   0.555734   0.300393   0.395799   0.263726  ...      -0.034351   
1   0.685885   0.382692   0.629805   0.256382  ...      -0.321492   
2   0.466160   0.134430   0.478563   0.306227  ...       0.325868   
3   0.511939   0.275791   0.495998  -0.017472  ...      -0.167033   
4   0.446338   0.464537   0.4813

In [33]:
test_val_scaled = scaler.transform(test_data_final)

predictions = model.predict(test_val_scaled)

print(predictions)

submission = pd.DataFrame({
        'participant_id': test_data['participant_id'],
        'age': predictions
    })
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created!")
    

[ 8.8205 11.7368  9.2631  9.5679 11.2607  9.2932 12.431  12.6122 11.1979
 10.5556  9.0224 10.9309 13.3466 10.8989 10.5663  9.0259 10.5069 11.5681
  8.9958  9.9824 12.6874 10.714  10.1925 10.1536 10.9152 12.2551 11.8032
 11.1382 11.4149 11.4577 11.4135 12.6363 14.3409 10.3816 11.7148 13.4459
 10.3505  9.0738  9.872  11.0829 10.3695 12.9402 10.6944 12.23   11.4645
 12.0417 11.7404 10.9123 13.3144 14.3862 10.2911 10.461  12.7015 11.326
 12.6212 14.2506 11.8109 10.1    12.7449 11.7767 12.5528 11.9632 12.8074
 13.4204 12.1519 11.2426 13.4902 11.5951 11.7075  9.8124 10.5314 12.6604
 10.0044  9.767   9.5817 12.2085 10.8919 14.2702 10.758  13.2993 11.9148
 11.3889  9.9011  9.7755  9.2363 12.0874 10.7129 10.6985 13.1854 10.7537
 10.6067 12.2265 12.924  12.0683 12.6653 10.201  11.8186 10.6409  9.4403
 12.2265 10.6412 10.4005 11.478  10.6616 11.0802 12.5067 11.8907 13.2417
  8.8654 11.5826  9.8816  9.7878 14.049  11.1997 14.2921 10.5248 12.9165
 13.365  11.9424 11.7311 11.9935 11.2721  9.7676 12.