In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import glob
from tqdm import tqdm

In [3]:
def extract_upper_triangular(matrix_file):
    """
    Extract upper triangular portion from correlation matrix, excluding diagonal
    
    Parameters:
    matrix_file (str): Path to the TSV file containing correlation matrix
    
    Returns:
    numpy.ndarray: Flattened upper triangular portion
    str: participant_id extracted from filename
    """
    # Read TSV file
    matrix = pd.read_csv(matrix_file, sep='\t', header=None).values

    rows, cols = matrix.shape
    if rows != cols:
        raise ValueError(f"Matrix must be square. Got shape: {matrix.shape}")
    
    # Get participant ID from filename
    participant_id = Path(matrix_file).stem.split('_')[0].split('-')[1]
    
    # Get upper triangular indices
    rows, cols = np.triu_indices(len(matrix), k=0)
    
    # Extract upper triangular values
    upper_triangular = matrix[rows, cols]
    
    return upper_triangular, participant_id

In [4]:
def process_connectome_folder(folder_path):
    """
    Process all TSV files in a folder to create a DataFrame with correlation vectors
    
    Parameters:
    folder_path (str): Path to folder containing TSV files
    
    Returns:
    pandas.DataFrame: DataFrame with participant_id and correlation features
    """
    all_data = []
    
    # Get all TSV files in the folder
    tsv_files = glob.glob(f"{folder_path}/*.tsv")
    
    for tsv_file in tqdm(tsv_files):
        correlations, participant_id = extract_upper_triangular(tsv_file)
        
        # Create a dictionary with participant_id and correlations
        data_dict = {'participant_id': participant_id}
        # Add correlations as features
        for i, corr in enumerate(correlations):
            data_dict[f'correlation_{i}'] = corr
            
        all_data.append(data_dict)
    
    # Convert to DataFrame
    return pd.DataFrame(all_data)

In [5]:
def process_dataset(connectome_folder, metadata_file):
    """
    Process entire dataset including training and test sets
    
    Parameters:
    train_folder (str): Path to training TSV files folder
    test_folder (str): Path to test TSV files folder
    metadata_file (str): Path to metadata CSV file
    
    Returns:
    tuple: (training_df, test_df) with processed and merged data
    """
    # Read metadata
    metadata_df = pd.read_csv(metadata_file)
    
    # Process training and test folders
    connectome_df = process_connectome_folder(connectome_folder)
    
    # Merge with metadata
    final_df = pd.merge(connectome_df, metadata_df, on='participant_id', how='left')
    
    return final_df

In [6]:
if __name__ == "__main__":
    # Define paths (adjust these according to your directory structure)
    TRAIN_FOLDER = os.path.join("widsdatathon2025-university", "train_tsv", "train_tsv")
    TRAIN_METADATA_FILE = os.path.join("widsdatathon2025-university", "metadata", "training_metadata.csv")
    TEST_FOLDER = os.path.join("widsdatathon2025-university", "test_tsv", "test_tsv")
    TEST_METADATA_FILE = os.path.join("widsdatathon2025-university", "metadata", "test_metadata.csv")
    
    # Process the dataset
    train_data = process_dataset(TRAIN_FOLDER, TRAIN_METADATA_FILE)
    
    # Save processed data
    train_data.to_csv("processed_train_data.csv", index=False)

    test_data = process_dataset(TEST_FOLDER, TEST_METADATA_FILE)

    test_data.to_csv("processed_test_data.csv", index=False)
    
    # Print basic information about the processed datasets
    print(f"Training set shape: {train_data.shape}")
    print(f"Test set shape: {test_data.shape}")

100%|██████████| 1104/1104 [00:28<00:00, 38.83it/s]
100%|██████████| 474/474 [00:14<00:00, 33.05it/s]


Training set shape: (1104, 20114)
Test set shape: (474, 20113)
