In [39]:
import os
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
# Load the training labels workbook; the preview below shows one row per
# participant_id with the ADHD_Outcome and Sex_F columns.
solutions_xlsx = "../../data/raw/widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx"
training_solutions = pd.read_excel(io=solutions_xlsx)
training_solutions.head(n=5)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


In [41]:
# Load the categorical metadata workbook (integer-coded demographic, site and
# Barratt columns keyed by participant_id, as shown in the preview below).
categorical_xlsx = "../../data/raw/widsdatathon2025/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx"
train_cat = pd.read_excel(io=categorical_xlsx)
train_cat.head(n=5)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0


In [42]:
# Load the quantitative metadata workbook (questionnaire scores and age at
# scan keyed by participant_id, as shown in the preview below).
quantitative_xlsx = "../../data/raw/widsdatathon2025/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx"
train_quant = pd.read_excel(io=quantitative_xlsx)
train_quant.head(n=5)

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,40.0,13,3,10,47,13,11,28,0,6,1,5,0,5,1,0,10,
1,CPaeQkhcjg7d,-94.47,14,3,13,34,18,23,30,0,18,6,8,7,8,10,4,5,
2,Nb4EetVPm3gs,-46.67,14,4,10,35,16,10,29,1,14,2,8,5,7,6,4,9,8.239904
3,p4vPhVu91o4b,-26.68,10,5,12,39,19,16,28,6,24,4,16,9,10,8,4,6,
4,M09PXs7arQ5E,0.0,14,5,15,40,20,24,28,1,18,4,11,4,10,7,3,9,8.940679


In [43]:
# Attach the label columns (ADHD_Outcome, Sex_F) to the categorical metadata.
# Rows are matched on participant_id with an inner join, so only participants
# present in both frames are kept.
combined_df = train_cat.merge(training_solutions, how='inner', on='participant_id')
combined_df.head(n=5)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,1,1
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,1,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0,1,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0,1,1
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0,1,1


In [44]:
# Attach the label columns (ADHD_Outcome, Sex_F) to the quantitative metadata.
# Rows are matched on participant_id with an inner join, so only participants
# present in both frames are kept.
combined_df_q = train_quant.merge(training_solutions, how='inner', on='participant_id')
combined_df_q.head(n=5)

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,40.0,13,3,10,47,13,11,28,0,...,1,5,0,5,1,0,10,,1,1
1,CPaeQkhcjg7d,-94.47,14,3,13,34,18,23,30,0,...,6,8,7,8,10,4,5,,1,0
2,Nb4EetVPm3gs,-46.67,14,4,10,35,16,10,29,1,...,2,8,5,7,6,4,9,8.239904,1,0
3,p4vPhVu91o4b,-26.68,10,5,12,39,19,16,28,6,...,4,16,9,10,8,4,6,,1,1
4,M09PXs7arQ5E,0.0,14,5,15,40,20,24,28,1,...,4,11,4,10,7,3,9,8.940679,1,1


In [59]:
# Human-readable labels for the integer codes stored in the categorical
# metadata columns. Outer key: column name; inner dict: code -> label.

# Both parents (P1 and P2) use the same education and occupation code tables,
# so each table is written once here and copied per column below.
barratt_edu_labels = {
    3: 'Less than 7th grade',
    6: 'Junior high (9th grade)',
    9: 'Partial high school',
    12: 'High school graduate',
    15: 'Partial college',
    18: 'College education',
    21: 'Graduate degree',
}
barratt_occ_labels = {
    0: 'Homemaker',
    5: 'Day laborer',
    10: 'Garbage collector',
    15: 'Painter',
    20: 'Mechanic',
    25: 'Machinist',
    30: 'Supervisor',
    35: 'Nurse',
    40: 'Engineer',
    45: 'Physician',
}

category_mappings = {
    'PreInt_Demos_Fam_Child_Ethnicity': {
        0: 'Not Hispanic or Latino',
        1: 'Hispanic or Latino',
        2: 'Decline to specify',
        3: 'Unknown',
    },
    'PreInt_Demos_Fam_Child_Race': {
        0: 'White/Caucasian',
        1: 'Black/African American',
        2: 'Hispanic',
        3: 'Asian',
        4: 'Indian',
        5: 'Native American Indian',
        6: 'American Indian/Alaskan Native',
        7: 'Native Hawaiian/Other Pacific Islander',
        8: 'Two or more races',
        9: 'Other race',
        10: 'Unknown',
        11: 'Choose not to specify',
    },
    'MRI_Track_Scan_Location': {
        1: 'Staten Island',
        2: 'RUBIC',
        3: 'CBIC',
        4: 'CUNY',
    },
    'Basic_Demos_Study_Site': {
        1: 'Staten Island',
        2: 'MRV',
        3: 'Midtown',
        4: 'Harlem',
        5: 'SI RUMC',
    },
    # Independent copies, so editing one parent's table never affects the other.
    'Barratt_Barratt_P1_Edu': dict(barratt_edu_labels),
    'Barratt_Barratt_P2_Edu': dict(barratt_edu_labels),
    'Barratt_Barratt_P1_Occ': dict(barratt_occ_labels),
    'Barratt_Barratt_P2_Occ': dict(barratt_occ_labels),
}

In [66]:
# For every column of combined_df except the row identifier, compute the mean
# of ADHD_Outcome within each distinct value of that column. The result is
# stored as {column name: Series indexed by group value}. The mean is a
# fraction (the printed values lie between 0 and 1), not a 0-100 percentage.
# Note that the label columns ADHD_Outcome and Sex_F are grouping columns too.
adhd_percentages_dict = {}

# Iterate the column labels directly instead of copying the frame just to
# drop one column.
for column in combined_df.columns.drop("participant_id"):
    adhd_percentages = combined_df.groupby(column)['ADHD_Outcome'].mean()

    if column in category_mappings:
        # Swap numeric codes for readable labels. rename() leaves any code that
        # has no entry in the mapping as its raw value; Index.map() would turn
        # such codes into NaN labels and hide which code they came from (the
        # Barratt education columns contain code 0, which is not in the mapping).
        adhd_percentages = adhd_percentages.rename(index=category_mappings[column])

    adhd_percentages_dict[column] = adhd_percentages

In [68]:
# Print the ADHD_Outcome mean for each study site. To inspect a different
# breakdown, set adhd_breakdown_key to one of:
#   "MRI_Track_Scan_Location", "Basic_Demos_Enroll_Year",
#   "PreInt_Demos_Fam_Child_Ethnicity", "PreInt_Demos_Fam_Child_Race",
#   "Barratt_Barratt_P1_Edu", "Barratt_Barratt_P2_Edu",
#   "Barratt_Barratt_P1_Occ", "Barratt_Barratt_P2_Occ"
adhd_breakdown_key = "Basic_Demos_Study_Site"
print(adhd_percentages_dict[adhd_breakdown_key])

Basic_Demos_Study_Site
Staten Island    0.645706
MRV              0.818182
Midtown          0.730233
Harlem           0.725000
Name: ADHD_Outcome, dtype: float64


In [69]:
# For every column of combined_df except the row identifier, compute the mean
# of Sex_F within each distinct value of that column. The result is stored as
# {column name: Series indexed by group value}. The mean is a fraction (the
# printed values lie between 0 and 1), not a 0-100 percentage.
# Note that the label columns ADHD_Outcome and Sex_F are grouping columns too.
sex_percentages_dict = {}

# Iterate the column labels directly instead of copying the frame just to
# drop one column.
for column in combined_df.columns.drop("participant_id"):
    sex_percentages = combined_df.groupby(column)['Sex_F'].mean()

    if column in category_mappings:
        # Swap numeric codes for readable labels. rename() leaves any code that
        # has no entry in the mapping as its raw value; Index.map() would turn
        # such codes into NaN labels and hide which code they came from (the
        # Barratt education columns contain code 0, which is not in the mapping).
        sex_percentages = sex_percentages.rename(index=category_mappings[column])

    sex_percentages_dict[column] = sex_percentages

In [70]:
# Print the Sex_F mean for each study site. To inspect a different breakdown,
# set sex_breakdown_key to one of:
#   "MRI_Track_Scan_Location", "Basic_Demos_Enroll_Year",
#   "PreInt_Demos_Fam_Child_Ethnicity", "PreInt_Demos_Fam_Child_Race",
#   "Barratt_Barratt_P1_Edu", "Barratt_Barratt_P2_Edu",
#   "Barratt_Barratt_P1_Occ", "Barratt_Barratt_P2_Occ"
sex_breakdown_key = "Basic_Demos_Study_Site"
print(sex_percentages_dict[sex_breakdown_key])

Basic_Demos_Study_Site
Staten Island    0.365031
MRV              0.363636
Midtown          0.309302
Harlem           0.341667
Name: Sex_F, dtype: float64
