In [1]:
import os
from pathlib import Path
from scipy.stats import chi2_contingency
import pandas as pd

In [8]:
#Load and merge categorical data
BASE_DIR = Path("../../data")
RAW_DIR = BASE_DIR / "raw" / "widsdatathon2025"

train_cat = pd.read_excel(RAW_DIR / "TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx")
train_solutions = pd.read_excel(RAW_DIR / "TRAIN/TRAINING_SOLUTIONS.xlsx")

#create a female ADHD column
train_solutions["Female_ADHD"] = ((train_solutions["Sex_F"] == 1) & (train_solutions["ADHD_Outcome"] == 1)).astype(int)

combined_df = pd.merge(train_cat, train_solutions, on='participant_id')
combined_df.head()


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,ADHD_Outcome,Sex_F,Female_ADHD
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,1,1,1
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,1,0,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0,1,0,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0,1,1,1
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0,1,1,1


Chi Square Test of ADHD_Outcome and every variable, printing chi sq and p value

In [9]:
# List of categorical columns (excluding participant_id & ADHD_Outcome)
cat_columns = [col for col in combined_df.columns if combined_df[col].nunique() < 10 and col not in ['participant_id', 'ADHD_Outcome']]
print("Chi-Square test of demographic variables and adhd outcome")
# Loop through categorical variables and run Chi-Square test
significant_cat = {} #initialize dictionary for p values that are < 0.05
for col in cat_columns:
    contingency_table = pd.crosstab(combined_df[col], combined_df['ADHD_Outcome'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print(f"{col}: Chi-Square = {chi2:.3f}, p-value = {p:.5f}")
    if p < 0.05: #if p val < 0.05, add to dict
        significant_cat[col] = p

Chi-Square test of demographic variables and adhd outcome
Basic_Demos_Enroll_Year: Chi-Square = 46.945, p-value = 0.00000
Basic_Demos_Study_Site: Chi-Square = 10.538, p-value = 0.01450
PreInt_Demos_Fam_Child_Ethnicity: Chi-Square = 1.642, p-value = 0.64999
MRI_Track_Scan_Location: Chi-Square = 32.946, p-value = 0.00000
Barratt_Barratt_P1_Edu: Chi-Square = 2.862, p-value = 0.89744
Barratt_Barratt_P2_Edu: Chi-Square = 1.963, p-value = 0.96188
Sex_F: Chi-Square = 20.175, p-value = 0.00001
Female_ADHD: Chi-Square = 142.923, p-value = 0.00000


In [10]:
print("-----------------------")
print("Significant Categorical Features with ADHD Outcome (p < 0.05):")
for col, p in significant_cat.items(): #print dict w significant p vals
    print(f"{col}: p-value = {p:.5f}")

-----------------------
Significant Categorical Features with ADHD Outcome (p < 0.05):
Basic_Demos_Enroll_Year: p-value = 0.00000
Basic_Demos_Study_Site: p-value = 0.01450
MRI_Track_Scan_Location: p-value = 0.00000
Sex_F: p-value = 0.00001
Female_ADHD: p-value = 0.00000


Chi Square Test of Sex_F and every variable, printing chi sq and p value

In [11]:
# List of categorical columns (excluding participant_id & Sex_F)
cat_columns = [col for col in combined_df.columns if combined_df[col].nunique() < 10 and col not in ['participant_id', 'Sex_F']]
print("Chi-Square test of demographic variables and sex")
# Loop through categorical variables and run Chi-Square test
significant_cat2 = {} #initialize dictionary for p values that are < 0.05
for col in cat_columns:
    contingency_table = pd.crosstab(combined_df[col], combined_df['Sex_F'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print(f"{col}: Chi-Square = {chi2:.3f}, p-value = {p:.5f}")
    if p < 0.05: #if p val < 0.05, add to dict
        significant_cat2[col] = p

Chi-Square test of demographic variables and sex
Basic_Demos_Enroll_Year: Chi-Square = 4.239, p-value = 0.51560
Basic_Demos_Study_Site: Chi-Square = 3.593, p-value = 0.30890
PreInt_Demos_Fam_Child_Ethnicity: Chi-Square = 1.035, p-value = 0.79269
MRI_Track_Scan_Location: Chi-Square = 7.897, p-value = 0.09543
Barratt_Barratt_P1_Edu: Chi-Square = 10.374, p-value = 0.16836
Barratt_Barratt_P2_Edu: Chi-Square = 10.466, p-value = 0.16365
ADHD_Outcome: Chi-Square = 20.175, p-value = 0.00001
Female_ADHD: Chi-Square = 599.641, p-value = 0.00000


In [12]:
print("-----------------------")
print("Significant Categorical Features with Sex (p < 0.05):")
for col, p in significant_cat2.items(): #print dict w significant p vals
    print(f"{col}: p-value = {p:.5f}")

-----------------------
Significant Categorical Features with Sex (p < 0.05):
ADHD_Outcome: p-value = 0.00001
Female_ADHD: p-value = 0.00000


Chi Sq between female_adhd and other variables

In [None]:
# List of categorical columns (excluding participant_id & Sex_F & ADHD_Outcome)
cat_columns = [col for col in combined_df.columns if col not in ['participant_id', 'Sex_F', 'ADHD_Outcome']]
# Loop through categorical variables and run Chi-Square test
# Initialize dictionaries for significant results
significant_female_adhd = {}
print("\nChi-Square test of demographic variables and Female_ADHD:")
for col in cat_columns:
    contingency_table = pd.crosstab(combined_df[col], combined_df['Female_ADHD'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print(f"{col}: Chi-Square = {chi2:.3f}, p-value = {p:.5f}")
    if p < 0.05:  
        significant_female_adhd[col] = p

Chi-Square test of demographic variables and sex

Chi-Square test of demographic variables and Female_ADHD:
Basic_Demos_Enroll_Year: Chi-Square = 4.629, p-value = 0.46281
Basic_Demos_Study_Site: Chi-Square = 1.377, p-value = 0.71095
PreInt_Demos_Fam_Child_Ethnicity: Chi-Square = 1.765, p-value = 0.62268
PreInt_Demos_Fam_Child_Race: Chi-Square = 4.158, p-value = 0.90071
MRI_Track_Scan_Location: Chi-Square = 4.216, p-value = 0.37760
Barratt_Barratt_P1_Edu: Chi-Square = 8.745, p-value = 0.27151
Barratt_Barratt_P1_Occ: Chi-Square = 6.790, p-value = 0.65899
Barratt_Barratt_P2_Edu: Chi-Square = 4.308, p-value = 0.74371
Barratt_Barratt_P2_Occ: Chi-Square = 12.315, p-value = 0.19613
Female_ADHD: Chi-Square = 1206.896, p-value = 0.00000


In [14]:
print("-----------------------")
print("Significant Categorical Features with FemaleADHD (p < 0.05):")
for col, p in significant_female_adhd.items(): #print dict w significant p vals
    print(f"{col}: p-value = {p:.5f}")

-----------------------
Significant Categorical Features with FemaleADHD (p < 0.05):
Female_ADHD: p-value = 0.00000
