In [3]:
import os
from pathlib import Path
from scipy.stats import chi2_contingency
import pandas as pd

In [4]:
#Load and merge categorical data
BASE_DIR = Path("../../data")
RAW_DIR = BASE_DIR / "raw" / "widsdatathon2025"

train_quant = pd.read_excel(RAW_DIR / "TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx")
train_solutions = pd.read_excel(RAW_DIR / "TRAIN/TRAINING_SOLUTIONS.xlsx")

#create a female ADHD column
train_solutions["Female_ADHD"] = ((train_solutions["Sex_F"] == 1) & (train_solutions["ADHD_Outcome"] == 1)).astype(int)

combined_df = pd.merge(train_quant, train_solutions, on='participant_id')
combined_df.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan,ADHD_Outcome,Sex_F,Female_ADHD
0,UmrK0vMLopoR,40.0,13,3,10,47,13,11,28,0,...,5,0,5,1,0,10,,1,1,1
1,CPaeQkhcjg7d,-94.47,14,3,13,34,18,23,30,0,...,8,7,8,10,4,5,,1,0,0
2,Nb4EetVPm3gs,-46.67,14,4,10,35,16,10,29,1,...,8,5,7,6,4,9,8.239904,1,0,0
3,p4vPhVu91o4b,-26.68,10,5,12,39,19,16,28,6,...,16,9,10,8,4,6,,1,1,1
4,M09PXs7arQ5E,0.0,14,5,15,40,20,24,28,1,...,11,4,10,7,3,9,8.940679,1,1,1


Chi Square Test of ADHD_Outcome and every variable, printing chi sq and p value

In [5]:
# List of numerical columns (excluding participant_id & ADHD_Outcome)
num_columns = [col for col in combined_df.columns if col not in ['participant_id', 'ADHD_Outcome']]
print("Chi-Square test of numerical variables and adhd outcome")
# Loop through numerica; variables and run Chi-Square test
significant_num = {}
for col in num_columns:
    contingency_table = pd.crosstab(combined_df[col], combined_df['ADHD_Outcome'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print(f"{col}: Chi-Square = {chi2:.3f}, p-value = {p:.5f}")
    if p < 0.05:
        significant_num[col] = p

Chi-Square test of numerical variables and adhd outcome
EHQ_EHQ_Total: Chi-Square = 157.543, p-value = 0.47280
ColorVision_CV_Score: Chi-Square = 12.625, p-value = 0.47719
APQ_P_APQ_P_CP: Chi-Square = 13.653, p-value = 0.18940
APQ_P_APQ_P_ID: Chi-Square = 37.107, p-value = 0.01636
APQ_P_APQ_P_INV: Chi-Square = 36.513, p-value = 0.08262
APQ_P_APQ_P_OPD: Chi-Square = 50.223, p-value = 0.00034
APQ_P_APQ_P_PM: Chi-Square = 35.693, p-value = 0.21829
APQ_P_APQ_P_PP: Chi-Square = 20.703, p-value = 0.23986
SDQ_SDQ_Conduct_Problems: Chi-Square = 106.177, p-value = 0.00000
SDQ_SDQ_Difficulties_Total: Chi-Square = 314.505, p-value = 0.00000
SDQ_SDQ_Emotional_Problems: Chi-Square = 63.160, p-value = 0.00000
SDQ_SDQ_Externalizing: Chi-Square = 366.308, p-value = 0.00000
SDQ_SDQ_Generating_Impact: Chi-Square = 245.953, p-value = 0.00000
SDQ_SDQ_Hyperactivity: Chi-Square = 414.547, p-value = 0.00000
SDQ_SDQ_Internalizing: Chi-Square = 101.508, p-value = 0.00000
SDQ_SDQ_Peer_Problems: Chi-Square = 68.

In [6]:
print("-----------------------")
print("Significant Numerical Features with ADHD Outcome (p < 0.05):")
for col, p in significant_num.items():
    print(f"{col}: p-value = {p:.5f}")

-----------------------
Significant Numerical Features with ADHD Outcome (p < 0.05):
APQ_P_APQ_P_ID: p-value = 0.01636
APQ_P_APQ_P_OPD: p-value = 0.00034
SDQ_SDQ_Conduct_Problems: p-value = 0.00000
SDQ_SDQ_Difficulties_Total: p-value = 0.00000
SDQ_SDQ_Emotional_Problems: p-value = 0.00000
SDQ_SDQ_Externalizing: p-value = 0.00000
SDQ_SDQ_Generating_Impact: p-value = 0.00000
SDQ_SDQ_Hyperactivity: p-value = 0.00000
SDQ_SDQ_Internalizing: p-value = 0.00000
SDQ_SDQ_Peer_Problems: p-value = 0.00000
SDQ_SDQ_Prosocial: p-value = 0.00000
Sex_F: p-value = 0.00001
Female_ADHD: p-value = 0.00000


Chi Square Test of Sex_F and every variable, printing chi sq and p value

In [7]:
# List of  numerical columns (excluding participant_id & Sex_F)
num_columns = [col for col in combined_df.columns if col not in ['participant_id', 'Sex_F']]
print("Chi-Square test of demographic variables and sex")
# Loop through categorical variables and run Chi-Square test
significant_num2 = {}
for col in num_columns:
    contingency_table = pd.crosstab(combined_df[col], combined_df['Sex_F'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print(f"{col}: Chi-Square = {chi2:.3f}, p-value = {p:.5f}")
    if p < 0.05:
        significant_num2[col] = p

Chi-Square test of demographic variables and sex
EHQ_EHQ_Total: Chi-Square = 166.483, p-value = 0.28700
ColorVision_CV_Score: Chi-Square = 19.523, p-value = 0.10777
APQ_P_APQ_P_CP: Chi-Square = 7.371, p-value = 0.69000
APQ_P_APQ_P_ID: Chi-Square = 27.355, p-value = 0.15942
APQ_P_APQ_P_INV: Chi-Square = 31.227, p-value = 0.21994
APQ_P_APQ_P_OPD: Chi-Square = 20.859, p-value = 0.46759
APQ_P_APQ_P_PM: Chi-Square = 19.130, p-value = 0.93723
APQ_P_APQ_P_PP: Chi-Square = 20.501, p-value = 0.24940
SDQ_SDQ_Conduct_Problems: Chi-Square = 8.119, p-value = 0.61722
SDQ_SDQ_Difficulties_Total: Chi-Square = 31.398, p-value = 0.54694
SDQ_SDQ_Emotional_Problems: Chi-Square = 20.717, p-value = 0.02316
SDQ_SDQ_Externalizing: Chi-Square = 35.110, p-value = 0.01952
SDQ_SDQ_Generating_Impact: Chi-Square = 9.723, p-value = 0.46509
SDQ_SDQ_Hyperactivity: Chi-Square = 33.435, p-value = 0.00023
SDQ_SDQ_Internalizing: Chi-Square = 27.062, p-value = 0.05717
SDQ_SDQ_Peer_Problems: Chi-Square = 5.316, p-value = 0.

In [8]:
print("-----------------------")
print("Significant Numerical Features with Sex (p < 0.05):")
for col, p in significant_num2.items():
    print(f"{col}: p-value = {p:.5f}")

-----------------------
Significant Numerical Features with Sex (p < 0.05):
SDQ_SDQ_Emotional_Problems: p-value = 0.02316
SDQ_SDQ_Externalizing: p-value = 0.01952
SDQ_SDQ_Hyperactivity: p-value = 0.00023
SDQ_SDQ_Prosocial: p-value = 0.00301
ADHD_Outcome: p-value = 0.00001
Female_ADHD: p-value = 0.00000


Chi Sq between female_adhd and other variables

In [9]:
# List of categorical columns (excluding participant_id & Sex_F & ADHD_Outcome)
num_columns = [col for col in combined_df.columns if col not in ['participant_id', 'Sex_F', 'ADHD_Outcome']]
print("Chi-Square test of numerical variables and female adhd")
# Loop through categorical variables and run Chi-Square test
# Initialize dictionaries for significant results
significant_female_adhd = {}
print("\nChi-Square test of demographic variables and Female_ADHD:")
for col in num_columns:
    contingency_table = pd.crosstab(combined_df[col], combined_df['Female_ADHD'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print(f"{col}: Chi-Square = {chi2:.3f}, p-value = {p:.5f}")
    if p < 0.05:  
        significant_female_adhd[col] = p

Chi-Square test of numerical variables and female adhd

Chi-Square test of demographic variables and Female_ADHD:
EHQ_EHQ_Total: Chi-Square = 148.935, p-value = 0.66471
ColorVision_CV_Score: Chi-Square = 10.833, p-value = 0.62484
APQ_P_APQ_P_CP: Chi-Square = 4.627, p-value = 0.91463
APQ_P_APQ_P_ID: Chi-Square = 28.119, p-value = 0.13680
APQ_P_APQ_P_INV: Chi-Square = 23.695, p-value = 0.59337
APQ_P_APQ_P_OPD: Chi-Square = 12.421, p-value = 0.92755
APQ_P_APQ_P_PM: Chi-Square = 22.813, p-value = 0.82287
APQ_P_APQ_P_PP: Chi-Square = 28.323, p-value = 0.04130
SDQ_SDQ_Conduct_Problems: Chi-Square = 11.263, p-value = 0.33739
SDQ_SDQ_Difficulties_Total: Chi-Square = 65.840, p-value = 0.00058
SDQ_SDQ_Emotional_Problems: Chi-Square = 51.994, p-value = 0.00000
SDQ_SDQ_Externalizing: Chi-Square = 43.492, p-value = 0.00176
SDQ_SDQ_Generating_Impact: Chi-Square = 53.811, p-value = 0.00000
SDQ_SDQ_Hyperactivity: Chi-Square = 42.789, p-value = 0.00001
SDQ_SDQ_Internalizing: Chi-Square = 47.978, p-valu

In [10]:
print("-----------------------")
print("Significant Numerical Features with FemaleADHD (p < 0.05):")
for col, p in significant_female_adhd.items(): #print dict w significant p vals
    print(f"{col}: p-value = {p:.5f}")

-----------------------
Significant Numerical Features with FemaleADHD (p < 0.05):
APQ_P_APQ_P_PP: p-value = 0.04130
SDQ_SDQ_Difficulties_Total: p-value = 0.00058
SDQ_SDQ_Emotional_Problems: p-value = 0.00000
SDQ_SDQ_Externalizing: p-value = 0.00176
SDQ_SDQ_Generating_Impact: p-value = 0.00000
SDQ_SDQ_Hyperactivity: p-value = 0.00001
SDQ_SDQ_Internalizing: p-value = 0.00009
SDQ_SDQ_Peer_Problems: p-value = 0.02035
Female_ADHD: p-value = 0.00000
