In [1]:
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np

In [2]:
# Load the feature-engineered dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs as-is on a machine where this exact file location exists.
dataset_path = r'C:\Users\Kiran\Desktop\Mental Illness Prediction\dataset_after_feature_engineering.csv'
df = pd.read_csv(dataset_path)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Age Group,Household Composition,Special Education Services,Mental Illness,No Chronic Med Condition,Smokes,Serious Mental Illness,Unknown Insurance Coverage,Criminal Justice Status,Program_Category,...,Heartchronic_Summary,Disorder_summary,Other_Chronic_Illness_Summmary,Brainchronic_Summary,Insured_or_Not,Has_Public_Insurance,Has_Private_or_Other_Insurance,Confirmed_Medicaid_Managed,Gender_Identity_Orientation,Receiving Cash Assistance
0,ADULT,COHABITATES WITH OTHERS,NOT APPLICABLE,YES,YES,NO,YES,NO,NO,Regular Treatment,...,"NO, HEART CHRONIC ILLNESS",NO DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Man,No/Unknown
1,ADULT,LIVES ALONE,NOT APPLICABLE,YES,YES,NO,YES,NO,NO,Regular Treatment,...,"NO, HEART CHRONIC ILLNESS",NO DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Man,No/Unknown
2,ADULT,COHABITATES WITH OTHERS,NOT APPLICABLE,YES,YES,YES,YES,NO,NO,Regular Treatment,...,"NO, HEART CHRONIC ILLNESS",ALCOHOL/DRUG DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Man,No/Unknown
3,ADULT,NOT APPLICABLE,NOT APPLICABLE,YES,YES,YES,YES,NO,NO,Regular Treatment,...,"NO, HEART CHRONIC ILLNESS",ALCOHOL/DRUG DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,No,Cisgender Man,Yes
4,ADULT,COHABITATES WITH OTHERS,NOT APPLICABLE,YES,NO,YES,YES,NO,NO,Regular Treatment,...,"NO, HEART CHRONIC ILLNESS",ALCOHOL/DRUG DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Woman,Yes


In [3]:
# Column holding the outcome we want to predict.
target_col = 'Mental Illness'
# Every other column of the frame is treated as a candidate feature.
categorical_cols = df.columns[df.columns != target_col].tolist()

In [4]:
# Performing Chi-Square test to check the significance of categorical features with respect to the target variable.

# Purpose:
#   To check if there is a significant association between two categorical variables.
#
# How it works:
#   - Compares observed frequencies vs. expected frequencies (if variables were independent).
#   - Formula: χ² = Σ ((O - E)² / E)
#     where O = Observed frequency, E = Expected frequency.
#
# Hypotheses:
#   H₀ (Null): Variables are independent (no association).
#   H₁ (Alt): Variables are dependent (association exists).
#
# p-value interpretation:
#   - p < 0.05 → Reject H₀ → Significant association.
#   - p ≥ 0.05 → Fail to reject H₀ → No significant association.
#
#
# Note:
#   - Use for categorical variables only.

# Cramér's V (Effect Size for Chi-Square Test)
# --------------------------------------------
# Purpose:
#   To measure the strength of association between two categorical variables.
#
# Formula:
#   V = sqrt( χ² / (n * (k - 1)) )
#     where:
#       χ² = Chi-square statistic
#       n = Total sample size
#       k = min(number of rows, number of columns)
#
# Interpretation (general guideline):
#   0.00 - 0.10 → Very weak
#   0.11 - 0.30 → Weak to moderate
#   0.31 - 0.50 → Moderate to strong
#   > 0.50 → Very strong
#
# Notes:
#   - Unlike p-value, Cramér's V shows strength, not just significance.
#   - Values range between 0 (no association) and 1 (perfect association).


# Bonferroni Correction (Multiple Comparisons Adjustment)
# -------------------------------------------------------
# Purpose:
#   To control the family-wise error rate (FWER) when performing multiple hypothesis tests.
#   Without correction, the probability of at least one false positive (Type I error) increases
#   as the number of tests grows.
#
# Formula:
#   Adjusted α (alpha) = Original α / Number of tests
#     where:
#       Original α = Significance level (commonly 0.05)
#       Number of tests = Total number of statistical tests performed (here, one per feature)
#
# Interpretation:
#   - Compare each test's p-value to the adjusted α instead of the original α.
#   - If p-value < adjusted α → Result remains statistically significant after correction.
#
# Example:
#   If α = 0.05 and 10 tests → Adjusted α = 0.05 / 10 = 0.005
#   A p-value must be < 0.005 to be considered significant after Bonferroni correction.
#
# Notes:
#   - Bonferroni is conservative; reduces false positives but increases false negatives.
#   - It stays valid when the tests are correlated, but becomes even more conservative in that case.
#   - Alternative methods for multiple comparisons: Holm-Bonferroni, FDR (Benjamini-Hochberg).


# Chi-Square Test and Bonferroni Correction

- **Chi-Square Test**:
    - Used to check if there is an association between two categorical variables.
    - Null Hypothesis (H₀): The variables are independent (no association).
    - Alternative Hypothesis (H₁): The variables are not independent (association exists).
    - A smaller **p-value (< 0.05)** indicates evidence to reject H₀.

- **Cramér’s V**:
    - Measures the strength of association between categorical variables.
    - Interpretation:
        - 0.00 – 0.10 → Very Weak
        - 0.11 – 0.30 → Weak to Moderate
        - 0.31 – 0.50 → Moderate to Strong
        - > 0.50 → Very Strong

- **Bonferroni Correction**:
    - When performing multiple hypothesis tests, the chance of Type I error (false positive) increases.
    - Bonferroni correction adjusts the significance level:  
      **Adjusted α = Original α / Number of Tests**
    - Example: If α = 0.05 and 11 tests → Adjusted α = 0.00455
    - Only consider p-values < adjusted α as statistically significant after correction.


In [5]:
# Function to compute Cramér's V
def cramers_v(confusion_matrix):
    """Return Cramér's V, an effect size for a chi-square test of independence.

    V = sqrt(chi2 / (n * (k - 1))), where ``chi2`` is the chi-square statistic
    of the table, ``n`` the total count and ``k`` the smaller of the number of
    rows and columns.

    Parameters
    ----------
    confusion_matrix : DataFrame or 2-D array-like
        Contingency table of observed counts (e.g. the result of
        ``pd.crosstab``).

    Returns
    -------
    float
        Strength of association. ``0.0`` is returned when the table has fewer
        than two rows or fewer than two columns, because no association can be
        measured against a constant variable (the formula would otherwise
        evaluate 0 / 0 and yield NaN).

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, so Yates' continuity
    correction is applied to 2x2 tables; a perfectly associated 2x2 table
    therefore scores slightly below 1.
    """
    observed = np.asarray(confusion_matrix)
    k = min(observed.shape)
    if k < 2:
        # Degenerate table: only one category on one side -> denominator is 0.
        return 0.0
    chi2 = chi2_contingency(observed)[0]
    n = observed.sum()
    return np.sqrt(chi2 / (n * (k - 1)))

# Function to interpret Cramér's V
def interpret_cramers_v(v):
    """Map a Cramér's V value to a qualitative strength label.

    Parameters
    ----------
    v : float
        Cramér's V value.

    Returns
    -------
    str
        ``"Very Weak"`` for v <= 0.10, ``"Weak to Moderate"`` for v <= 0.30,
        ``"Moderate to Strong"`` for v <= 0.50, ``"Very Strong"`` above that,
        and ``"Undefined"`` when ``v`` is NaN.
    """
    # NaN fails every comparison below and would otherwise fall through to
    # the "Very Strong" branch, so it is handled explicitly first.
    if np.isnan(v):
        return "Undefined"
    if v <= 0.10:
        return "Very Weak"
    elif v <= 0.30:
        return "Weak to Moderate"
    elif v <= 0.50:
        return "Moderate to Strong"
    else:
        return "Very Strong"

# Target variable
target_var = 'Mental Illness'  # Change this to your actual target column

# Select categorical columns (excluding target).
# Besides plain ``object`` columns, pandas' dedicated string dtype and the
# ``category`` dtype are accepted too; comparing the dtype to 'object' alone
# silently skips such columns and can leave the list empty.
categorical_cols = [
    col for col in df.columns
    if col != target_var and (
        pd.api.types.is_object_dtype(df[col])
        or pd.api.types.is_string_dtype(df[col])
        or isinstance(df[col].dtype, pd.CategoricalDtype)
    )
]

# Number of tests for Bonferroni correction
num_tests = len(categorical_cols)
if num_tests == 0:
    # Without this guard the division below fails with an opaque ZeroDivisionError.
    raise ValueError("No categorical feature columns found to test against the target.")
alpha = 0.05
bonferroni_alpha = alpha / num_tests

# Collect results: one record per feature
results = []

for col in categorical_cols:
    # Contingency table of feature categories vs. target classes
    ctab = pd.crosstab(df[col], df[target_var])

    # Chi-square test of independence
    chi2, p, dof, expected = chi2_contingency(ctab)

    # Significance against the Bonferroni-adjusted threshold
    bonferroni_significant = 'Yes' if p < bonferroni_alpha else 'No'

    # Effect size (Cramér's V) and its qualitative label
    v = cramers_v(ctab)
    interpretation = interpret_cramers_v(v)

    results.append({
        'Column': col,
        'Chi-Square': round(chi2, 2),
        # Rounded for display: a shown value of 0.0 means p < 0.00005, not exactly zero.
        'p-value': round(p, 4),
        'Significant (p<0.05)': 'Yes' if p < 0.05 else 'No',
        'Bonferroni Alpha': round(bonferroni_alpha, 5),
        'Significant (Bonferroni)': bonferroni_significant,
        "Cramer's V": round(v, 3),
        'Interpretation': interpretation
    })

# Build the summary table in one go from the collected records
summary_results = pd.DataFrame(results)


# Display the summary table
summary_results.head(50)


Unnamed: 0,Column,Chi-Square,p-value,Significant (p<0.05),Bonferroni Alpha,Significant (Bonferroni),Cramer's V,Interpretation
0,Age Group,2110.14,0.0,Yes,0.00135,Yes,0.104,Weak to Moderate
1,Household Composition,499.65,0.0,Yes,0.00135,Yes,0.051,Very Weak
2,Special Education Services,2458.24,0.0,Yes,0.00135,Yes,0.113,Weak to Moderate
3,No Chronic Med Condition,1686.87,0.0,Yes,0.00135,Yes,0.093,Very Weak
4,Smokes,449.41,0.0,Yes,0.00135,Yes,0.048,Very Weak
5,Serious Mental Illness,72269.39,0.0,Yes,0.00135,Yes,0.61,Very Strong
6,Unknown Insurance Coverage,73.06,0.0,Yes,0.00135,Yes,0.019,Very Weak
7,Criminal Justice Status,87.47,0.0,Yes,0.00135,Yes,0.021,Very Weak
8,Program_Category,1245.44,0.0,Yes,0.00135,Yes,0.08,Very Weak
9,Region_Served,1.23,0.5409,No,0.00135,No,0.003,Very Weak


In [6]:
# Columns identified as non-significant by the chi-square screening; they are
# removed from the working frame in place.
non_significant_cols = ['Cultural Group', 'Veteran_Status', 'Region_Served']
df.drop(columns=non_significant_cols, inplace=True)

In [7]:

# Target-leakage guard
# --------------------
# In the chi-square screening, "Serious Mental Illness" was very strongly
# associated with the target (Cramer's V = 0.61). That signals high predictive
# power, but also strong information leakage: the column is essentially a
# restatement of the target outcome.
#
# If it were kept, model performance would be artificially inflated, because
# the model could "cheat" by leaning almost entirely on this one column rather
# than learning useful patterns from the other features.
#
# For a fair and generalizable model the column is therefore DROPPED from the
# modeling features. It is still reported in the EDA section to highlight its
# strong statistical significance.

leakage_cols = ['Serious Mental Illness']
df.drop(columns=leakage_cols, inplace=True)


In [8]:
# Show the distinct values of every remaining column.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name} -  {distinct_values}")

Age Group -  ['ADULT' 'CHILD' 'UNKNOWN']
Household Composition -  ['COHABITATES WITH OTHERS' 'LIVES ALONE' 'NOT APPLICABLE' 'UNKNOWN']
Special Education Services -  ['NOT APPLICABLE' 'YES' 'NO' 'UNKNOWN']
Mental Illness -  ['YES' 'NO']
No Chronic Med Condition -  ['YES' 'NO' 'UNKNOWN']
Smokes -  ['NO' 'YES' 'UNKNOWN']
Unknown Insurance Coverage -  ['NO' 'YES']
Criminal Justice Status -  ['NO' 'YES' 'UNKNOWN']
Program_Category -  ['Regular Treatment' 'Extra Help' 'Urgent Care']
Religion_Category -  ['Unknown' 'Formal Religion' 'Spiritual but not Religious']
Employment_Status -  ['Employed' 'Not in Labor Force' 'Unemployed' 'Unknown']
Hours_Category -  ['Part-Time' 'Full-Time' 'Unknown']
Education_Category -  ['Higher Education' 'Secondary Education' 'Unknown' 'Primary Education'
 'No Formal Education']
RACE -  ['WHITE' 'OTHER/MULTIRACIAL' 'BLACK' 'UNKNOWN']
hispanic_ethnicity -  ['HISPANIC' 'NON-HISPANIC' 'UNKNOWN']
Living_Situation -  ['PRIVATE RESIDENCE' 'OTHER' 'INSTITUTIONAL/UNKNOWN

In [10]:
# Write the reduced dataset to CSV (row index omitted) for model building.
output_path = 'final_data_for_model_building.csv'
df.to_csv(output_path, index=False)