In [1]:
# Importing libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

#import the remaining librairies to be used
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    recall_score,
    precision_score,
    classification_report,
)

In [2]:
# Reading dataset
df_final = pd.read_csv(r'final_data_for_model_building.csv'
                 ) 

df_final.head(5)

Unnamed: 0,Age Group,Household Composition,Special Education Services,Mental Illness,No Chronic Med Condition,Smokes,Unknown Insurance Coverage,Criminal Justice Status,Program_Category,Religion_Category,...,Heartchronic_Summary,Disorder_summary,Other_Chronic_Illness_Summmary,Brainchronic_Summary,Insured_or_Not,Has_Public_Insurance,Has_Private_or_Other_Insurance,Confirmed_Medicaid_Managed,Gender_Identity_Orientation,Receiving Cash Assistance
0,ADULT,COHABITATES WITH OTHERS,NOT APPLICABLE,YES,YES,NO,NO,NO,Regular Treatment,Unknown,...,"NO, HEART CHRONIC ILLNESS",NO DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Man,No/Unknown
1,ADULT,LIVES ALONE,NOT APPLICABLE,YES,YES,NO,NO,NO,Regular Treatment,Formal Religion,...,"NO, HEART CHRONIC ILLNESS",NO DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Man,No/Unknown
2,ADULT,COHABITATES WITH OTHERS,NOT APPLICABLE,YES,YES,YES,NO,NO,Regular Treatment,Formal Religion,...,"NO, HEART CHRONIC ILLNESS",ALCOHOL/DRUG DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Man,No/Unknown
3,ADULT,NOT APPLICABLE,NOT APPLICABLE,YES,YES,YES,NO,NO,Regular Treatment,Unknown,...,"NO, HEART CHRONIC ILLNESS",ALCOHOL/DRUG DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,No,Cisgender Man,Yes
4,ADULT,COHABITATES WITH OTHERS,NOT APPLICABLE,YES,NO,YES,NO,NO,Regular Treatment,Spiritual but not Religious,...,"NO, HEART CHRONIC ILLNESS",ALCOHOL/DRUG DISORDER,"NO, CHRONIC ILLNESS","NO, BRAIN CHRONIC ILLNESS",Yes,Yes,No,Yes,Cisgender Woman,Yes


In [3]:
df_final.columns

Index(['Age Group', 'Household Composition', 'Special Education Services',
       'Mental Illness', 'No Chronic Med Condition', 'Smokes',
       'Unknown Insurance Coverage', 'Criminal Justice Status',
       'Program_Category', 'Religion_Category', 'Employment_Status',
       'Hours_Category', 'Education_Category', 'RACE', 'hispanic_ethnicity',
       'Living_Situation', 'Diagnosis_Summary', 'Mental_Disability_Summary',
       'Impairment_Summary', 'Chronic_disease_Summary',
       'Canabis_Usage_Summary', 'Smoking treatment_summary',
       'Service_drug_alcohol_Summary', 'Other_testchronic_group_Summary',
       'Heartchronic_Summary', 'Disorder_summary',
       'Other_Chronic_Illness_Summmary', 'Brainchronic_Summary',
       'Insured_or_Not', 'Has_Public_Insurance',
       'Has_Private_or_Other_Insurance', 'Confirmed_Medicaid_Managed',
       'Gender_Identity_Orientation', 'Receiving Cash Assistance'],
      dtype='object')

In [4]:
for i in df_final.columns:
    print(i, df_final[i].nunique())

Age Group 3
Household Composition 4
Special Education Services 4
Mental Illness 2
No Chronic Med Condition 3
Smokes 3
Unknown Insurance Coverage 2
Criminal Justice Status 3
Program_Category 3
Religion_Category 3
Employment_Status 4
Hours_Category 3
Education_Category 5
RACE 4
hispanic_ethnicity 3
Living_Situation 3
Diagnosis_Summary 5
Mental_Disability_Summary 3
Impairment_Summary 3
Chronic_disease_Summary 2
Canabis_Usage_Summary 3
Smoking treatment_summary 3
Service_drug_alcohol_Summary 3
Other_testchronic_group_Summary 3
Heartchronic_Summary 3
Disorder_summary 3
Other_Chronic_Illness_Summmary 2
Brainchronic_Summary 3
Insured_or_Not 2
Has_Public_Insurance 2
Has_Private_or_Other_Insurance 2
Confirmed_Medicaid_Managed 2
Gender_Identity_Orientation 6
Receiving Cash Assistance 2


In [5]:
df_model = df_final.copy()

In [6]:
# 1) Label-encode Education_Category (ordered: low -> high)
#    We'll map "Unknown" to -1 to avoid implying a false order.
edu_order = {
    'No Formal Education': 0,
    'Primary Education': 1,
    'Secondary Education': 2,
    'Higher Education': 3,
    'Unknown': -1,  # keeping unknown separate; avoids misleading ordinal meaning
}

if 'Education_Category' in df_model.columns:
    df_model['Education_Category'] = (
        df_model['Education_Category'].astype(str).map(lambda x: x.strip()).map(edu_order).fillna(-1).astype(int)
    )


In [7]:
# encode a binary categorical column into a single 0/1 column
#    - Replaces the original column name with its dummy (drop_first=True).
#    - For labels like ['NO','YES'], the resulting column will be 1 for "YES".

def encode_binary_column(df, col):
    if col not in df.columns:
        return
    dummies = pd.get_dummies(df[col], drop_first=True, dtype=int)
    # If the column somehow isn't binary, skip safely
    if dummies.shape[1] != 1:
        return
    df.drop(columns=[col], inplace=True)
    df[col] = dummies.iloc[:, 0]

# Encode all required binary columns (2 categories)
# Target is 'Mental Illness' (Yes/No), that is also encoded here.

binary_cols = [
    'Mental Illness',
    'Unknown Insurance Coverage',
    'Insured_or_Not',
    'Receiving Cash Assistance',
    'Has_Public_Insurance',
    'Has_Private_or_Other_Insurance',
    'Confirmed_Medicaid_Managed',
    'Chronic_disease_Summary',
    'Other_Chronic_Illness_Summmary',
]

for col in binary_cols:
    encode_binary_column(df_model, col)

In [8]:

# df_model['mental_illness'] = df_model['Mental Illness']
# 5) One-hot encode all remaining multi-class categorical columns (3+ categories)
#    We’ll one-hot them in one shot. For tree models (RF/XGB), drop_first=False is fine.

multiclass_cols = [
    'Age Group',
    'Household Composition',
    'Special Education Services',
    'No Chronic Med Condition',
    'Smokes',
    'Criminal Justice Status',
    'Program_Category',
    'Religion_Category',
    'Employment_Status',
    'Hours_Category',
    'RACE',
    'hispanic_ethnicity',
    'Living_Situation',
    'Gender_Identity_Orientation',
    'Diagnosis_Summary',
    'Mental_Disability_Summary',
    'Impairment_Summary',
    'Canabis_Usage_Summary',
    'Smoking treatment_summary',
    'Service_drug_alcohol_Summary',
    'Other_testchronic_group_Summary',
    'Heartchronic_Summary',
    'Disorder_summary',
    'Brainchronic_Summary',
]

# Remove any columns that don’t exist or that are not object/categorical anymore
multiclass_cols = [c for c in multiclass_cols if c in df_model.columns]

# Identify which of these are still non-numeric (object/category) and need dummies
obj_to_dummy = [c for c in multiclass_cols if df_model[c].dtype == 'O']

if obj_to_dummy:
    dummies = pd.get_dummies(
        df_model[obj_to_dummy],
        prefix=obj_to_dummy,
        drop_first=False,
        dtype=int
    )
    df_model = pd.concat([df_model.drop(columns=obj_to_dummy), dummies], axis=1)

# 6) Final encoded frame ready for modeling
encoded_df = df_model.copy()
encoded_df.head()


Unnamed: 0,Education_Category,Mental Illness,Unknown Insurance Coverage,Insured_or_Not,Receiving Cash Assistance,Has_Public_Insurance,Has_Private_or_Other_Insurance,Confirmed_Medicaid_Managed,Chronic_disease_Summary,Other_Chronic_Illness_Summmary,...,"Other_testchronic_group_Summary_YES, HYPERLIPIDEMIA/HIGHBLOODPRESSURE/OBESITY","Heartchronic_Summary_NO, HEART CHRONIC ILLNESS",Heartchronic_Summary_UNKNOWN,"Heartchronic_Summary_YES, HEART CHRONIC ILLNESS",Disorder_summary_ALCOHOL/DRUG DISORDER,Disorder_summary_NO DISORDER,Disorder_summary_UNKNOWN,"Brainchronic_Summary_NO, BRAIN CHRONIC ILLNESS",Brainchronic_Summary_UNKNOWN,"Brainchronic_Summary_YES, BRAIN CHRONIC ILLNESS"
0,3,1,0,1,0,1,0,1,1,0,...,0,1,0,0,0,1,0,1,0,0
1,3,1,0,1,0,1,0,1,1,0,...,0,1,0,0,0,1,0,1,0,0
2,3,1,0,1,0,1,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
3,2,1,0,1,1,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
4,3,1,0,1,1,1,0,1,0,0,...,1,1,0,0,1,0,0,1,0,0


In [9]:
print(encoded_df.columns)

Index(['Education_Category', 'Mental Illness', 'Unknown Insurance Coverage',
       'Insured_or_Not', 'Receiving Cash Assistance', 'Has_Public_Insurance',
       'Has_Private_or_Other_Insurance', 'Confirmed_Medicaid_Managed',
       'Chronic_disease_Summary', 'Other_Chronic_Illness_Summmary',
       'Age Group_ADULT', 'Age Group_CHILD', 'Age Group_UNKNOWN',
       'Household Composition_COHABITATES WITH OTHERS',
       'Household Composition_LIVES ALONE',
       'Household Composition_NOT APPLICABLE', 'Household Composition_UNKNOWN',
       'Special Education Services_NO',
       'Special Education Services_NOT APPLICABLE',
       'Special Education Services_UNKNOWN', 'Special Education Services_YES',
       'No Chronic Med Condition_NO', 'No Chronic Med Condition_UNKNOWN',
       'No Chronic Med Condition_YES', 'Smokes_NO', 'Smokes_UNKNOWN',
       'Smokes_YES', 'Criminal Justice Status_NO',
       'Criminal Justice Status_UNKNOWN', 'Criminal Justice Status_YES',
       'Program_Categ

#### Model Building

In [10]:
# ================================================================
# Handling Class Imbalance in Classification
# ================================================================
# Problem:
#   Dataset is highly imbalanced → Baseline models predict majority class.
#
# Solution:
#   1. Use class_weight='balanced':
#       - Automatically adjusts weights inversely proportional to class frequencies.
#   2. Apply SMOTE (Synthetic Minority Over-sampling Technique):
#       - Creates synthetic samples for minority class.
#       - Applied only to training set (to prevent data leakage).
#
# Why Combine Both?
#   - SMOTE balances training data.
#   - class_weight ensures model still considers minority class importance.
#
# Expected Result:
#   - Higher recall and F1-score for minority class.
#   - Slight accuracy drop but fairer predictions.

In [11]:


# ================================================================
# 1. Prepare Data
# ================================================================
df = encoded_df.copy()

# Define target and features
TARGET = 'Mental Illness'
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Split into train and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Original class distribution:")
print(y.value_counts(normalize=True))

# ================================================================
# 2. Baseline Logistic Regression (No SMOTE)
# ================================================================
baseline_lr = LogisticRegression(
    solver='liblinear',       # works well for small/medium data
    class_weight=None,        # No class weight for baseline
    random_state=42
)
baseline_lr.fit(X_train, y_train)

# Predictions
y_pred_baseline = baseline_lr.predict(X_test)

print("\n===== Baseline Logistic Regression Results =====")
print(classification_report(y_test, y_pred_baseline))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_baseline))

# ================================================================
# 3. Apply SMOTE (ONLY on training set)
# ================================================================
print("\nApplying SMOTE...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("After SMOTE class distribution:")
print(pd.Series(y_train_smote).value_counts())

# ================================================================
# 4. Logistic Regression with Class Weighting + SMOTE
# ================================================================
lr_balanced = LogisticRegression(
    solver='liblinear',         # suitable for binary class
    class_weight='balanced',    # handle imbalance
    max_iter=1000,              # ensure convergence
    random_state=42
)
lr_balanced.fit(X_train_smote, y_train_smote)

# Predictions
y_pred_smote = lr_balanced.predict(X_test)

# ================================================================
# 5. Results After SMOTE + Class Weight
# ================================================================
print("\n===== Logistic Regression After SMOTE + Class Weighting =====")
print(classification_report(y_test, y_pred_smote))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))


Original class distribution:
Mental Illness
1    0.97127
0    0.02873
Name: proportion, dtype: float64

===== Baseline Logistic Regression Results =====
              precision    recall  f1-score   support

           0       0.60      0.07      0.13      1673
           1       0.97      1.00      0.99     56563

    accuracy                           0.97     58236
   macro avg       0.79      0.54      0.56     58236
weighted avg       0.96      0.97      0.96     58236


Confusion Matrix:
 [[  121  1552]
 [   80 56483]]

Applying SMOTE...
After SMOTE class distribution:
Mental Illness
1    131977
0    131977
Name: count, dtype: int64

===== Logistic Regression After SMOTE + Class Weighting =====
              precision    recall  f1-score   support

           0       0.18      0.34      0.23      1673
           1       0.98      0.95      0.97     56563

    accuracy                           0.94     58236
   macro avg       0.58      0.65      0.60     58236
weighted avg      

In [12]:
# Logistic Regression Hyperparameter Tuning with GridSearchCV
# Goal:
#   - Optimize Logistic Regression performance by tuning regularization and penalty parameters.
#
# Why Important?
#   - Default settings may underfit or overfit.
#   - Proper tuning ensures better generalization and handles class imbalance effectively.
#
# Strategy:
#   - Use 5-fold Cross Validation for robust performance estimation.
#   - Optimize for F1-score (better for imbalanced classes).
#
# Parameters Tuned:
#   - C: Inverse of regularization strength (controls overfitting vs underfitting).
#   - penalty: Type of regularization ('l1', 'l2') to prevent overfitting.
#   - solver: Algorithm to optimize the logistic regression model (must match penalty type).
#
# Additional Handling:
#   - class_weight='balanced' used to address class imbalance.
#   - max_iter increased to ensure convergence.


In [13]:
# ================================================================
# Optimized GridSearchCV for Logistic Regression
# - Focus on best params only (no unnecessary DataFrame outputs)
# ================================================================

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define parameter grid
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],          # Regularization strength
    'penalty': ['l1', 'l2'],          # Regularization type
    'solver': ['liblinear', 'saga']   # Compatible solvers
}

# Initialize GridSearchCV
grid_lr = GridSearchCV(
    estimator=LogisticRegression(
        class_weight='balanced',       # Handle imbalance
        max_iter=1000,                 # Ensure convergence
        random_state=123
    ),
    param_grid=param_grid_lr,
    scoring='f1',      # Focus on F1 for imbalance
    n_jobs=-1,         # Use all cores
    cv=3,              # 3-fold for speed
    verbose=2,
    refit=True
)

# Run Grid Search on SMOTE-resampled data
print("Running Grid Search for Logistic Regression...")
grid_lr.fit(X_train_smote, y_train_smote)

# Print the best parameters and score
print("\n Best Parameters Found:")
print(grid_lr.best_params_)
print(f" Best Cross-Validated F1 Score: {grid_lr.best_score_:.4f}")



Running Grid Search for Logistic Regression...
Fitting 3 folds for each of 16 candidates, totalling 48 fits

 Best Parameters Found:
{'C': 10, 'penalty': 'l2', 'solver': 'saga'}
 Best Cross-Validated F1 Score: 0.9515


In [14]:
# Get best model
best_lr_model = grid_lr.best_estimator_

# Evaluate Best Model on Test Data
y_pred_lr = best_lr_model.predict(X_test)

print("\n Classification Report on Test Data:")
print(classification_report(y_test, y_pred_lr))


 Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.19      0.33      0.24      1673
           1       0.98      0.96      0.97     56563

    accuracy                           0.94     58236
   macro avg       0.58      0.64      0.60     58236
weighted avg       0.96      0.94      0.95     58236



In [15]:
import joblib

# Save model to file
joblib.dump(best_lr_model, "best_lr_model.pkl")
print("Model saved as best_lr_model.pkl")


# Load model from file
loaded_model = joblib.load("best_lr_model.pkl")

# Use loaded model to predict
y_pred_loaded = loaded_model.predict(X_test)
print("Predictions using loaded model:")
print(y_pred_loaded[:10])

Model saved as best_lr_model.pkl
Predictions using loaded model:
[1 1 1 1 1 1 1 1 1 1]
