In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import sys
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostClassifier
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier


In [11]:
def preprocess_data_index(train, min_count=10):
    """
    Return the index labels of the training rows that survive data cleaning.

    All work is done on a private copy, so the caller's frame is never modified.
    Only the surviving index is returned; the caller can use it with ``.loc`` to
    select the same rows from the raw data.

    A row is kept only if all of the following hold:
      * 'Dietary Habits' is one of 'Unhealthy', 'Moderate', 'Healthy';
      * its 'Profession' (students are labelled 'Student') occurs strictly more
        than ``min_count`` times among the rows that passed the diet filter;
      * its 'Degree' occurs strictly more than ``min_count`` times among the rows
        that also passed the profession filter;
      * 'Sleep Duration' is one of the recognised range labels;
      * no remaining column is missing once 'CGPA' has been imputed with its mean
        (Gender and the two Yes/No columns are binarised first, so a missing value
        there becomes 0 and does not drop the row).

    Parameters
    ----------
    train : pandas.DataFrame
        Raw feature frame containing the survey columns referenced below.
    min_count : int, default 10
        A profession/degree is kept only when its count is strictly greater than
        this value. The default reproduces the previously hard-coded threshold.

    Returns
    -------
    pandas.Index
        Index labels of the valid rows, in their original order.
    """
    # Remove identifier columns; drop() returns a new frame, so `train` is untouched.
    data = train.drop(['id', 'Name'], axis=1)

    # Merge the two pressure columns into one by taking the row-wise maximum.
    data['Pressure'] = data[['Work Pressure', 'Academic Pressure']].max(axis=1)
    data = data.drop(['Work Pressure', 'Academic Pressure'], axis=1)

    # Binary gender flag (1 for 'Male', 0 otherwise, including missing values).
    data['Gender'] = (data['Gender'] == 'Male').astype(int)

    # Students get the explicit profession label 'Student'.
    data.loc[data['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'

    # Merge the two satisfaction columns the same way as the pressure columns.
    data['Satisfaction'] = data[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
    data = data.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)

    # Yes/No answers become 1/0 (anything other than 'Yes' maps to 0).
    for answer_col in ('Family History of Mental Illness',
                       'Have you ever had suicidal thoughts ?'):
        data[answer_col] = (data[answer_col] == 'Yes').astype(int)

    # City is not used; the student/professional flag is now carried by 'Profession'.
    data = data.drop(['City', 'Working Professional or Student'], axis=1)

    # Keep only rows with a recognised diet label. The explicit copy means the
    # assignment below writes to an independent frame, not to a filtered slice.
    diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
    data = data[data['Dietary Habits'].isin(list(diet_mapping))].copy()
    data['Dietary Habits'] = data['Dietary Habits'].map(diet_mapping)

    # Drop rare categories. The filters are sequential: degree counts are computed
    # on the rows that already passed the profession filter. value_counts() skips
    # missing values, so rows with a missing profession/degree are removed as well.
    # One-hot encoding is not needed here: it cannot introduce or remove missing
    # values, so it has no influence on which rows survive the final dropna().
    for category_col in ('Profession', 'Degree'):
        counts = data[category_col].value_counts()
        frequent = counts.index[counts.gt(min_count)]
        data = data[data[category_col].isin(frequent)]

    # Keep only rows with a recognised sleep label and convert it to hours.
    dict_sleep = {
        'Less than 5 hours': 4.0,
        '5-6 hours': 5.5,
        '6-7 hours': 6.5,
        '7-8 hours': 7.5,
        'More than 8 hours': 9.0,
        '2-3 hours': 2.5,
        '3-4 hours': 3.5,
        '4-5 hours': 4.5,
        '4-6 hours': 5.0
    }
    data = data[data['Sleep Duration'].isin(list(dict_sleep))].copy()
    data['Sleep Duration'] = data['Sleep Duration'].map(dict_sleep)

    # Impute CGPA with the mean of the remaining rows, then drop every row that
    # still has a missing value in any column.
    data['CGPA'] = data['CGPA'].fillna(data['CGPA'].mean())
    return data.dropna().index

In [12]:

def preprocess_data(train):
    """
    Convert a raw survey frame into the numeric feature frame used for modelling.

    No rows are removed. Values of 'Dietary Habits' and 'Sleep Duration' that are
    not in the lookup tables become NaN, and missing 'CGPA' values are replaced by
    the column mean. The input frame itself is not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw feature frame containing the survey columns referenced below.

    Returns
    -------
    pandas.DataFrame
        Frame with identifier/city columns removed, merged 'Pressure' and
        'Satisfaction' columns, binary flags, and one-hot encoded
        'Profession_*' and 'Degree_*' integer columns.
    """
    # Lookup tables: diet quality as an ordinal score, sleep ranges as approximate hours.
    diet_levels = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
    sleep_hours = {
        'Less than 5 hours': 4.0,
        '5-6 hours': 5.5,
        '6-7 hours': 6.5,
        '7-8 hours': 7.5,
        'More than 8 hours': 9.0,
        '2-3 hours': 2.5,
        '3-4 hours': 3.5,
        '4-5 hours': 4.5,
        '4-6 hours': 5.0
    }

    def _one_hot(frame, column):
        """One-hot encode ``column`` and cast every ``<column>_*`` column to int."""
        encoded = pd.get_dummies(frame, columns=[column])
        dummy_names = [name for name in encoded.columns if name.startswith(column + '_')]
        encoded[dummy_names] = encoded[dummy_names].astype(int)
        return encoded

    # Identifier columns and City carry no modelling signal here.
    data = train.drop(columns=['id', 'Name', 'City'])

    # Work/Academic pressure and Study/Job satisfaction are each merged into one
    # column holding the row-wise maximum of the pair.
    for merged_name, source_cols in (
        ('Pressure', ['Work Pressure', 'Academic Pressure']),
        ('Satisfaction', ['Study Satisfaction', 'Job Satisfaction']),
    ):
        data[merged_name] = data[source_cols].max(axis=1)
        data = data.drop(columns=source_cols)

    # Binary flags: 1 for 'Male' / 'Yes', 0 for anything else.
    data['Gender'] = data['Gender'].eq('Male').astype(int)
    for answer_col in ('Family History of Mental Illness',
                       'Have you ever had suicidal thoughts ?'):
        data[answer_col] = data[answer_col].eq('Yes').astype(int)

    # Students get the explicit profession label 'Student'; afterwards the
    # student/professional column is redundant.
    is_student = data['Working Professional or Student'].eq('Student')
    data.loc[is_student, 'Profession'] = 'Student'
    data = data.drop(columns=['Working Professional or Student'])

    # 0.0 = Unhealthy, 1.0 = Moderate, 2.0 = Healthy.
    data['Dietary Habits'] = data['Dietary Habits'].map(diet_levels)

    # Profession dummies are appended before Degree dummies.
    data = _one_hot(data, 'Profession')
    data = _one_hot(data, 'Degree')

    data['Sleep Duration'] = data['Sleep Duration'].map(sleep_hours)

    # Missing CGPA is imputed with the mean of the frame being processed.
    data['CGPA'] = data['CGPA'].fillna(data['CGPA'].mean())

    return data

In [13]:
def load_and_preprocess_data(path='', PCA=False, n_components=0.95):
    """
    Load train.csv/test.csv, preprocess them together, and optionally apply PCA.

    Parameters
    ----------
    path : str
        Directory containing train.csv and test.csv. The empty string (default)
        means the current working directory. A trailing separator is optional.
    PCA : bool
        Whether to apply PCA dimensionality reduction to the features.
    n_components : float or int
        Forwarded to sklearn's PCA when ``PCA`` is true.
        If float (0-1), the fraction of variance to preserve;
        if int (>1), the number of components to keep.

    Returns
    -------
    X_train : DataFrame or numpy array
        Preprocessed training features (numpy array when PCA is applied).
    y_train : Series
        Training target values (the 'Depression' column).
    X_test : DataFrame or numpy array
        Preprocessed test features (numpy array when PCA is applied).
    """
    # Load the raw data. os.path.join accepts the directory with or without a
    # trailing separator and leaves the file name untouched for path=''.
    train = pd.read_csv(os.path.join(path, 'train.csv'))
    test = pd.read_csv(os.path.join(path, 'test.csv'))

    # Separate features and target from training data
    X_train = train.drop('Depression', axis=1)
    y_train = train['Depression']

    # Keep only the training rows that pass the cleaning filters
    X_train_index = preprocess_data_index(X_train)
    X_train = X_train.loc[X_train_index]

    # Remember where the training rows end so the stacked frame can be split again
    len_train = len(X_train)

    # Preprocess train and test together so both get the same one-hot columns
    X = pd.concat([X_train, test], axis=0)
    X = preprocess_data(X)

    # Split back into train and test by position (index labels may repeat after concat)
    X_train = X[:len_train]
    X_test = X[len_train:]

    # Test rows are never dropped; their gaps are filled with the test column means
    X_test = X_test.fillna(X_test.mean())

    # Re-attach the target. concat(axis=1) aligns on the index, so target rows whose
    # features were filtered out get NaN features and are removed by dropna().
    # NOTE(review): this alignment presumably upcasts integer feature columns to
    # float in the returned X_train; confirm before replacing it with a direct
    # y_train.loc[...] selection, since that would change the feature dtypes.
    train = pd.concat([X_train, y_train], axis=1)
    train = train.dropna()

    # Final split of clean training data
    X_train = train.drop('Depression', axis=1)
    y_train = train['Depression']

    # Optionally apply PCA
    if PCA:
        # The boolean parameter `PCA` shadows the module-level sklearn class of the
        # same name inside this function, so the class is imported under an alias.
        from sklearn.decomposition import PCA as PCAModel

        pca = PCAModel(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

    return X_train, y_train, X_test

In [14]:
# Build the module-level feature/target frames with the loader's defaults
# (path='' and PCA=False). SplitData, test_model and Final_submition use these
# names as default argument values, which Python binds when each `def` runs,
# so those cells must be re-run after this one to pick up reloaded data.
X_train, y_train, X_test = load_and_preprocess_data()

In [15]:
def SplitData(X_train = X_train, y_train = y_train):
    """
    Build separate train/validation splits for students and for professionals.

    Rows are assigned to a group by the 'Profession_Student' flag, which is then
    removed from the features. Each group is split 80/20 with random_state=42.
    The student training set is additionally extended with every professional
    row, while the student validation set contains students only.

    Returns
    -------
    tuple
        (X_trainS2, y_trainS2, X_valS, y_valS, X_trainP2, y_trainP2, X_valP, y_valP)
    """
    def _group(flag_value):
        # Rows of one group without the flag column, plus the matching targets.
        features = X_train.loc[X_train['Profession_Student'] == flag_value]
        features = features.drop(columns=['Profession_Student'])
        return features, y_train[features.index]

    student_X, student_y = _group(1)
    professional_X, professional_y = _group(0)

    # Student split; the training part is then augmented with all professionals.
    student_fit_X, X_valS, student_fit_y, y_valS = train_test_split(
        student_X, student_y, test_size=0.2, random_state=42
    )
    X_trainS2 = pd.concat([student_fit_X, professional_X], axis=0)
    y_trainS2 = pd.concat([student_fit_y, professional_y], axis=0)

    # Professional split, using professional rows only.
    X_trainP2, X_valP, y_trainP2, y_valP = train_test_split(
        professional_X, professional_y, test_size=0.2, random_state=42
    )

    return (X_trainS2, y_trainS2, X_valS, y_valS,
            X_trainP2, y_trainP2, X_valP, y_valP)

In [16]:
def test_model(model, X_train = X_train, y_train = y_train, scoring = {'accuracy': accuracy_score, 'balanced_accuracy': balanced_accuracy_score, 'roc_auc': roc_auc_score}):
    """
    Compare a split (student / professional) modelling strategy with a single model.

    Three unfitted clones of ``model`` are trained on SMOTE-resampled data:
    one on the student training set from SplitData, one on the professional
    training set, and one on an 80/20 split of the whole data ("No split").
    Every metric is printed and collected for four cases: student validation
    set, professional validation set, both validation sets concatenated
    ("Combined"), and the no-split validation set.

    Parameters
    ----------
    model : estimator
        Unfitted estimator supporting sklearn's clone/fit/predict. It is cloned,
        so the object passed in is not fitted.
    X_train, y_train : DataFrame, Series
        Features (must contain 'Profession_Student') and targets.
    scoring : dict
        Maps a metric name to a callable ``metric(y_true, y_pred)``. The dict is
        only read, never modified. Every metric receives the hard labels from
        ``predict``; this also applies to roc_auc_score, which is why the recorded
        roc_auc values equal the balanced accuracy values.

    Returns
    -------
    tuple of dict
        (Student_score, Professional_score, Combined_score, No_split_score),
        each mapping metric name to its value.
    """
    X_trainS2, y_trainS2, X_valS, y_valS, X_trainP2, y_trainP2, X_valP, y_valP = SplitData(X_train, y_train)
    X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Oversample the minority class in each training set; validation data is untouched.
    sm = SMOTE(random_state=42)
    X_trainS2, y_trainS2 = sm.fit_resample(X_trainS2, y_trainS2)
    X_trainP2, y_trainP2 = sm.fit_resample(X_trainP2, y_trainP2)
    X_train2, y_train2 = sm.fit_resample(X_train2, y_train2)

    # Independent clones so the three fits cannot influence each other.
    model1 = clone(model)
    model2 = clone(model)
    model3 = clone(model)
    model1.fit(X_trainS2, y_trainS2)
    model2.fit(X_trainP2, y_trainP2)
    model3.fit(X_train2, y_train2)

    # Predictions and the combined arrays do not depend on the metric, so they
    # are computed once here instead of once per metric inside the loop.
    y_predS = model1.predict(X_valS)
    y_predP = model2.predict(X_valP)
    y_pred = model3.predict(X_val)
    y_pred_combined = np.concatenate((y_predS, y_predP))
    y_val_combined = np.concatenate((y_valS, y_valP))

    Student_score = {}
    Professional_score = {}
    Combined_score = {}
    No_split_score = {}
    for name, value in scoring.items():
        tmp1 = value(y_valS, y_predS)
        Student_score[name] = tmp1
        print(f'Student {name} = {tmp1}')
        tmp2 = value(y_valP, y_predP)
        Professional_score[name] = tmp2
        print(f'Professional {name} = {tmp2}')
        tmp3 = value(y_val_combined, y_pred_combined)
        print(f'Combined {name} = {tmp3}')
        Combined_score[name] = tmp3
        tmp4 = value(y_val, y_pred)
        print(f'No split {name} = {tmp4}')
        No_split_score[name] = tmp4

    return Student_score, Professional_score, Combined_score, No_split_score



In [17]:
def Final_submition(model, X_test = X_test, test = pd.read_csv('test.csv')):
    """
    Fit ``model`` and write predictions for the test set to 'submission.csv'.

    Student rows of ``X_test`` are predicted by the model fitted on all training
    rows; professional rows are predicted after the same model object is refitted
    on professional training rows only. Both training sets are SMOTE-resampled
    and the 'Profession_Student' flag is removed from the features.

    Training data comes from the module-level ``X_train`` and ``y_train``; they
    are only read, never reassigned.

    Parameters
    ----------
    model : estimator
        Estimator with fit/predict. It is fitted in place (twice), so after the
        call it holds the professional-only fit.
    X_test : DataFrame
        Preprocessed test features containing 'Profession_Student'.
    test : DataFrame
        Raw test frame; only its 'id' column is used. Its rows must be in the
        same order as ``X_test``. Note that the default is read from 'test.csv'
        once, when this function is defined.

    Returns
    -------
    None
        The result is the file 'submission.csv' with columns 'id' and 'Depression'.
    """
    # Positional row masks: predictions are written back by position, which stays
    # correct whatever index labels X_test carries.
    student_rows = (X_test['Profession_Student'] == 1).to_numpy()
    professional_rows = (X_test['Profession_Student'] == 0).to_numpy()
    testS = X_test.loc[student_rows].drop(['Profession_Student'], axis=1)
    testP = X_test.loc[professional_rows].drop(['Profession_Student'], axis=1)

    # Training sets: all rows for the student predictions, professionals only
    # for the professional predictions.
    X_train_all = X_train.drop(['Profession_Student'], axis=1)
    X_trainP = X_train[X_train['Profession_Student'] == 0]
    X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
    y_trainP = y_train[X_trainP.index]

    sm = SMOTE(random_state=42)
    y_pred = np.zeros(len(X_test))

    # Resampled data is kept in new local names. Assigning to `y_train` here
    # would make it a local variable for the whole function and break the reads
    # of the module-level `y_train` above.
    X_all_resampled, y_all_resampled = sm.fit_resample(X_train_all, y_train)
    model.fit(X_all_resampled, y_all_resampled)
    if student_rows.any():  # predict() cannot be called on an empty frame
        y_pred[student_rows] = model.predict(testS)

    X_prof_resampled, y_prof_resampled = sm.fit_resample(X_trainP, y_trainP)
    model.fit(X_prof_resampled, y_prof_resampled)
    if professional_rows.any():
        y_pred[professional_rows] = model.predict(testP)

    submission = pd.DataFrame({'id': test['id'], 'Depression': y_pred})
    submission.to_csv('submission.csv', index=False)




In [18]:
# Baseline run: logistic regression with max_iter=100 and test_model's default
# metrics (accuracy, balanced_accuracy, roc_auc).
# The recorded output below shows the solver stopping at its iteration limit; the
# message itself suggests raising max_iter or scaling the features.
model = LogisticRegression(max_iter=100)

Student_score, Professional_score, Combined_score, No_split_score = test_model(model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Student accuracy = 0.7913074712643678
Professional accuracy = 0.9102243188601136
Combined accuracy = 0.8850884519019057
No split accuracy = 0.9215701161643004
Student balanced_accuracy = 0.7608741534000131
Professional balanced_accuracy = 0.9001196109594847
Combined balanced_accuracy = 0.9087636724404886
No split balanced_accuracy = 0.9210491613007161
Student roc_auc = 0.7608741534000132
Professional roc_auc = 0.9001196109594846
Combined roc_auc = 0.9087636724404885
No split roc_auc = 0.9210491613007161


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
def create_model_results():
    """
    Evaluate a fixed set of classifiers with test_model and tabulate the scores.

    Each model is scored on the module-level X_train/y_train with five metrics.
    The table holds one row per model, with the student, professional and
    combined value of every metric; the no-split scores returned by test_model
    are not stored. The best model for each metric/category column is printed.

    Returns
    -------
    pandas.DataFrame
        Columns 'Model', 'Hyperparameters', then '<Metric>_<Category>' for every
        metric (Balanced_accuracy, Accuracy, F1, Recall, ROC_AUC) and category
        (Student, Professional, Combined).
    """
    # Candidate models with fixed hyperparameters.
    models = {
        'RandomForest': RandomForestClassifier(
            max_depth=10,
            max_features=None,
            min_samples_leaf=4,
            min_samples_split=10,
            n_estimators=200,
            random_state=42,
        ),
        'LogisticRegression': LogisticRegression(
            C=100,
            penalty='l1',
            solver='liblinear',
            max_iter=10000,
            random_state=42,
        ),
        'KNN': KNeighborsClassifier(
            algorithm='ball_tree',
            metric='manhattan',
            n_neighbors=11,
            weights='uniform',
        ),
        'DecisionTree': DecisionTreeClassifier(
            criterion='entropy',
            max_depth=10,
            min_samples_leaf=4,
            min_samples_split=10,
            random_state=42,
        ),
        'GradientBoosting': GradientBoostingClassifier(
            learning_rate=0.1,
            max_depth=3,
            n_estimators=300,
            subsample=0.8,
            random_state=42,
        ),
        'XGBoost': XGBClassifier(
            colsample_bytree=1.0,
            learning_rate=0.1,
            max_depth=3,
            n_estimators=300,
            subsample=0.8,
            random_state=42,
        ),
        'CatBoost': CatBoostClassifier(
            depth=4,
            iterations=200,
            l2_leaf_reg=5,
            learning_rate=0.2,
            random_state=42,
            verbose=False,
        ),
    }

    # Metric callables handed to test_model, keyed by the name used in its result dicts.
    scoring = {
        'accuracy': accuracy_score,
        'balanced_accuracy': balanced_accuracy_score,
        'f1': f1_score,
        'recall': recall_score,
        'roc_auc': roc_auc_score
    }

    # Column prefix -> key in the score dicts; the order fixes the column order.
    column_prefix_to_key = {
        'Balanced_accuracy': 'balanced_accuracy',
        'Accuracy': 'accuracy',
        'F1': 'f1',
        'Recall': 'recall',
        'ROC_AUC': 'roc_auc',
    }
    categories = ['Student', 'Professional', 'Combined']

    rows = []
    for model_name, model in models.items():
        print(f"Testing {model_name}...")
        student_score, prof_score, combined_score, _no_split_score = test_model(
            model, X_train, y_train, scoring
        )
        scores_by_category = dict(
            zip(categories, (student_score, prof_score, combined_score))
        )

        # One table row per model: identification first, then metric-major scores.
        row = {'Model': model_name, 'Hyperparameters': str(model.get_params())}
        for prefix, key in column_prefix_to_key.items():
            for category in categories:
                row[f"{prefix}_{category}"] = scores_by_category[category][key]
        rows.append(row)

    results_df = pd.DataFrame(rows)

    # Report the top-scoring model for every metric/category column.
    print("\nBest Models for Each Metric:")
    print("-" * 50)

    for metric in column_prefix_to_key:
        print(f"\n{metric}:")
        for category in categories:
            column = f"{metric}_{category}"
            best_idx = results_df[column].idxmax()
            best_model = results_df.at[best_idx, 'Model']
            best_score = results_df.at[best_idx, column]
            print(f"{category}: {best_model} (Score: {best_score:.4f})")

    return results_df

# Run the full model comparison (fits every candidate model three times via test_model).
results_df = create_model_results()

# Plain-text dump of the comparison table.
print(results_df)

# Persist the table without the row index so it can be reloaded without re-fitting.
results_df.to_csv('model_comparison_results.csv', index=False)

In [21]:
# Reload the saved comparison table from disk and show it as the cell output.
results_df = pd.read_csv('model_comparison_results.csv')
results_df


Unnamed: 0,Model,Hyperparameters,Balanced_accuracy_Student,Balanced_accuracy_Professional,Balanced_accuracy_Combined,Accuracy_Student,Accuracy_Professional,Accuracy_Combined,F1_Student,F1_Professional,F1_Combined,Recall_Student,Recall_Professional,Recall_Combined,ROC_AUC_Student,ROC_AUC_Professional,ROC_AUC_Combined
0,RandomForest,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.81614,0.853895,0.899963,0.829921,0.949937,0.924569,0.859891,0.642857,0.79454,0.909262,0.74463,0.862789,0.81614,0.853895,0.899963
1,LogisticRegression,"{'C': 100, 'class_weight': None, 'dual': False...",0.75566,0.899093,0.910628,0.788254,0.909695,0.884026,0.841041,0.543108,0.734878,0.975907,0.887033,0.95082,0.75566,0.899093,0.910628
2,KNN,"{'algorithm': 'ball_tree', 'leaf_size': 30, 'm...",0.729456,0.861442,0.894672,0.761853,0.922162,0.888277,0.82052,0.551953,0.732382,0.948373,0.792363,0.904334,0.729456,0.861442,0.894672
3,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.806333,0.847767,0.892611,0.820223,0.944017,0.91785,0.851813,0.614773,0.778596,0.900188,0.738266,0.85448,0.806333,0.847767,0.892611
4,GradientBoosting,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.835599,0.819615,0.89581,0.844648,0.966641,0.940855,0.86888,0.702958,0.825532,0.896746,0.652347,0.827757,0.835599,0.819615,0.89581
5,XGBoost,"{'objective': 'binary:logistic', 'base_score':...",0.835803,0.824132,0.897332,0.845007,0.966737,0.941007,0.869302,0.706582,0.826524,0.897997,0.661893,0.83135,0.835803,0.824132,0.897332
6,CatBoost,"{'iterations': 200, 'learning_rate': 0.2, 'dep...",0.840087,0.80555,0.892489,0.84824,0.966785,0.941728,0.871326,0.693878,0.825983,0.895181,0.622116,0.8181,0.840087,0.80555,0.892489
