In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import sys
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler






In [45]:

def confusion(y_val, y_pred):
    # Assuming y_val and y_pred are defined
    cm = confusion_matrix(y_val, y_pred)

    # Plotting the confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

def prediction_finale_split(modelS, modelP, X_trainS, y_trainS, X_trainP, y_trainP, X_testS, id_testS, X_testP, id_testP, oversamplingS = False, oversamplingP = False, threshold=0.5):
    if oversamplingS:
        smote = SMOTE()
        X_trainS, y_trainS = smote.fit_resample(X_trainS, y_trainS)
    if oversamplingP:
        smote = SMOTE()
        X_trainP, y_trainP = smote.fit_resample(X_trainP, y_trainP)
    X_testS = X_testS.fillna(X_testS.median())
    X_testP = X_testP.fillna(X_testP.median())
    modelS.fit(X_trainS, y_trainS)
    modelP.fit(X_trainP, y_trainP)
    y_predS = modelS.predict(X_testS)
    y_predP = modelP.predict(X_testP)
    # y_predS = (y_predS > threshold).astype(int)
    # y_predP = (y_predP > threshold).astype(int)
    subS = pd.DataFrame({'id': id_testS, 'Depression': y_predS})
    subP = pd.DataFrame({'id': id_testP, 'Depression': y_predP})

    sub = pd.concat([subS, subP], axis=0)
    sub = sub.sort_values(by='id')
    sub.to_csv('submission.csv', index=False)


def prediction_finale(model, X_train, y_train, X_test, id_test, oversampling = False, threshold=0.5):
    if oversampling:
        smote = SMOTE()
        X_train, y_train = smote.fit_resample(X_train, y_train)

    X_test = X_test.fillna(X_test.median())
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # y_predS = (y_predS > threshold).astype(int)
    # y_predP = (y_predP > threshold).astype(int)
    sub = pd.DataFrame({'id': id_test, 'Depression': y_pred})

    sub = sub.sort_values(by='id')
    sub.to_csv('submission.csv', index=False)
    

def test_model_split(modelS, modelP, X_trainS, y_trainS, X_trainP, y_trainP, oversamplingS = False, oversamplingP = False):
    if oversamplingS:
        smote = SMOTE()
        X_trainS, y_trainS = smote.fit_resample(X_trainS, y_trainS)
    if oversamplingP:
        smote = SMOTE()
        X_trainP, y_trainP = smote.fit_resample(X_trainP, y_trainP)
    X_trainS, X_valS, y_trainS, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
    X_trainP, X_valP, y_trainP, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)
    modelS.fit(X_trainS, y_trainS)
    modelP.fit(X_trainP, y_trainP)
    y_predS = modelS.predict(X_valS)
    y_predP = modelP.predict(X_valP)
    print('Model S')
    print('Accuracy:', accuracy_score(y_valS, y_predS))
    print('Balanced accuracy:', balanced_accuracy_score(y_valS, y_predS))
    print('AUC:', roc_auc_score(y_valS, y_predS))
    print('Model P')
    print('Accuracy:', accuracy_score(y_valP, y_predP))
    print('Balanced accuracy:', balanced_accuracy_score(y_valP, y_predP))
    print('AUC:', roc_auc_score(y_valP, y_predP))
    # confusion(y_valS, y_predS)
    # confusion(y_valP, y_predP)

    y_pred = np.concatenate((y_predS, y_predP))
    y_val = np.concatenate((y_valS, y_valP))
    print('Combined')
    print('Accuracy:', accuracy_score(y_val, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_val, y_pred))
    print('AUC:', roc_auc_score(y_val, y_pred)) 


def test_model(model, X_train, y_train, oversampling = False):
    if oversampling:
        smote = SMOTE()
        X_train, y_train = smote.fit_resample(X_train, y_train)
   
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print('Combined')
    print('Accuracy:', accuracy_score(y_val, y_pred))
    print('Balanced accuracy:', balanced_accuracy_score(y_val, y_pred))
    print('AUC:', roc_auc_score(y_val, y_pred))


    Val = X_val
    Val['label'] = y_val
    Val_S = Val[Val['Profession_Student'] == 1]
    Val_P = Val[Val['Profession_Student'] == 0]
    y_valS = Val_S['label']
    y_valP = Val_P['label']
    X_valS = Val_S.drop(columns=['label'])
    X_valP = Val_P.drop(columns=['label'])

    Pred = X_val
    Pred['label'] = y_pred
    Pred_S = Pred[Pred['Profession_Student'] == 1]
    Pred_P = Pred[Pred['Profession_Student'] == 0]
    y_predS = Pred_S['label']
    y_predP = Pred_P['label']
    X_predS = Pred_S.drop(columns=['label'])
    X_predP = Pred_P.drop(columns=['label'])



    print('Students')
    print('Accuracy:', accuracy_score(y_valS, y_predS))
    print('Balanced accuracy:', balanced_accuracy_score(y_valS, y_predS))
    print('AUC:', roc_auc_score(y_valS, y_predS))
    print('Professionals')
    print('Accuracy:', accuracy_score(y_valP, y_predP))
    print('Balanced accuracy:', balanced_accuracy_score(y_valP, y_predP))
    print('AUC:', roc_auc_score(y_valP, y_predP))
    # confusion(y_valS, y_predS)
    # confusion(y_valP, y_predP)

   
  
    

In [47]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [49]:
def preprocess_all_data(train, test, split=False):


    X_train = train.drop('Depression', axis=1)
    y_train = train['Depression']
    X = pd.concat([X_train, test], axis=0)
    len_train = len(X_train)
    len_test = len(test)
    len_X = len(X)
    print('len_train:', len_train)
    print('len_test:', len_test)
    print('len_X:', len_X)


    if split:




        X = X.drop(['Name'], axis=1)
        X['Pressure'] = X[['Work Pressure', 'Academic Pressure']].max(axis=1)
        X = X.drop(['Work Pressure', 'Academic Pressure'], axis=1)
        X['Gender'] = (X['Gender'] == 'Male').astype(int)
        X.loc[X['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
        X['Satisfaction'] = X[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
        X = X.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
        X['Family History of Mental Illness'] = (X['Family History of Mental Illness'] == 'Yes').astype(int)
        X['Have you ever had suicidal thoughts ?'] = (X['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)


        X = X.drop(['City'], axis=1)


        # v = X["City"].value_counts() 
        # tmp = X[X['City'].isin(v.index[v.gt(10)])]
        # tmp = pd.get_dummies(tmp, columns=['City'])
        # tmp_cols = [col for col in tmp.columns if col.startswith('City_')]
        # X = pd.get_dummies(X, columns=['City'])
        # City_cols = [col for col in X.columns if col.startswith('City_')]
        # X[City_cols] = X[City_cols].astype(int)
        # X = X.drop(tmp_cols, axis=1)



        diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
        X['Dietary Habits'] = X['Dietary Habits'].map(diet_mapping)
        v = X["Profession"].value_counts() 

        X = pd.get_dummies(X, columns=['Profession'])
        profession_cols = [col for col in X.columns if col.startswith('Profession_')]
        X[profession_cols] = X[profession_cols].astype(int)
        X = X.drop(['Working Professional or Student'], axis=1)
        v = X["Degree"].value_counts() 
        X = pd.get_dummies(X, columns=['Degree'])
        degree_cols = [col for col in X.columns if col.startswith('Degree_')]
        X[degree_cols] = X[degree_cols].astype(int)
        dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
        X['Sleep Duration'] = X['Sleep Duration'].map(dict_sleep)
        X['CGPA'] = X['CGPA'].fillna(X['CGPA'].mean())

        X_train = X[:len_train]
        X_test = X[len_train:]

        X_trainS = X_train[X_train['Profession_Student'] == 1]
        X_trainP = X_train[X_train['Profession_Student'] == 0]
        X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
        X_trainP = X_trainP.drop(['CGPA'], axis=1)
        X_trainS = X_trainS.drop(profession_cols, axis=1)

        X_testS = X_test[X_test['Profession_Student'] == 1]
        X_testP = X_test[X_test['Profession_Student'] == 0]
        X_testP = X_testP.drop(['Profession_Student'], axis=1)
        X_testP = X_testP.drop(['CGPA'], axis=1)
        X_testS = X_testS.drop(profession_cols, axis=1)
        

        y_trainS = y_train[X_trainS.index]
        y_trainP = y_train[X_trainP.index]

        id_testS = X_testS['id']
        id_testP = X_testP['id']
        X_trainS = X_trainS.drop(['id'], axis=1)
        X_trainP = X_trainP.drop(['id'], axis=1)
        X_testS = X_testS.drop(['id'], axis=1)
        X_testP = X_testP.drop(['id'], axis=1)

        train_S = pd.concat([X_trainS, y_trainS], axis=1)
        train_S = train_S.dropna()
        X_trainS = train_S.drop('Depression', axis=1)
        y_trainS = train_S['Depression']

        train_P = pd.concat([X_trainP, y_trainP], axis=1)
        train_P = train_P.dropna()
        X_trainP = train_P.drop('Depression', axis=1)
        y_trainP = train_P['Depression']
        

        


        return X_trainS, y_trainS, X_trainP, y_trainP, X_testS, id_testS, X_testP, id_testP



    else :
        X = X.drop(['Name'], axis=1)
        X['Pressure'] = X[['Work Pressure', 'Academic Pressure']].max(axis=1)
        X = X.drop(['Work Pressure', 'Academic Pressure'], axis=1)
        X['Gender'] = (X['Gender'] == 'Male').astype(int)
        X.loc[X['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
        X['Satisfaction'] = X[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
        X = X.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
        X['Family History of Mental Illness'] = (X['Family History of Mental Illness'] == 'Yes').astype(int)
        X['Have you ever had suicidal thoughts ?'] = (X['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)


        X = X.drop(['City'], axis=1)


        # v = X["City"].value_counts() 
        # tmp = X[X['City'].isin(v.index[v.gt(10)])]
        # tmp = pd.get_dummies(tmp, columns=['City'])
        # tmp_cols = [col for col in tmp.columns if col.startswith('City_')]
        # X = pd.get_dummies(X, columns=['City'])
        # City_cols = [col for col in X.columns if col.startswith('City_')]
        # X[City_cols] = X[City_cols].astype(int)
        # X = X.drop(tmp_cols, axis=1)



        diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
        X['Dietary Habits'] = X['Dietary Habits'].map(diet_mapping)
        v = X["Profession"].value_counts() 

        X = pd.get_dummies(X, columns=['Profession'])
        profession_cols = [col for col in X.columns if col.startswith('Profession_')]
        X[profession_cols] = X[profession_cols].astype(int)
        X = X.drop(['Working Professional or Student'], axis=1)
        v = X["Degree"].value_counts() 
        X = pd.get_dummies(X, columns=['Degree'])
        degree_cols = [col for col in X.columns if col.startswith('Degree_')]
        X[degree_cols] = X[degree_cols].astype(int)
        dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
        X['Sleep Duration'] = X['Sleep Duration'].map(dict_sleep)
        X['CGPA'] = X['CGPA'].fillna(X['CGPA'].mean())

        X_train = X[:len_train]
        X_test = X[len_train:]
        X_train = X_train.drop(['id'], axis=1)
        id_test = X_test['id']
        X_test = X_test.drop(['id'], axis=1)

        train = pd.concat([X_train, y_train], axis=1)
        train = train.dropna()
        X_train = train.drop('Depression', axis=1)
        y_train = train['Depression']


        return X_train, X_test, y_train, id_test
    

X_train, X_test, y_train, id_test = preprocess_all_data(train, test, split=False)

len_train: 140700
len_test: 93800
len_X: 234500


In [31]:
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

# X_train = train.drop('Depression', axis=1)
# y_train = train['Depression']
# X = pd.concat([X_train, test], axis=0)
# len_train = len(X_train)
# len_test = len(test)
# len_X = len(X)
# print('len_train:', len_train)
# print('len_test:', len_test)
# print('len_X:', len_X)



# X = X.drop(['Name'], axis=1)
# X['Pressure'] = X[['Work Pressure', 'Academic Pressure']].max(axis=1)
# X = X.drop(['Work Pressure', 'Academic Pressure'], axis=1)
# X['Gender'] = (X['Gender'] == 'Male').astype(int)
# X.loc[X['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
# X['Satisfaction'] = X[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
# X = X.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
# X['Family History of Mental Illness'] = (X['Family History of Mental Illness'] == 'Yes').astype(int)
# X['Have you ever had suicidal thoughts ?'] = (X['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)


# X = X.drop(['City'], axis=1)


# # v = X["City"].value_counts() 
# # tmp = X[X['City'].isin(v.index[v.gt(10)])]
# # tmp = pd.get_dummies(tmp, columns=['City'])
# # tmp_cols = [col for col in tmp.columns if col.startswith('City_')]
# # X = pd.get_dummies(X, columns=['City'])
# # City_cols = [col for col in X.columns if col.startswith('City_')]
# # X[City_cols] = X[City_cols].astype(int)
# # X = X.drop(tmp_cols, axis=1)



# diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
# X['Dietary Habits'] = X['Dietary Habits'].map(diet_mapping)
# v = X["Profession"].value_counts() 

# X = pd.get_dummies(X, columns=['Profession'])
# profession_cols = [col for col in X.columns if col.startswith('Profession_')]
# X[profession_cols] = X[profession_cols].astype(int)
# X = X.drop(['Working Professional or Student'], axis=1)
# v = X["Degree"].value_counts() 
# X = pd.get_dummies(X, columns=['Degree'])
# degree_cols = [col for col in X.columns if col.startswith('Degree_')]
# X[degree_cols] = X[degree_cols].astype(int)
# dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
# X['Sleep Duration'] = X['Sleep Duration'].map(dict_sleep)
# X['CGPA'] = X['CGPA'].fillna(X['CGPA'].mean())

# X_train = X[:len_train]
# X_test = X[len_train:]

# X_trainS = X_train[X_train['Profession_Student'] == 1]
# X_trainP = X_train[X_train['Profession_Student'] == 0]
# X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
# X_trainP = X_trainP.drop(['CGPA'], axis=1)
# X_trainS = X_trainS.drop(profession_cols, axis=1)

# X_testS = X_test[X_test['Profession_Student'] == 1]
# X_testP = X_test[X_test['Profession_Student'] == 0]
# X_testP = X_testP.drop(['Profession_Student'], axis=1)
# X_testP = X_testP.drop(['CGPA'], axis=1)
# X_testS = X_testS.drop(profession_cols, axis=1)

# X_trainSS = X_train.drop(profession_cols, axis=1)
# X_trainSS = X_trainSS.drop(['id'], axis=1)
# X_trainPP = X_train.drop(['Profession_Student'], axis=1)
# X_trainPP = X_trainPP.drop(['CGPA'], axis=1)
# X_trainPP = X_trainPP.drop(['id'], axis=1)


# y_trainS = y_train[X_trainS.index]
# y_trainP = y_train[X_trainP.index]

# id_testS = X_testS['id']
# id_testP = X_testP['id']
# X_trainS = X_trainS.drop(['id'], axis=1)
# X_trainP = X_trainP.drop(['id'], axis=1)
# X_testS = X_testS.drop(['id'], axis=1)
# X_testP = X_testP.drop(['id'], axis=1)

# train_S = pd.concat([X_trainS, y_trainS], axis=1)
# train_S = train_S.dropna()
# X_trainS = train_S.drop('Depression', axis=1)
# y_trainS = train_S['Depression']

# train_P = pd.concat([X_trainP, y_trainP], axis=1)
# train_P = train_P.dropna()
# X_trainP = train_P.drop('Depression', axis=1)
# y_trainP = train_P['Depression']

# trainSS = pd.concat([X_trainSS, y_train], axis=1)
# trainSS = trainSS.dropna()
# X_trainSS = trainSS.drop('Depression', axis=1)
# y_trainSS = trainSS['Depression']

# trainPP = pd.concat([X_trainPP, y_train], axis=1)
# trainPP = trainPP.dropna()
# X_trainPP = trainPP.drop('Depression', axis=1)
# y_trainPP = trainPP['Depression']


        

In [43]:
model = LogisticRegression(C=1, max_iter=100, penalty ='l1', solver = 'liblinear')
# Sans Oversampling
print ('Sans Oversampling')
test_model(model, X_train, y_train, oversampling = False)

# Avec Oversampling
print ('Avec Oversampling')
test_model(model, X_train, y_train, oversampling = True)

Sans Oversampling
Combined
Accuracy: 0.9371977240398293
Balanced accuracy: 0.887941334559768
AUC: 0.887941334559768
Students
Accuracy: 0.8467986445514536
Balanced accuracy: 0.8379361565243717
AUC: 0.8379361565243717
Professionals
Accuracy: 0.9597121663039133
Balanced accuracy: 0.8304089976680162
AUC: 0.8304089976680162
Avec Oversampling
Combined
Accuracy: 0.950722433460076
Balanced accuracy: 0.9506666359896718
AUC: 0.9506666359896718


  Val['label'] = y_val


Students
Accuracy: 0.9334065799914546
Balanced accuracy: 0.8363907484235522
AUC: 0.8363907484235522
Professionals
Accuracy: 0.9602928277444167
Balanced accuracy: 0.9523258073607664
AUC: 0.9523258073607664


In [50]:
modelS = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)
modelP = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)
# Sans Oversampling
X_trainS, y_trainS, X_trainP, y_trainP, X_testS, id_testS, X_testP, id_testP = preprocess_all_data(train, test, split=True)
print ('Sans Oversampling')
test_model_split(modelS, modelP, X_trainS, y_trainS, X_trainP, y_trainP, oversamplingS = False, oversamplingP = False)

# Avec Oversampling
print ('Avec Oversampling')
test_model_split(modelS, modelP, X_trainS, y_trainS, X_trainP, y_trainP, oversamplingS = True, oversamplingP = True)

len_train: 140700
len_test: 93800
len_X: 234500
Sans Oversampling
Model S
Accuracy: 0.846457399103139
Balanced accuracy: 0.8390055722019462
AUC: 0.8390055722019462
Model P
Accuracy: 0.9607025636476537
Balanced accuracy: 0.8371604536198967
AUC: 0.8371604536198967
Combined
Accuracy: 0.9380534120408236
Balanced accuracy: 0.8893272431654513
AUC: 0.8893272431654514
Avec Oversampling
Model S
Accuracy: 0.8775854144323579
Balanced accuracy: 0.8775941450604887
AUC: 0.8775941450604887
Model P
Accuracy: 0.9769334814743249
Balanced accuracy: 0.976939127924269
AUC: 0.9769391279242688
Combined
Accuracy: 0.9634042020488639
Balanced accuracy: 0.9634046717620951
AUC: 0.9634046717620951


In [26]:
modelS = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)
modelP = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)


X_trainS, X_valS, y_trainS, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
X_train_filteredS = X_trainSS[~X_trainSS.index.isin(X_valS.index)]
y_train_filteredS = y_trainSS[~y_trainSS.index.isin(y_valS.index)]
X_trainP, X_valP, y_trainP, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)
modelS.fit(X_train_filteredS, y_train_filteredS)
modelP.fit(X_trainP, y_trainP)
y_predS = modelS.predict(X_valS)
y_predP = modelP.predict(X_valP)
print('Model S')
print('Accuracy:', accuracy_score(y_valS, y_predS))
print('Balanced accuracy:', balanced_accuracy_score(y_valS, y_predS))
print('AUC:', roc_auc_score(y_valS, y_predS))
print('Model P')
print('Accuracy:', accuracy_score(y_valP, y_predP))
print('Balanced accuracy:', balanced_accuracy_score(y_valP, y_predP))
print('AUC:', roc_auc_score(y_valP, y_predP))
# confusion(y_valS, y_predS)
# confusion(y_valP, y_predP)

y_pred = np.concatenate((y_predS, y_predP))
y_val = np.concatenate((y_valS, y_valP))
print('Combined')
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Balanced accuracy:', balanced_accuracy_score(y_val, y_pred))
print('AUC:', roc_auc_score(y_val, y_pred)) 


Model S
Accuracy: 0.8466367713004485
Balanced accuracy: 0.839814853725404
AUC: 0.839814853725404
Model P
Accuracy: 0.9607025636476537
Balanced accuracy: 0.8371604536198967
AUC: 0.8371604536198967
Combined
Accuracy: 0.9380889726538886
Balanced accuracy: 0.8885924092024939
AUC: 0.8885924092024939


In [10]:
# Best Gradient Boosting parameters: {'learning_rate': 0.3, 'max_depth': 3, 'min_samples_split': 4, 'n_estimators': 200, 'subsample': 1.0}

gbS = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)
gbP = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)

test_model(gbS, gbP, X_trainS, y_trainS, X_trainP, y_trainP, oversamplingS=False, oversamplingP=False)

Model S
Accuracy: 0.9380867709815078
Balanced accuracy: 0.8906360702578742
AUC: 0.8906360702578742
Model P
Accuracy: 0.9607025636476537
Balanced accuracy: 0.8371604536198967
AUC: 0.8371604536198967
Combined
Accuracy: 0.948150633560968
Balanced accuracy: 0.8784134157368149
AUC: 0.8784134157368149


In [18]:
gbS = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)
gbP = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0)
# Droping Professions
prediction_finale(gbS, gbP, X_trainS, y_trainS, X_trainP, y_trainP, X_testS, id_testS, X_testP, id_testP, oversamplingS=False, oversamplingP=False, threshold=0.5)

In [46]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')




X_train = train.drop('Depression', axis=1)
y_train = train['Depression']
X = pd.concat([X_train, test], axis=0)
len_train = len(X_train)
len_test = len(test)
len_X = len(X)


#X = X.drop(['id', 'Name'], axis=1)
X = X.drop(['Name'], axis=1)
X['Pressure'] = X[['Work Pressure', 'Academic Pressure']].max(axis=1)
X = X.drop(['Work Pressure', 'Academic Pressure'], axis=1)
X['Gender'] = (X['Gender'] == 'Male').astype(int)
X.loc[X['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
X['Satisfaction'] = X[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
X = X.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
X['Family History of Mental Illness'] = (X['Family History of Mental Illness'] == 'Yes').astype(int)
X['Have you ever had suicidal thoughts ?'] = (X['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)


X = X.drop(['City'], axis=1)


# v = X["City"].value_counts() 
# tmp = X[X['City'].isin(v.index[v.gt(50)])]
# tmp = pd.get_dummies(tmp, columns=['City'])
# tmp_cols = [col for col in tmp.columns if col.startswith('City_')]
# X = pd.get_dummies(X, columns=['City'])
# City_cols = [col for col in X.columns if col.startswith('City_')]
# X[City_cols] = X[City_cols].astype(int)
# X = X.drop(tmp_cols, axis=1)



diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
X['Dietary Habits'] = X['Dietary Habits'].map(diet_mapping)
v = X["Profession"].value_counts() 

X = pd.get_dummies(X, columns=['Profession'])
profession_cols = [col for col in X.columns if col.startswith('Profession_')]
X[profession_cols] = X[profession_cols].astype(int)
prof_counts = X[profession_cols].sum()
remove_cols = prof_counts[prof_counts <= 20].index

X = X.drop(remove_cols, axis=1)



X = X.drop(['Working Professional or Student'], axis=1)
v = X["Degree"].value_counts() 
X = pd.get_dummies(X, columns=['Degree'])
degree_cols = [col for col in X.columns if col.startswith('Degree_')]
X[degree_cols] = X[degree_cols].astype(int)

degree_counts = X[degree_cols].sum()
remove_cols = degree_counts[degree_counts <= 20].index

X = X.drop(remove_cols, axis=1)

dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
X['Sleep Duration'] = X['Sleep Duration'].map(dict_sleep)

# todo after the split
X['CGPA'] = X['CGPA'].fillna(X['CGPA'].median())


scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

X_train = X[:len_train]
X_test = X[len_train:]
X_train = X_train.drop(['id'], axis=1)
id_test = X_test['id']
X_test = X_test.drop(['id'], axis=1)

train = pd.concat([X_train, y_train], axis=1)
train = train.dropna()
X_train = train.drop('Depression', axis=1)
y_train = train['Depression']


X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

gb = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0, random_state=42)

gb.fit(X_train2, y_train2)

y_pred = gb.predict(X_val)

print('Accuracy:', accuracy_score(y_val, y_pred))


Accuracy: 0.9385490753911807


In [44]:
# get only the Gender = 1
X_trainM = X_train[X_train['Gender'] == 1]
X_trainF = X_train[X_train['Gender'] == 0]
y_trainM = y_train[X_trainM.index]
y_trainF = y_train[X_trainF.index]

X_trainM, X_valM, y_trainM, y_valM = train_test_split(X_trainM, y_trainM, test_size=0.2, random_state=42)
X_trainF, X_valF, y_trainF, y_valF = train_test_split(X_trainF, y_trainF, test_size=0.2, random_state=42)

gb.fit(X_trainM, y_trainM)
y_predM = gb.predict(X_valM)
print('Accuracy:', accuracy_score(y_valM, y_predM))

gb.fit(X_trainF, y_trainF)
y_predF = gb.predict(X_valF)
print('Accuracy:', accuracy_score(y_valF, y_predF))


Accuracy: 0.9388361428663695
Accuracy: 0.9393052148452956


In [4]:
def preprocess_data(train):
    train = train.drop(['id', 'Name'], axis=1)
    train['Pressure'] = train[['Work Pressure', 'Academic Pressure']].max(axis=1)
    train = train.drop(['Work Pressure', 'Academic Pressure'], axis=1)
    # encode gender in 1 and 0 (1 for male and 0 for Female)
    train['Gender'] = (train['Gender'] == 'Male').astype(int)
    # For Working Status (Student = 0, Working Professional = 1)
    # train['Working Professional or Student'] = (train['Working Professional or Student'] == 'Working Professional').astype(int)
    train.loc[train['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
    train['Satisfaction'] = train[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
    train = train.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
    train['Family History of Mental Illness'] = (train['Family History of Mental Illness'] == 'Yes').astype(int)
    train['Have you ever had suicidal thoughts ?'] = (train['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)
    # we can either drop City or encode it in one hot encoding
    # one hot encoding
    #train = pd.get_dummies(train, columns=['City']).astype(int)
    # drop city
    train = train.drop(['City'], axis=1)
    diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
    train = train[train['Dietary Habits'].isin(diet_mapping.keys())]
    train['Dietary Habits'] = train['Dietary Habits'].map(diet_mapping)
    v = train["Profession"].value_counts() 
    # keep only the profession with more than 10 samples
    train = train[train['Profession'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Profession'])
    profession_cols = [col for col in train.columns if col.startswith('Profession_')]
    train[profession_cols] = train[profession_cols].astype(int)
    train = train.drop(['Working Professional or Student'], axis=1)
    v = train["Degree"].value_counts() 
    train = train[train['Degree'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Degree'])
    degree_cols = [col for col in train.columns if col.startswith('Degree_')]
    train[degree_cols] = train[degree_cols].astype(int)
    dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
    train = train[train['Sleep Duration'].isin(dict_sleep.keys())]
    train['Sleep Duration'] = train['Sleep Duration'].map(dict_sleep)
    train['CGPA'] = train['CGPA'].fillna(train['CGPA'].mean())
    train = train.dropna()
    return train

train = pd.read_csv('train.csv')
train = preprocess_data(train)
X_train = train.drop('Depression', axis=1)
y_train = train['Depression']

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

gb = GradientBoostingClassifier(learning_rate=0.3, max_depth=3, min_samples_split=4, n_estimators=200, subsample=1.0, random_state=42)

gb.fit(X_train2, y_train2)

y_pred = gb.predict(X_val)

print('Accuracy:', accuracy_score(y_val, y_pred))


Accuracy: 0.9421835851491914
