In [88]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import sys
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostClassifier






In [24]:
def preprocess_data(train):
    train = train.drop(['id', 'Name'], axis=1)
    train['Pressure'] = train[['Work Pressure', 'Academic Pressure']].max(axis=1)
    train = train.drop(['Work Pressure', 'Academic Pressure'], axis=1)
    # encode gender in 1 and 0 (1 for male and 0 for Female)
    train['Gender'] = (train['Gender'] == 'Male').astype(int)
    # For Working Status (Student = 0, Working Professional = 1)
    # train['Working Professional or Student'] = (train['Working Professional or Student'] == 'Working Professional').astype(int)
    train.loc[train['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
    train['Satisfaction'] = train[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
    train = train.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
    train['Family History of Mental Illness'] = (train['Family History of Mental Illness'] == 'Yes').astype(int)
    train['Have you ever had suicidal thoughts ?'] = (train['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)
    # we can either drop City or encode it in one hot encoding
    # one hot encoding
    #train = pd.get_dummies(train, columns=['City']).astype(int)
    # drop city
    train = train.drop(['City'], axis=1)
    diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
    #todo train = train[train['Dietary Habits'].isin(diet_mapping.keys())]
    train['Dietary Habits'] = train['Dietary Habits'].map(diet_mapping)
    v = train["Profession"].value_counts() 
    # keep only the profession with more than 10 samples
    #todo train = train[train['Profession'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Profession'])
    profession_cols = [col for col in train.columns if col.startswith('Profession_')]
    train[profession_cols] = train[profession_cols].astype(int)
    train = train.drop(['Working Professional or Student'], axis=1)
    v = train["Degree"].value_counts() 
    #todo train = train[train['Degree'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Degree'])
    degree_cols = [col for col in train.columns if col.startswith('Degree_')]
    train[degree_cols] = train[degree_cols].astype(int)
    dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
    #todo train = train[train['Sleep Duration'].isin(dict_sleep.keys())]
    train['Sleep Duration'] = train['Sleep Duration'].map(dict_sleep)
    train['CGPA'] = train['CGPA'].fillna(train['CGPA'].mean())
    #train = train.dropna()
    return train




In [25]:
def preprocess_data_index(train):
    train = train.drop(['id', 'Name'], axis=1)
    train['Pressure'] = train[['Work Pressure', 'Academic Pressure']].max(axis=1)
    train = train.drop(['Work Pressure', 'Academic Pressure'], axis=1)
    # encode gender in 1 and 0 (1 for male and 0 for Female)
    train['Gender'] = (train['Gender'] == 'Male').astype(int)
    # For Working Status (Student = 0, Working Professional = 1)
    # train['Working Professional or Student'] = (train['Working Professional or Student'] == 'Working Professional').astype(int)
    train.loc[train['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
    train['Satisfaction'] = train[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
    train = train.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
    train['Family History of Mental Illness'] = (train['Family History of Mental Illness'] == 'Yes').astype(int)
    train['Have you ever had suicidal thoughts ?'] = (train['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)
    # we can either drop City or encode it in one hot encoding
    # one hot encoding
    #train = pd.get_dummies(train, columns=['City']).astype(int)
    # drop city
    train = train.drop(['City'], axis=1)
    diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
    train = train[train['Dietary Habits'].isin(diet_mapping.keys())]
    train['Dietary Habits'] = train['Dietary Habits'].map(diet_mapping)
    v = train["Profession"].value_counts() 
    # keep only the profession with more than 10 samples
    train = train[train['Profession'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Profession'])
    profession_cols = [col for col in train.columns if col.startswith('Profession_')]
    train[profession_cols] = train[profession_cols].astype(int)
    train = train.drop(['Working Professional or Student'], axis=1)
    v = train["Degree"].value_counts() 
    train = train[train['Degree'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Degree'])
    degree_cols = [col for col in train.columns if col.startswith('Degree_')]
    train[degree_cols] = train[degree_cols].astype(int)
    dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
    train = train[train['Sleep Duration'].isin(dict_sleep.keys())]
    train['Sleep Duration'] = train['Sleep Duration'].map(dict_sleep)
    train['CGPA'] = train['CGPA'].fillna(train['CGPA'].mean())
    train = train.dropna()
    return train.index




In [103]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X_train = train.drop('Depression', axis=1)
X_train_index = preprocess_data_index(X_train)
X_train = X_train.loc[X_train_index]
y_train = train['Depression']
len_train = len(X_train)
len_test = len(test)
print(len_train)
print(len_test)
X = pd.concat([X_train, test], axis=0)
print(len(X))
X = preprocess_data(X)
print(len(X))

X_train = X[:len_train]
X_test = X[len_train:]
X_test = X_test.fillna(X_test.mean())


train = pd.concat([X_train, y_train], axis=1)
train = train.dropna()
X_train = train.drop('Depression', axis=1)
y_train = train['Depression']



131707
93800
225507
225507


In [94]:
# We do a min max scaling of the data for the columns that are not one hot encoded
scaler = MinMaxScaler()
columns = ['Age', 'CGPA', 'Pressure', 'Satisfaction', 'Sleep Duration']
X_train[columns] = scaler.fit_transform(X_train[columns])

In [95]:


X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model = catboost = CatBoostClassifier(iterations=200, depth=5, learning_rate=0.1, loss_function='Logloss', verbose=100, l2_leaf_reg= 1)

model.fit(X_train2, y_train2)

y_pred = model.predict(X_val)

print('Accuracy:', accuracy_score(y_val, y_pred))

0:	learn: 0.5346818	total: 43.1ms	remaining: 8.58s
100:	learn: 0.1394388	total: 799ms	remaining: 783ms
199:	learn: 0.1352191	total: 1.54s	remaining: 0us
Accuracy: 0.9426011692354415


In [106]:
# XGBClassifier :  {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
xgb = XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)
# catboost {'depth': 4, 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.2}
catboost = CatBoostClassifier(iterations=200, depth=4, learning_rate=0.2, loss_function='Logloss', verbose=100, l2_leaf_reg= 5)
# GradientBoostingClassifier:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
gb = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)
# Best parameters for LogisticRegression:  {'C': 100, 'l1_ratio': None, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
lr = LogisticRegression(C=100, l1_ratio=None, max_iter=10000, penalty='l1', solver='liblinear')

estimators = [('xgb', xgb), ('catboost', catboost), ('gb', gb), ('lr', lr)]


vote = VotingClassifier(estimators=estimators, voting='hard')

# vote.fit(X_train2, y_train2)

# y_pred = vote.predict(X_val)


# print('Accuracy:', accuracy_score(y_val, y_pred))

In [98]:
X_trainM = X_train[X_train['Gender'] == 1]
X_trainF = X_train[X_train['Gender'] == 0]
y_trainM = y_train[X_trainM.index]
y_trainF = y_train[X_trainF.index]
model = CatBoostClassifier(iterations=200, depth=4, learning_rate=0.2, loss_function='Logloss', verbose=100, l2_leaf_reg= 5)
X_trainM2, X_valM, y_trainM2, y_valM = train_test_split(X_trainM, y_trainM, test_size=0.2, random_state=42)
X_trainF2, X_valF, y_trainF2, y_valF = train_test_split(X_trainF, y_trainF, test_size=0.2, random_state=42)
X_trainM2 = pd.concat([X_trainM2, X_trainF])
y_trainM2 = pd.concat([y_trainM2, y_trainF])
X_trainF2 = pd.concat([X_trainF2, X_trainM])
y_trainF2 = pd.concat([y_trainF2, y_trainM])
model.fit(X_trainM2, y_trainM2)
y_predM = model.predict(X_valM)
print('Accuracy for male:', accuracy_score(y_valM, y_predM))
model.fit(X_trainF2, y_trainF2)
y_predF = model.predict(X_valF)
print('Accuracy for female:', accuracy_score(y_valF, y_predF))

# concatenate the predictions
y_pred = np.concatenate([y_predM, y_predF])
y_val = np.concatenate([y_valM, y_valF])

print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model  = CatBoostClassifier(iterations=200, depth=5, learning_rate=0.1, loss_function='Logloss', verbose=100, l2_leaf_reg= 1)

model.fit(X_train2, y_train2)

y_pred = model.predict(X_val)

print('Accuracy:', accuracy_score(y_val, y_pred))

0:	learn: 0.4184337	total: 18.5ms	remaining: 3.68s
100:	learn: 0.1392964	total: 743ms	remaining: 729ms
199:	learn: 0.1358195	total: 1.45s	remaining: 0us
Accuracy for male: 0.9435511696915326
0:	learn: 0.4175761	total: 16.9ms	remaining: 3.36s
100:	learn: 0.1385169	total: 715ms	remaining: 701ms
199:	learn: 0.1351267	total: 1.42s	remaining: 0us
Accuracy for female: 0.9408488735127837
Accuracy combined: 0.9423354339078278
0:	learn: 0.5346818	total: 12.2ms	remaining: 2.42s
100:	learn: 0.1394388	total: 732ms	remaining: 717ms
199:	learn: 0.1352191	total: 1.44s	remaining: 0us
Accuracy: 0.9426011692354415


In [64]:
X_trainM = X_train[X_train['Gender'] == 1]
X_trainF = X_train[X_train['Gender'] == 0]
y_trainM = y_train[X_trainM.index]
y_trainF = y_train[X_trainF.index]
model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)
X_trainM2, X_valM, y_trainM2, y_valM = train_test_split(X_trainM, y_trainM, test_size=0.2, random_state=42)
X_trainF2, X_valF, y_trainF2, y_valF = train_test_split(X_trainF, y_trainF, test_size=0.2, random_state=42)
X_trainM2 = pd.concat([X_trainM2, X_trainF])
y_trainM2 = pd.concat([y_trainM2, y_trainF])
X_trainF2 = pd.concat([X_trainF2, X_trainM])
y_trainF2 = pd.concat([y_trainF2, y_trainM])
model.fit(X_trainM2, y_trainM2)
y_predM = model.predict(X_valM)
print('Accuracy for male:', accuracy_score(y_valM, y_predM))
model.fit(X_trainF2, y_trainF2)
y_predF = model.predict(X_valF)
print('Accuracy for female:', accuracy_score(y_valF, y_predF))

# concatenate the predictions
y_pred = np.concatenate([y_predM, y_predF])
y_val = np.concatenate([y_valM, y_valF])

print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)

model.fit(X_train2, y_train2)

y_pred = model.predict(X_val)

print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy for male: 0.943206127941481
Accuracy for female: 0.9395831575394481
Accuracy combined: 0.9415761901146458
Accuracy: 0.9423354339078278


In [65]:
X_trainM = X_train[X_train['Gender'] == 1]
X_trainF = X_train[X_train['Gender'] == 0]
y_trainM = y_train[X_trainM.index]
y_trainF = y_train[X_trainF.index]
model = XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)
X_trainM2, X_valM, y_trainM2, y_valM = train_test_split(X_trainM, y_trainM, test_size=0.2, random_state=42)
X_trainF2, X_valF, y_trainF2, y_valF = train_test_split(X_trainF, y_trainF, test_size=0.2, random_state=42)
X_trainM2 = pd.concat([X_trainM2, X_trainF])
y_trainM2 = pd.concat([y_trainM2, y_trainF])
X_trainF2 = pd.concat([X_trainF2, X_trainM])
y_trainF2 = pd.concat([y_trainF2, y_trainM])
model.fit(X_trainM2, y_trainM2)
y_predM = model.predict(X_valM)
print('Accuracy for male:', accuracy_score(y_valM, y_predM))
model.fit(X_trainF2, y_trainF2)
y_predF = model.predict(X_valF)
print('Accuracy for female:', accuracy_score(y_valF, y_predF))

# concatenate the predictions
y_pred = np.concatenate([y_predM, y_predF])
y_val = np.concatenate([y_valM, y_valF])

print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model = XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)

model.fit(X_train2, y_train2)

y_pred = model.predict(X_val)   

print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy for male: 0.9440342281416051
Accuracy for female: 0.9411863977723399
Accuracy combined: 0.9427530179940778
Accuracy: 0.9421835851491914


In [102]:
X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

X_trainS2, X_valS, y_trainS2, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
X_trainS2 = pd.concat([X_trainS2, X_trainP], axis=0)
y_trainS2 = pd.concat([y_trainS2, y_trainP], axis=0)
X_trainP2, X_valP, y_trainP2, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)
X_trainP2 = pd.concat([X_trainP2, X_trainS], axis=0)
y_trainP2 = pd.concat([y_trainP2, y_trainS], axis=0)

model = XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)


model.fit(X_trainS, y_trainS)
y_predS = model.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
model.fit(X_trainP, y_trainP)
y_predP = model.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
model.fit(X_train2, y_train2)
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy for Students: 0.8573994252873564
Accuracy for Pro: 0.9688071627996534
Accuracy combined: 0.9452585225115785
Accuracy: 0.9421835851491914


In [109]:
X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

X_trainS2, X_valS, y_trainS2, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
X_trainS2 = pd.concat([X_trainS2, X_trainP], axis=0)
y_trainS2 = pd.concat([y_trainS2, y_trainP], axis=0)
X_trainP2, X_valP, y_trainP2, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)
X_trainP2 = pd.concat([X_trainP2, X_trainS], axis=0)
y_trainP2 = pd.concat([y_trainP2, y_trainS], axis=0)



print("Hard Voting")
# model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)
vote = VotingClassifier(estimators=estimators, voting='hard')




vote.fit(X_trainS, y_trainS)
y_predS = vote.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
vote.fit(X_trainP, y_trainP)
y_predP = vote.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))

print("Soft Voting")
# model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)
vote = VotingClassifier(estimators=estimators, voting='soft')




vote.fit(X_trainS, y_trainS)
y_predS = vote.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
vote.fit(X_trainP, y_trainP)
y_predP = vote.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))


print("stacking")
# model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)

stack = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8))



stack.fit(X_trainS, y_trainS)
y_predS = stack.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
stack.fit(X_trainP, y_trainP)
y_predP = stack.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))

print("stacking")
# model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)

stack = StackingClassifier(estimators=estimators, final_estimator= LogisticRegression(C=100, l1_ratio=None, max_iter=10000, penalty='l1', solver='liblinear'))



stack.fit(X_trainS, y_trainS)
y_predS = stack.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
stack.fit(X_trainP, y_trainP)
y_predP = stack.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))


Hard Voting
0:	learn: 0.5748513	total: 3.59ms	remaining: 715ms
100:	learn: 0.3347029	total: 290ms	remaining: 285ms
199:	learn: 0.3217873	total: 564ms	remaining: 0us
Accuracy for Students: 0.8563218390804598
0:	learn: 0.3613511	total: 7.84ms	remaining: 1.56s
100:	learn: 0.0830565	total: 635ms	remaining: 623ms
199:	learn: 0.0802820	total: 1.27s	remaining: 0us
Accuracy for Pro: 0.9682295176663137
Accuracy combined: 0.9445752030977147
Soft Voting
0:	learn: 0.5748513	total: 2.35ms	remaining: 467ms
100:	learn: 0.3347029	total: 251ms	remaining: 246ms
199:	learn: 0.3217873	total: 498ms	remaining: 0us
Accuracy for Students: 0.8561422413793104
0:	learn: 0.3613511	total: 5.28ms	remaining: 1.05s
100:	learn: 0.0830565	total: 649ms	remaining: 637ms
199:	learn: 0.0802820	total: 1.28s	remaining: 0us
Accuracy for Pro: 0.9683257918552036
Accuracy combined: 0.9446131652873738
stacking
0:	learn: 0.5748513	total: 3.94ms	remaining: 785ms
100:	learn: 0.3347029	total: 267ms	remaining: 262ms
199:	learn: 0.3217

In [104]:
X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

X_trainS2, X_valS, y_trainS2, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
X_trainS2 = pd.concat([X_trainS2, X_trainP], axis=0)
y_trainS2 = pd.concat([y_trainS2, y_trainP], axis=0)
X_trainP2, X_valP, y_trainP2, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)
X_trainP2 = pd.concat([X_trainP2, X_trainS], axis=0)
y_trainP2 = pd.concat([y_trainP2, y_trainS], axis=0)




model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)




model.fit(X_trainS, y_trainS)
y_predS = model.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
model.fit(X_trainP, y_trainP)
y_predP = model.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
model.fit(X_train2, y_train2)
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy for Students: 0.8579382183908046
Accuracy for Pro: 0.9690478482718783
Accuracy combined: 0.9455622200288513
Accuracy: 0.9425252448561233


In [100]:
X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

X_trainS2, X_valS, y_trainS2, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
X_trainS2 = pd.concat([X_trainS2, X_trainP], axis=0)
y_trainS2 = pd.concat([y_trainS2, y_trainP], axis=0)
X_trainP2, X_valP, y_trainP2, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)
X_trainP2 = pd.concat([X_trainP2, X_trainS], axis=0)
y_trainP2 = pd.concat([y_trainP2, y_trainS], axis=0)



model = CatBoostClassifier(iterations=200, depth=4, learning_rate=0.2, loss_function='Logloss', verbose=100, l2_leaf_reg= 5)




model.fit(X_trainS, y_trainS)
y_predS = model.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
model.fit(X_trainP, y_trainP)
y_predP = model.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
model.fit(X_train2, y_train2)
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))

0:	learn: 0.5748513	total: 3.96ms	remaining: 789ms
100:	learn: 0.3347029	total: 250ms	remaining: 245ms
199:	learn: 0.3217873	total: 511ms	remaining: 0us
Accuracy for Students: 0.8611709770114943
0:	learn: 0.3613511	total: 7.84ms	remaining: 1.56s
100:	learn: 0.0830565	total: 748ms	remaining: 733ms
199:	learn: 0.0802820	total: 1.42s	remaining: 0us
Accuracy for Pro: 0.9689997111774333
Accuracy combined: 0.9462075772530559
0:	learn: 0.4206962	total: 7.31ms	remaining: 1.45s
100:	learn: 0.1379309	total: 646ms	remaining: 633ms
199:	learn: 0.1341230	total: 1.28s	remaining: 0us
Accuracy: 0.9425632070457824


In [69]:
testS = X_test[X_test['Profession_Student'] == 1]
testP = X_test[X_test['Profession_Student'] == 0]
testS = testS.drop(['Profession_Student'], axis=1)
testP = testP.drop(['Profession_Student'], axis=1)

X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

model = CatBoostClassifier(iterations=200, depth=4, learning_rate=0.2, loss_function='Logloss', verbose=100, l2_leaf_reg= 5)

model.fit(X_trainS, y_trainS)

y_predS = model.predict(testS)

model.fit(X_trainP, y_trainP)

y_predP = model.predict(testP)

# concatenate the predictions in the original order

y_pred = np.zeros(len(X_test))
y_pred[testS.index] = y_predS
y_pred[testP.index] = y_predP

submission = pd.DataFrame({'id': test['id'], 'Depression': y_pred})
submission.to_csv('submission.csv', index=False)




0:	learn: 0.5748513	total: 2.54ms	remaining: 507ms
100:	learn: 0.3347029	total: 273ms	remaining: 267ms
199:	learn: 0.3217873	total: 542ms	remaining: 0us
0:	learn: 0.3613511	total: 7.78ms	remaining: 1.55s
100:	learn: 0.0830565	total: 628ms	remaining: 615ms
199:	learn: 0.0802820	total: 1.24s	remaining: 0us


In [70]:
testS = X_test[X_test['Profession_Student'] == 1]
testP = X_test[X_test['Profession_Student'] == 0]
testS = testS.drop(['Profession_Student'], axis=1)
testP = testP.drop(['Profession_Student'], axis=1)

X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

model = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)

model.fit(X_trainS, y_trainS)

y_predS = model.predict(testS)

model.fit(X_trainP, y_trainP)

y_predP = model.predict(testP)

# concatenate the predictions in the original order

y_pred = np.zeros(len(X_test))
y_pred[testS.index] = y_predS
y_pred[testP.index] = y_predP

submission = pd.DataFrame({'id': test['id'], 'Depression': y_pred})
submission.to_csv('submission.csv', index=False)




In [72]:
sub1 = pd.read_csv('sumissions/submissionGB.csv')
sub2 = pd.read_csv('sumissions/submissionCTB.csv')

In [75]:
sub11 = sub1[~sub1.isin(sub2).all(axis=1)]
sub22 = sub2[~sub2.isin(sub1).all(axis=1)]

In [79]:
different_id = sub11['id']
test_diff = test[test['id'].isin(different_id)]
index_diff = test_diff.index
X_test_diff = X_test.loc[index_diff]

In [83]:
X_test_diffS = X_test_diff[X_test_diff['Profession_Student'] == 1]
X_test_diffP = X_test_diff[X_test_diff['Profession_Student'] == 0]
X_test_diffS = X_test_diffS.loc[:, (X_test_diffS != 0).any()]
X_test_diffP = X_test_diffP.loc[:, (X_test_diffP != 0).any()]

In [84]:
X_test_diffS.describe()

Unnamed: 0,Gender,Age,CGPA,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Pressure,...,Degree_M.Tech,Degree_MA,Degree_MBA,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_PhD
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,...,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,0.491525,26.208232,7.59954,6.466102,0.96368,0.644068,6.719128,2.762712,0.479419,2.874092,...,0.029056,0.014528,0.01937,0.03632,0.041162,0.026634,0.009685,0.009685,0.029056,0.021792
std,0.500535,4.898011,1.537767,1.897575,0.801175,0.479376,3.826142,1.380117,0.500182,1.334251,...,0.168166,0.119798,0.13799,0.187311,0.198906,0.161208,0.098055,0.098055,0.168166,0.14618
min,0.0,18.0,5.03,4.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,22.0,6.16,4.0,0.0,0.0,3.0,2.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,27.0,7.53,7.5,1.0,1.0,7.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,30.0,8.91,7.5,2.0,1.0,10.0,4.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,34.0,10.0,9.0,2.0,1.0,12.0,5.0,1.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [85]:
X_test_diffP.describe()

Unnamed: 0,Gender,Age,CGPA,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Pressure,...,Degree_M.Tech,Degree_MA,Degree_MBA,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_PhD
count,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,...,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0
mean,0.576108,25.639692,7.665525,6.136802,0.751445,0.77264,7.262042,3.678227,0.499037,3.666825,...,0.007707,0.023121,0.015414,0.013487,0.011561,0.011561,0.023121,0.009634,0.017341,0.019268
std,0.49465,7.635219,0.009433,1.974884,0.744266,0.419532,3.595998,1.34657,0.500481,1.296883,...,0.087536,0.150434,0.123312,0.115461,0.107,0.107,0.150434,0.097773,0.130665,0.137597
min,0.0,18.0,7.66511,4.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,19.0,7.66511,4.0,0.0,1.0,4.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,23.0,7.66511,5.5,1.0,1.0,8.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,30.0,7.66511,7.5,1.0,1.0,10.0,5.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,50.0,7.88,9.0,2.0,1.0,12.0,5.0,1.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
