In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import sys
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostClassifier


In [3]:
def preprocess_data_index(train):
    train = train.drop(['id', 'Name'], axis=1)
    train['Pressure'] = train[['Work Pressure', 'Academic Pressure']].max(axis=1)
    train = train.drop(['Work Pressure', 'Academic Pressure'], axis=1)
    # encode gender in 1 and 0 (1 for male and 0 for Female)
    train['Gender'] = (train['Gender'] == 'Male').astype(int)
    # For Working Status (Student = 0, Working Professional = 1)
    # train['Working Professional or Student'] = (train['Working Professional or Student'] == 'Working Professional').astype(int)
    train.loc[train['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
    train['Satisfaction'] = train[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
    train = train.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
    train['Family History of Mental Illness'] = (train['Family History of Mental Illness'] == 'Yes').astype(int)
    train['Have you ever had suicidal thoughts ?'] = (train['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)
    # we can either drop City or encode it in one hot encoding
    # one hot encoding
    #train = pd.get_dummies(train, columns=['City']).astype(int)
    # drop city
    train = train.drop(['City'], axis=1)
    diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
    train = train[train['Dietary Habits'].isin(diet_mapping.keys())]
    train['Dietary Habits'] = train['Dietary Habits'].map(diet_mapping)
    v = train["Profession"].value_counts() 
    # keep only the profession with more than 10 samples
    train = train[train['Profession'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Profession'])
    profession_cols = [col for col in train.columns if col.startswith('Profession_')]
    train[profession_cols] = train[profession_cols].astype(int)
    train = train.drop(['Working Professional or Student'], axis=1)
    v = train["Degree"].value_counts() 
    train = train[train['Degree'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Degree'])
    degree_cols = [col for col in train.columns if col.startswith('Degree_')]
    train[degree_cols] = train[degree_cols].astype(int)
    dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
    train = train[train['Sleep Duration'].isin(dict_sleep.keys())]
    train['Sleep Duration'] = train['Sleep Duration'].map(dict_sleep)
    train['CGPA'] = train['CGPA'].fillna(train['CGPA'].mean())
    train = train.dropna()
    return train.index




In [4]:
def preprocess_data(train):
    train = train.drop(['id', 'Name'], axis=1)
    train['Pressure'] = train[['Work Pressure', 'Academic Pressure']].max(axis=1)
    train = train.drop(['Work Pressure', 'Academic Pressure'], axis=1)
    # encode gender in 1 and 0 (1 for male and 0 for Female)
    train['Gender'] = (train['Gender'] == 'Male').astype(int)
    # For Working Status (Student = 0, Working Professional = 1)
    # train['Working Professional or Student'] = (train['Working Professional or Student'] == 'Working Professional').astype(int)
    train.loc[train['Working Professional or Student'] == 'Student', 'Profession'] = 'Student'
    train['Satisfaction'] = train[['Study Satisfaction', 'Job Satisfaction']].max(axis=1)
    train = train.drop(['Study Satisfaction', 'Job Satisfaction'], axis=1)
    train['Family History of Mental Illness'] = (train['Family History of Mental Illness'] == 'Yes').astype(int)
    train['Have you ever had suicidal thoughts ?'] = (train['Have you ever had suicidal thoughts ?'] == 'Yes').astype(int)
    # we can either drop City or encode it in one hot encoding
    # one hot encoding
    #train = pd.get_dummies(train, columns=['City']).astype(int)
    # drop city
    train = train.drop(['City'], axis=1)
    diet_mapping = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}
    #todo train = train[train['Dietary Habits'].isin(diet_mapping.keys())]
    train['Dietary Habits'] = train['Dietary Habits'].map(diet_mapping)
    v = train["Profession"].value_counts() 
    # keep only the profession with more than 10 samples
    #todo train = train[train['Profession'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Profession'])
    profession_cols = [col for col in train.columns if col.startswith('Profession_')]
    train[profession_cols] = train[profession_cols].astype(int)
    train = train.drop(['Working Professional or Student'], axis=1)
    v = train["Degree"].value_counts() 
    #todo train = train[train['Degree'].isin(v.index[v.gt(10)])]
    # one hot encoding
    train = pd.get_dummies(train, columns=['Degree'])
    degree_cols = [col for col in train.columns if col.startswith('Degree_')]
    train[degree_cols] = train[degree_cols].astype(int)
    dict_sleep = {'Less than 5 hours': 4.0, '5-6 hours': 5.5, '6-7 hours': 6.5, '7-8 hours': 7.5, 'More than 8 hours': 9.0, '2-3 hours': 2.5, '3-4 hours': 3.5, '4-5 hours': 4.5, '4-6 hours': 5.0}
    #todo train = train[train['Sleep Duration'].isin(dict_sleep.keys())]
    train['Sleep Duration'] = train['Sleep Duration'].map(dict_sleep)
    train['CGPA'] = train['CGPA'].fillna(train['CGPA'].mean())
    #train = train.dropna()
    return train




In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X_train = train.drop('Depression', axis=1)
X_train_index = preprocess_data_index(X_train)
X_train = X_train.loc[X_train_index]
y_train = train['Depression']
len_train = len(X_train)
len_test = len(test)
print(len_train)
print(len_test)
X = pd.concat([X_train, test], axis=0)
print(len(X))
X = preprocess_data(X)
print(len(X))

X_train = X[:len_train]
X_test = X[len_train:]
X_test = X_test.fillna(X_test.mean())


train = pd.concat([X_train, y_train], axis=1)
train = train.dropna()
X_train = train.drop('Depression', axis=1)
y_train = train['Depression']


131707
93800
225507
225507


In [16]:
X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

X_trainS2, X_valS, y_trainS2, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
X_trainS2 = pd.concat([X_trainS2, X_trainP], axis=0)
y_trainS2 = pd.concat([y_trainS2, y_trainP], axis=0)
X_trainP2, X_valP, y_trainP2, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)





model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)




model.fit(X_trainS, y_trainS)
y_predS = model.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
model.fit(X_trainP, y_trainP)
y_predP = model.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
model.fit(X_train2, y_train2)
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy for Students: 0.8590158045977011
Accuracy for Pro: 0.9689034369885434
Accuracy combined: 0.9456761065978285
Accuracy: 0.942411358287146


In [13]:
X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]



model = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)#


scores = cross_val_score(model, X_trainS, y_trainS, cv=5, scoring='accuracy')

print("Student dataset")

print("accuracy: %f " % (scores.mean()))

scores = cross_val_score(model, X_trainP, y_trainP, cv=5, scoring='accuracy')

print("Professional dataset")

print("accuracy: %f " % (scores.mean()))


0:	learn: 0.6277897	total: 2.33ms	remaining: 465ms
100:	learn: 0.3335687	total: 253ms	remaining: 248ms
199:	learn: 0.3182864	total: 518ms	remaining: 0us
0:	learn: 0.6287914	total: 2.43ms	remaining: 483ms
100:	learn: 0.3353161	total: 259ms	remaining: 254ms
199:	learn: 0.3197398	total: 569ms	remaining: 0us
0:	learn: 0.6284655	total: 3.65ms	remaining: 725ms
100:	learn: 0.3333147	total: 363ms	remaining: 356ms
199:	learn: 0.3173156	total: 667ms	remaining: 0us
0:	learn: 0.6278578	total: 2.64ms	remaining: 526ms
100:	learn: 0.3351440	total: 247ms	remaining: 242ms
199:	learn: 0.3193406	total: 498ms	remaining: 0us
0:	learn: 0.6278900	total: 2.24ms	remaining: 447ms
100:	learn: 0.3371656	total: 272ms	remaining: 266ms
199:	learn: 0.3208369	total: 530ms	remaining: 0us
Student dataset
accuracy: 0.847721 
0:	learn: 0.4934335	total: 6.43ms	remaining: 1.28s
100:	learn: 0.0832152	total: 656ms	remaining: 643ms
199:	learn: 0.0792364	total: 1.25s	remaining: 0us
0:	learn: 0.4929996	total: 7.23ms	remaining: 1

In [9]:
X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

X_trainS2, X_valS, y_trainS2, y_valS = train_test_split(X_trainS, y_trainS, test_size=0.2, random_state=42)
X_trainS2 = pd.concat([X_trainS2, X_trainP], axis=0)
y_trainS2 = pd.concat([y_trainS2, y_trainP], axis=0)
X_trainP2, X_valP, y_trainP2, y_valP = train_test_split(X_trainP, y_trainP, test_size=0.2, random_state=42)





model =GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)




model.fit(X_trainS, y_trainS)
y_predS = model.predict(X_valS)
print('Accuracy for Students:', accuracy_score(y_valS, y_predS))
model.fit(X_trainP, y_trainP)
y_predP = model.predict(X_valP)
print('Accuracy for Pro:', accuracy_score(y_valP, y_predP))

# concatenate the predictions
y_pred = np.concatenate([y_predS, y_predP])
y_val = np.concatenate([y_valS, y_valP])
print('Accuracy combined:', accuracy_score(y_val, y_pred))

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
model.fit(X_train2, y_train2)
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy for Students: 0.8570402298850575
Accuracy for Pro: 0.9684220660440935
Accuracy combined: 0.9448789006149875
Accuracy: 0.9422595095285096


In [17]:
testS = X_test[X_test['Profession_Student'] == 1]
testP = X_test[X_test['Profession_Student'] == 0]
testS = testS.drop(['Profession_Student'], axis=1)
testP = testP.drop(['Profession_Student'], axis=1)

X_trainS = X_train[X_train['Profession_Student'] == 1]
X_trainP = X_train[X_train['Profession_Student'] == 0]
X_trainS = X_trainS.drop(['Profession_Student'], axis=1)
X_trainP = X_trainP.drop(['Profession_Student'], axis=1)
y_trainS = y_train[X_trainS.index]
y_trainP = y_train[X_trainP.index]

X_train3 = X_train.drop(['Profession_Student'], axis=1)

model = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)

model.fit(X_train3, y_train)

y_predS = model.predict(testS)

model.fit(X_trainP, y_trainP)

y_predP = model.predict(testP)

# concatenate the predictions in the original order

y_pred = np.zeros(len(X_test))
y_pred[testS.index] = y_predS
y_pred[testP.index] = y_predP

submission = pd.DataFrame({'id': test['id'], 'Depression': y_pred})
submission.to_csv('submission.csv', index=False)


