In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import sys

In [88]:
# Read both raw splits (training first, then test) from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [89]:
# Ordinal score given to each recognised 'Dietary Habits' answer.
_DIET_SCORES = {'Moderate': 1.0, 'Unhealthy': 0.0, 'Healthy': 2.0}

# Numeric stand-in (in hours) for each recognised 'Sleep Duration' answer.
_SLEEP_HOURS = {
    'Less than 5 hours': 4.0,
    '5-6 hours': 5.5,
    '6-7 hours': 6.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9.0,
    '2-3 hours': 2.5,
    '3-4 hours': 3.5,
    '4-5 hours': 4.5,
    '4-6 hours': 5.0,
}


def _encode_flag(series, positive):
    """Return 1 where `series` equals `positive` and 0 everywhere else (NaN included)."""
    return (series == positive).astype(int)


def _map_known_answers(frame, column, mapping):
    """Keep only rows whose `column` value is a key of `mapping`, then replace it with the mapped number."""
    # Explicit copy: assigning into a boolean-filtered frame is what raises SettingWithCopyWarning.
    kept = frame[frame[column].isin(list(mapping))].copy()
    kept[column] = kept[column].map(mapping)
    return kept


def _one_hot_frequent(frame, column, min_count):
    """Drop rows whose `column` category occurs `min_count` times or fewer, then one-hot encode it as ints."""
    counts = frame[column].value_counts()  # NaN is not counted, so rows with a missing category are dropped too
    frequent = counts.index[counts.gt(min_count)]
    kept = frame[frame[column].isin(frequent)]
    return pd.get_dummies(kept, columns=[column], dtype=int)


def preprocess_data(train, min_count=10):
    """Turn the raw survey table into an all-numeric modelling table.

    The input frame is not modified; a new frame is returned.

    Steps, in order:
      1. Drop 'id', 'Name' and 'City'.
      2. Merge 'Work Pressure' / 'Academic Pressure' into 'Pressure' and
         'Study Satisfaction' / 'Job Satisfaction' into 'Satisfaction'
         (row-wise max, which ignores NaN).
      3. Encode 'Gender' (1 for 'Male', 0 for anything else) and the two
         yes/no columns (1 for 'Yes', 0 for anything else).
      4. Label every 'Student' row with Profession = 'Student'.
      5. Keep only rows with a recognised 'Dietary Habits' / 'Sleep Duration'
         answer and map those answers to numbers.
      6. Keep only rows whose 'Profession' / 'Degree' occurs more than
         `min_count` times and one-hot encode both columns.
      7. Fill missing 'CGPA' with the mean of the remaining rows, then drop
         any row that still has a missing value.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw table containing the columns named above. Extra columns (for
        example the 'Depression' target) are passed through unchanged.
    min_count : int, default 10
        A 'Profession' or 'Degree' category is kept only if it occurs strictly
        more than this many times.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. Rows are removed by steps 5-7, and the set of
        one-hot columns depends on the categories present in the input, so
        two different inputs can produce different columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing from `train`.
    """
    frame = train.drop(columns=['id', 'Name'])

    # max(axis=1) skips NaN, so this picks whichever of the two pressures is filled in.
    pressure_cols = ['Work Pressure', 'Academic Pressure']
    frame['Pressure'] = frame[pressure_cols].max(axis=1)
    frame = frame.drop(columns=pressure_cols)

    frame['Gender'] = _encode_flag(frame['Gender'], 'Male')

    # Give students an explicit profession label so the frequency filter below can keep them.
    is_student = frame['Working Professional or Student'] == 'Student'
    frame.loc[is_student, 'Profession'] = 'Student'

    satisfaction_cols = ['Study Satisfaction', 'Job Satisfaction']
    frame['Satisfaction'] = frame[satisfaction_cols].max(axis=1)
    frame = frame.drop(columns=satisfaction_cols)

    for flag_col in ('Family History of Mental Illness',
                     'Have you ever had suicidal thoughts ?'):
        frame[flag_col] = _encode_flag(frame[flag_col], 'Yes')

    # City is dropped outright; one-hot encoding it would be the alternative.
    frame = frame.drop(columns=['City'])

    frame = _map_known_answers(frame, 'Dietary Habits', _DIET_SCORES)

    frame = _one_hot_frequent(frame, 'Profession', min_count)
    # The student/professional split is now carried by the Profession_* columns.
    frame = frame.drop(columns=['Working Professional or Student'])
    frame = _one_hot_frequent(frame, 'Degree', min_count)

    frame = _map_known_answers(frame, 'Sleep Duration', _SLEEP_HOURS)

    # The mean is taken over the rows that survived the filters above.
    frame['CGPA'] = frame['CGPA'].fillna(frame['CGPA'].mean())
    return frame.dropna()


In [86]:
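# Disabled export of `train` to 'train_cleaned.csv'.
# NOTE(review): presumably run once on the preprocessed frame to create the file read below — confirm.
# train.to_csv('train_cleaned.csv', index=False)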

In [79]:
# Rebind `train` to the cleaned table stored on disk; this replaces the raw frame loaded earlier.
# NOTE(review): 'train_cleaned.csv' must already exist — the only visible line that writes it
# is commented out, so a fresh Restart & Run All will fail here unless the file is present.
train = pd.read_csv('train_cleaned.csv')

Sleep Duration
Less than 5 hours    35829
7-8 hours            34867
More than 8 hours    30791
5-6 hours            30197
3-4 hours               11
6-7 hours                8
4-5 hours                7
4-6 hours                5
2-3 hours                5
No                       4
1-6 hours                4
6-8 hours                4
Sleep_Duration           2
Unhealthy                2
45                       2
8-9 hours                2
9-11 hours               2
10-11 hours              2
10-6 hours               1
Pune                     1
9-5                      1
45-48 hours              1
3-6 hours                1
Work_Study_Hours         1
than 5 hours             1
Moderate                 1
55-66 hours              1
8 hours                  1
35-36 hours              1
40-45 hours              1
Indore                   1
1-3 hours                1
9-6 hours                1
1-2 hours                1
9-5 hours                1
Name: count, dtype: int64

In [90]:
# Separate the label from the predictors: y_train is the 'Depression' column,
# X_train is every other column of `train`.
y_train = train['Depression']
X_train = train.drop(columns='Depression')