In [2]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold 
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression   
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
# Silence every warning from this point on so cell outputs stay compact.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings; consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")
# Render up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

In [3]:
# Load the training set and discard the 'id' column right away.
# NOTE(review): the path below is an absolute location on a local Windows drive,
# so this cell only runs on a machine with the same directory layout.
train_csv = r'D:\ds_ridwan\mainan-modular\data\train.csv'
df = pd.read_csv(train_csv).drop(columns=['id'])
df.head()

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [9]:
# Check missing values: print the share of NaN for every column that has any.
missing_per_col = df.isnull().sum()
null_cols = missing_per_col.index[missing_per_col > 0]
n_rows = len(df)
for col in null_cols:
    print(f'Column {col} has {missing_per_col[col] / n_rows:.3%} missing values')

Column Profession has 26.034% missing values
Column Academic Pressure has 80.173% missing values
Column Work Pressure has 19.842% missing values
Column CGPA has 80.172% missing values
Column Study Satisfaction has 80.173% missing values
Column Job Satisfaction has 19.837% missing values
Column Dietary Habits has 0.003% missing values
Column Degree has 0.001% missing values
Column Financial Stress has 0.003% missing values


In [10]:
# Academic Pressure, CGPA and Study Satisfaction are each roughly 80% missing
# (see the missing-value report), so they are dropped instead of imputed.
col_to_drop = ['Academic Pressure', 'CGPA', 'Study Satisfaction']
# Rebinding `df` with errors='ignore' (instead of an in-place drop) keeps this
# cell idempotent: running it again after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=col_to_drop, errors='ignore')

# Build the pipeline

In [11]:
# Categorical branch: fill gaps with the column mode, then one-hot encode.
# handle_unknown='ignore' makes categories that were not seen during fit encode
# as all zeros instead of raising at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipe = Pipeline(steps=cat_steps)

In [12]:
# Number of distinct values per column (NaN is not counted by default).
df.nunique()

Name                                     422
Gender                                     2
Age                                       43
City                                      98
Working Professional or Student            2
Profession                                64
Work Pressure                              5
Job Satisfaction                           5
Sleep Duration                            36
Dietary Habits                            23
Degree                                   115
Have you ever had suicidal thoughts ?      2
Work/Study Hours                          13
Financial Stress                           5
Family History of Mental Illness           2
Depression                                 2
dtype: int64

In [13]:
# List the distinct values taken by the target column.
df['Depression'].unique()

array([0, 1])

In [15]:
# Feature columns = every column of df except the ones listed in col_to_exclude
# (currently only the target), kept in their original order.
col_to_exclude = ['Depression']
is_feature = ~df.columns.isin(col_to_exclude)
cols = df.columns[is_feature].tolist()

In [16]:
# Route every column listed in `cols` through the categorical pipeline.
# NOTE(review): `cols` also contains numeric columns such as Age, so those are
# imputed with their mode and one-hot encoded too — confirm this is intended.
feature_transformers = [('cat', cat_pipe, cols)]
preprocessor = ColumnTransformer(transformers=feature_transformers)

In [25]:
# End-to-end model: the preprocessing step followed by a LightGBM classifier.
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,  # non-positive value: tree depth is not limited
    random_state=42,  # fixed seed for reproducible training
)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(**lgbm_params)),
])

# Split into train and test

In [18]:
# Separate the target from the features, then hold out 25% of the rows for
# testing (fixed seed so the split is reproducible).
y = df['Depression']
X = df.drop(columns='Depression')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((105525, 15), (35175, 15), (105525,), (35175,))

In [26]:
# Fit the preprocessing step and the classifier on the training split.
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 19112, number of negative: 86413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0,007587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 788
[LightGBM] [Info] Number of data points in the train set: 105525, number of used features: 394
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0,181113 -> initscore=-1,508822
[LightGBM] [Info] Start training from score -1,508822


In [27]:
# Predict class labels for the held-out test split.
preds = model.predict(X_test)

In [28]:
# Score the test predictions: plain accuracy and support-weighted F1.
acc_score = accuracy_score(y_test, preds)
f1 = f1_score(y_test, preds, average='weighted')
print(f'Accuracy: {acc_score:.3f}, F1 Score: {f1:.3f}')

Accuracy: 0.929, F1 Score: 0.929


- XGBoost -> Accuracy: 0.930, F1 Score: 0.930
- LGBM -> Accuracy: 0.929, F1 Score: 0.929
- Logreg -> 