In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold 
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression   
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 50)

In [2]:
df = pd.read_csv(r'D:\ds_ridwan\mainan-modular\data\train.csv')
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [3]:
print(f'There is {df.shape[0]} rows and {df.shape[1]} cols in the dataset')

There is 140700 rows and 19 cols in the dataset


In [7]:
# see the percentage of missing values in each column that has missing values
missing_values = df.isnull().sum()
missing_percentage = (missing_values / df.shape[0]) * 100
missing_percentage = missing_percentage[missing_percentage > 0].sort_values(ascending=False)
print(f"Percentage of missing values in each column:\n{missing_percentage}")

Percentage of missing values in each column:
Academic Pressure     80.172708
Study Satisfaction    80.172708
CGPA                  80.171997
Profession            26.034115
Work Pressure         19.842217
Job Satisfaction      19.836532
Dietary Habits         0.002843
Financial Stress       0.002843
Degree                 0.001421
dtype: float64


In [8]:
# seperate into numerical and categorical columns
num_cols = df.select_dtypes(include=['int','float']).columns.tolist()
cat_cols = df.select_dtypes(exclude=['int','float']).columns.tolist()
print(f'Numerical columns: {num_cols}\nCategorical columns: {cat_cols}')

Numerical columns: ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']
Categorical columns: ['Name', 'Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


In [11]:
# See each numerical cols and categorical, how many unique values they have
for col in num_cols:
    print(f'Columns: {col}\n--> Unique: {df[col].nunique()}')
print('=='*50)
for col in cat_cols:
    print(f'Columns: {col}\n--> Unique: {df[col].nunique()}')

Columns: Age
--> Unique: 43
Columns: Academic Pressure
--> Unique: 5
Columns: Work Pressure
--> Unique: 5
Columns: CGPA
--> Unique: 331
Columns: Study Satisfaction
--> Unique: 5
Columns: Job Satisfaction
--> Unique: 5
Columns: Work/Study Hours
--> Unique: 13
Columns: Financial Stress
--> Unique: 5
Columns: Depression
--> Unique: 2
Columns: Name
--> Unique: 422
Columns: Gender
--> Unique: 2
Columns: City
--> Unique: 98
Columns: Working Professional or Student
--> Unique: 2
Columns: Profession
--> Unique: 64
Columns: Sleep Duration
--> Unique: 36
Columns: Dietary Habits
--> Unique: 23
Columns: Degree
--> Unique: 115
Columns: Have you ever had suicidal thoughts ?
--> Unique: 2
Columns: Family History of Mental Illness
--> Unique: 2


In [13]:
# in categorical columns, let see the 10 most frequent values in each column
for col in cat_cols:
    print(f'Column: {col}\n -> 10 most frequent values:\n{df[col].value_counts().head(10)}\n')

Column: Name
 -> 10 most frequent values:
Name
Rohan          3178
Aarav          2336
Rupak          2176
Aaradhya       2045
Anvi           2035
Raghavendra    1877
Vani           1657
Tushar         1596
Ritvik         1589
Shiv           1568
Name: count, dtype: int64

Column: Gender
 -> 10 most frequent values:
Gender
Male      77464
Female    63236
Name: count, dtype: int64

Column: City
 -> 10 most frequent values:
City
Kalyan           6591
Patna            5924
Vasai-Virar      5765
Kolkata          5689
Ahmedabad        5613
Meerut           5528
Ludhiana         5226
Pune             5210
Rajkot           5207
Visakhapatnam    5176
Name: count, dtype: int64

Column: Working Professional or Student
 -> 10 most frequent values:
Working Professional or Student
Working Professional    112799
Student                  27901
Name: count, dtype: int64

Column: Profession
 -> 10 most frequent values:
Profession
Teacher             24906
Content Writer       7814
Architect            

## Langsung aja

In [14]:
# Feature Engineering
# Create an interaction term between Age and Work Pressure
df['Age_Work_Pressure'] = df['Age'] * df['Work Pressure']
# Create a new feature that is the ratio of Work Pressure to Age    
df['Work_Pressure_Age_Ratio'] = df['Work Pressure'] / (df['Age'] + 1e-5)  # Adding a small constant to avoid division by zero

In [15]:
from sklearn.preprocessing import OrdinalEncoder

In [16]:
# lesgo to preprocessing
def preprocess_data(df):

    # Define numerical transformer
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])

    # Define categorical transformer
    cat_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # ('encoder', OneHotEncoder(handle_unknown='ignore'))
        # using ordinal encoding for categorical features
        ('encoder',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols),
            ('cat', cat_transformer, cat_cols)
        ]
    )

    return preprocessor

In [17]:
df.head(1)

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_Work_Pressure,Work_Pressure_Age_Ratio
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0,245.0,0.102041


In [18]:
preprocess_data(df)

In [19]:
# Split the data into features and target variable  
X = df.drop(columns=['Depression'])
y = df['Depression']

In [20]:
X.head()

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Age_Work_Pressure,Work_Pressure_Age_Ratio
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,245.0,0.102041
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,104.0,0.153846
2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,,
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,110.0,0.227273
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,30.0,0.033333


In [21]:
X.isna().sum()

Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Age_Work_Pressure                         27918
Work_Pressure_Age_Ratio                   27918
dtype: int64

## Buat pipeline gedenya

In [25]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier

In [26]:
model = Pipeline(steps=[
    ('preprocessor', preprocess_data(X)),
    # using voting classifier as the final estimator with multiple classifiers xgboost, lightgbm, random forest, and extra trees
    ('classifier',VotingClassifier([
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
        ('lgbm', LGBMClassifier()),
        ('rf', RandomForestClassifier()),
        ('et', ExtraTreesClassifier())
    ]))
])

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f'Train set shape: {X_train.shape}, Test set shape: {X_test.shape}')
print(f'Train target shape: {y_train.shape}, Test target shape: {y_test.shape}')

Train set shape: (112560, 20), Test set shape: (28140, 20)
Train target shape: (112560,), Test target shape: (28140,)


In [28]:
model

In [29]:
model.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe