## Import Packages

In [1]:
import os
import sys
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

# Resolve the parent of the current working directory and register it on the
# module search path (only once), so the project-local imports that follow
# can be found when the notebook is run from its own folder.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.transformers.custom_transformers import ScaleNumericFeatures
from src.transformers.feature_engineering import InteractionFeatureEngineer
from src.transformers.preprocessors import DataPreprocessor
from src.transformers.encoders import LabelEncoderTransformer, TargetEncoder
from utils.data import load_data

## Load Dataset

In [2]:
# Read the raw dataset, then run it through the project preprocessor. Per the
# log output of this cell, preprocessing converts data types, handles missing
# values and handles outliers.
raw_data = load_data()
preprocessor = DataPreprocessor()
data = preprocessor.fit_transform(raw_data)

# Preview the first rows of the preprocessed frame.
data.head()

[INFO] 2024-11-24 02:32:34: Data Preprocessing - Preprocessing data...[0m
[INFO] 2024-11-24 02:32:34: Data Preprocessing - Converting data types[0m
[INFO] 2024-11-24 02:32:34: Data Preprocessing - Converting data types successful[0m
[INFO] 2024-11-24 02:32:34: Data Preprocessing - Handling missing values[0m
[INFO] 2024-11-24 02:32:34: Data Preprocessing - Handling missing values successful![0m
[INFO] 2024-11-24 02:32:34: Data Preprocessing - Handling outliers...[0m
[INFO] 2024-11-24 02:32:35: Data Preprocessing - Handling outliers successful![0m
[INFO] 2024-11-24 02:32:35: Data Preprocessing - Data preprocessing successful![0m


Unnamed: 0,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,49.0,Ludhiana,Working Professional,Chef,Not Applicable,5.0,-1.0,Not Applicable,2.0,More than 8 hours,Healthy,No,1.0,2.0,No,0
1,26.0,Varanasi,Working Professional,Teacher,Not Applicable,4.0,-1.0,Not Applicable,3.0,Less than 5 hours,Unhealthy,Yes,7.0,3.0,No,1
2,33.0,Visakhapatnam,Student,Student,5.0,Not Applicable,8.97,2.0,Not Applicable,5-6 hours,Healthy,Yes,3.0,1.0,No,1
3,22.0,Mumbai,Working Professional,Teacher,Not Applicable,5.0,-1.0,Not Applicable,1.0,Less than 5 hours,Moderate,Yes,10.0,1.0,Yes,1
4,30.0,Kanpur,Working Professional,Business Analyst,Not Applicable,1.0,-1.0,Not Applicable,1.0,5-6 hours,Unhealthy,Yes,9.0,4.0,Yes,0


# Normalize/Scale Numerical Features
The numeric features, `CGPA` and `Age`, will be scaled using the `MinMaxScaler`. For `CGPA`, only student data will undergo scaling, while the placeholder value for working professionals (`-1`) will remain unchanged.

In [3]:
# Fit the project's numeric scaler on the frame and replace `data` with the
# scaled result.
numeric_scaler = ScaleNumericFeatures()
data = numeric_scaler.fit_transform(data)

# Encode Categorical Variables
The categorical features will be encoded as follows:
- One-Hot Encoding will be applied to `Working Professional or Student`, `Have you ever had suicidal thoughts ?`, `Sleep Duration`, `Dietary Habits`, and `Family History of Mental Illness`.
- Target Encoding (using mean values) will be used for `City` and `Profession`.
- Label Encoding will be applied to `Academic Pressure`, `Work Pressure`, `Study Satisfaction`, `Job Satisfaction`, `Work/Study Hours`, and `Financial Stress`.

Note: `Gender` and `Degree` do not appear in the preprocessed frame shown above, so they are not encoded here.

In [4]:
# Columns grouped by the encoding strategy applied to each group.
one_hot_encoded_features = [
    'Working Professional or Student',
    'Have you ever had suicidal thoughts ?',
    'Sleep Duration',
    'Dietary Habits',
    'Family History of Mental Illness',
]
target_encoded_features = ['City', 'Profession']
label_encoded_features = [
    'Academic Pressure',
    'Work Pressure',
    'Study Satisfaction',
    'Job Satisfaction',
    'Work/Study Hours',
    'Financial Stress',
]

# Target encoding is fitted against the `Depression` column; label encoding
# is then applied to the result.
target_encoder = TargetEncoder(target_encoded_features)
data = target_encoder.fit_transform(data, data['Depression'])

label_encoder = LabelEncoderTransformer(label_encoded_features)
data = label_encoder.fit_transform(data)

# One-hot encode the remaining categorical columns. Dense output is requested
# so the array can be wrapped in a DataFrame directly.
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_data = onehot_encoder.fit_transform(data[one_hot_encoded_features])

# Label the indicator columns with the encoder's generated names and reuse the
# frame's index so the rows line up when concatenated.
encoded_df = pd.DataFrame(
    data=encoded_data,
    index=data.index,
    columns=onehot_encoder.get_feature_names_out(one_hot_encoded_features),
)

# Swap the original categorical columns for their indicator columns.
without_categoricals = data.drop(columns=one_hot_encoded_features)
data = pd.concat([without_categoricals, encoded_df], axis=1)

# Create New Features
## Interaction Features
The following feature interactions will be generated; they aim to uncover significant patterns that may contribute to depression:
1. **CGPA × Study Satisfaction:** A high CGPA paired with low study satisfaction could signal academic pressure, a potential trigger for depression and stress in students.
2. **Work Pressure × Financial Stress:** Captures the dual burden of workplace demands and financial difficulties, highlighting individuals at risk of compounded stress.
3. **Job Satisfaction × Sleep Duration:** Explores the link between job dissatisfaction and poor sleep quality, which together can significantly impact mental well-being.
4. **Academic Pressure × Suicidal Thoughts:** Focuses on students under extreme stress, identifying those at higher risk of severe mental health challenges, including suicidal ideation.
5. **Dietary Habits × Financial Stress:** Examines how financial stress might lead to unhealthy dietary choices, potentially worsening physical and mental health.
6. **Age × Sleep Duration:** Examines how age influences sleep patterns, potentially uncovering age groups at higher risk of sleep deprivation, which is a known factor for poor mental health.
7. **Working Professional or Student × Work/Study Hours:** Investigates how work or study hours differ between professionals and students, potentially identifying groups overburdened by time commitments.
8. **Profession × Job Satisfaction:** Captures variations in job satisfaction across professions, identifying industries where dissatisfaction may signal systemic issues impacting mental health.
9. **Work Pressure × Suicidal Thoughts:** Explores the extreme mental health risks associated with high work stress, identifying individuals who might benefit from targeted interventions.
10. **Study Satisfaction × Financial Stress:** Investigates the link between study satisfaction and financial strain, potentially identifying students whose financial difficulties impact their academic experience.
11. **Sleep Duration × Academic/Work Pressure:** Explores how sleep patterns are affected by stress from academics or work, with implications for productivity and mental health.
12. **Dietary Habits × Sleep Duration:** Examines how dietary choices influence sleep quality, offering holistic insights into the interplay between nutrition and rest.
13. **Suicidal Thoughts × Financial Stress:** Captures the role of financial difficulties in triggering suicidal ideation, emphasizing the need for financial and mental health support.
14. **Work/Study Hours × Sleep Duration:** Analyzes how long work or study hours reduce sleep, potentially exacerbating physical and mental health issues.

In [5]:
# Build the interaction features with the project transformer and replace
# `data` with the augmented frame.
# NOTE(review): the meaning of `from_pipeline=False` is defined in the project
# module, not here — presumably it marks standalone (non-pipeline) use; confirm.
interaction_engineer = InteractionFeatureEngineer(from_pipeline=False)
data = interaction_engineer.fit_transform(data)

Y None


In [6]:
# Preview the first five rows of the frame after feature engineering.
data.head(n=5)

Unnamed: 0,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,...,Sleep Duration_Other x Dietary Habits_Moderate,Sleep Duration_Other x Dietary Habits_Unhealthy,Sleep Duration_Other x Dietary Habits_Other,Financial Stress x Have you ever had suicidal thoughts ?_No,Financial Stress x Have you ever had suicidal thoughts ?_Yes,Work/Study Hour x Sleep Duration_5-6 hours,Work/Study Hour x Sleep Duration_7-8 hours,Work/Study Hour x Sleep Duration_Less than 5 hours,Work/Study Hour x Sleep Duration_More than 8 hours,Work/Study Hour x Sleep Duration_Other
0,0.738095,0.19269,0.048567,0,5,-1.0,0,2,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.190476,0.142206,0.055649,0,4,-1.0,0,3,10,2,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,10.0,0.0,0.0
2,0.357143,0.159196,0.585061,5,0,0.792757,2,0,6,0,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0
3,0.095238,0.131293,0.055649,0,5,-1.0,0,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.285714,0.125739,0.056628,0,1,-1.0,0,1,12,3,...,0.0,0.0,0.0,0.0,3.0,12.0,0.0,0.0,0.0,0.0


In [None]:
import matplotlib.pyplot as plt  # previously missing: `plt.show()` raised NameError
import seaborn as sns

# Plot pairwise feature correlations instead of the raw rows: annotating every
# cell of the full row-level frame is unreadable and impractically slow.
correlations = data.select_dtypes(include="number").corr()

# Scale the canvas with the number of features so the annotations stay legible.
side = max(10, 0.4 * len(correlations.columns))
fig, ax = plt.subplots(figsize=(side, side))

sns.heatmap(
    correlations,
    annot=True,
    fmt=".1f",  # one decimal place per annotation
    linewidths=0.5,
    cmap="coolwarm",
    cbar_kws={"shrink": 0.8},  # shrink the colour bar
    ax=ax,
)
ax.set_title("Feature correlation matrix")
plt.show()