Import Libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

Load and Explore the Data

In [9]:
# Load the dataset
df = pd.read_csv('depression.csv')

# Display first few rows and summary statistics
print("First few rows of the dataset:")
print(df.head())
print("\nDataset Summary:")
print(df.describe())
print("\nMissing Values in each column:")
print(df.isnull().sum())


First few rows of the dataset:
        Age  Gender Occupation        Days_Indoors Growing_Stress  \
0     20-25  Female  Corporate           1-14 days            Yes   
1  30-Above    Male     Others          31-60 days            Yes   
2  30-Above  Female    Student    Go out Every day             No   
3     25-30    Male     Others           1-14 days            Yes   
4     16-20  Female    Student  More than 2 months            Yes   

  Quarantine_Frustrations Changes_Habits Mental_Health_History Weight_Change  \
0                     Yes             No                   Yes           Yes   
1                     Yes          Maybe                    No            No   
2                      No            Yes                    No            No   
3                      No          Maybe                    No         Maybe   
4                     Yes            Yes                    No           Yes   

  Mood_Swings Coping_Struggles Work_Interest Social_Weakness  
0      Med

Data Preprocessing

In [12]:
df = df.ffill()

# Encode categorical features
categorical_cols = ['Age', 'Gender', 'Occupation', 'Days_Indoors',
                    'Growing_Stress', 'Quarantine_Frustrations', 'Changes_Habits',
                    'Mental_Health_History', 'Weight_Change', 'Mood_Swings',
                    'Coping_Struggles', 'Work_Interest', 'Social_Weakness']

le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# Split features and target variable
X = df.drop('Growing_Stress', axis=1)  # Using 'Growing_Stress' as the target variable
y = df['Growing_Stress']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Address Class Imbalance with SMOTE


In [13]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Check the new class distribution
print("Class distribution after SMOTE:")
print(pd.Series(y_resampled).value_counts())

Class distribution after SMOTE:
Growing_Stress
2    301
1    301
0    301
Name: count, dtype: int64


Model Training with Hyperparameter Tuning for Random Forest

In [14]:
# Define parameter grid for GridSearchCV
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest model
rf = RandomForestClassifier(random_state=42)

# Grid search for best parameters
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_resampled, y_resampled)

# Print best parameters and best accuracy
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Score for Random Forest:", grid_search_rf.best_score_)

# Train the Random Forest model with best parameters
best_rf = grid_search_rf.best_estimator_

Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Best Score for Random Forest: 0.45085328422344995
