**Importing necessary packages**

In [75]:
import os
import warnings
from functools import partial

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


**Data Wrangling (Preprocessing)**

In [84]:
class DataPreprocessor:
    """Load, clean and encode the raw student-depression CSV.

    Intended call order: ``load_data()`` -> ``preprocess_data()`` -> ``save_data(path)``.

    Attributes:
        file_path: Path of the raw CSV handed to ``pandas.read_csv``.
        data: Current DataFrame. ``None`` until ``load_data()`` runs; replaced by
            the cleaned frame once ``preprocess_data()`` finishes.
        encoder: ``OneHotEncoder`` fitted on the ``City`` column; ``None`` until
            ``preprocess_data()`` runs.
    """

    def __init__(self, file_path):
        """Remember where the raw CSV lives; nothing is read yet."""
        self.file_path = file_path
        self.data = None
        self.encoder = None

    def load_data(self):
        """Read ``self.file_path`` into ``self.data`` with ``pandas.read_csv``."""
        self.data = pd.read_csv(self.file_path)
        print("Data loaded successfully.")

    def preprocess_data(self, min_city_count=400, max_age=30):
        """Filter, recode and one-hot encode ``self.data``.

        The result replaces ``self.data``, so this method is meant to be called
        once per ``load_data()``; a second call would no longer find the ``id``
        column it drops.

        Args:
            min_city_count: Keep only cities that appear in at least this many
                rows. Counts are taken before any other row filter is applied.
            max_age: Keep only rows whose ``Age`` is at most this value.

        Raises:
            ValueError: If ``load_data()`` has not been called yet.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Please call `load_data()` first.")

        data = self.data.drop(['id'], axis=1)
        data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

        # All row filters are independent per-row tests, so they are combined
        # into one mask. The explicit copy means the column assignments below
        # write to an independent frame, not to a slice of the raw table.
        city_counts = data['City'].value_counts()
        frequent_cities = city_counts[city_counts >= min_city_count].index
        keep = (
            data['City'].isin(frequent_cities)
            & (data['Profession'] == 'Student')
            & (data['Age'] <= max_age)
            & (data['Academic Pressure'] > 0)
            & (data['Study Satisfaction'] > 0)
            & (data['Sleep Duration'] != 'Others')
            & (data['Dietary Habits'] != 'Others')
            & (data['Degree'] != 'Others')
        )
        data = data.loc[keep].copy()
        data = data.drop(['Profession', 'Work Pressure', 'Job Satisfaction'], axis=1)

        # Ordinal / categorical recoding. Values missing from a mapping become
        # NaN and are removed by the dropna() further down.
        sleep_map = {'Less than 5 hours': 0, '5-6 hours': 1, '7-8 hours': 2, 'More than 8 hours': 3}
        data['Sleep Duration'] = data['Sleep Duration'].map(sleep_map)

        diet_map = {'Healthy': 0, 'Unhealthy': 1, 'Moderate': 2}
        data['Dietary Habits'] = data['Dietary Habits'].map(diet_map)

        # Degree -> coarse education level. Patterns are substring matches applied
        # in order, and a later match overwrites an earlier one. The order matters
        # because e.g. 'BA' is a substring of 'MBA'. Dots are escaped so that
        # 'B.Ed' only matches a literal dot.
        degree_patterns = {
            r'BSc|BCA|B\.Ed|BHM|B\.Pharm|B\.Com|BE|BA|B\.Arch|B\.Tech|BBA|LLB': 'Graduated',
            r'MSc|MCA|M\.Ed|M\.Pharm|M\.Com|ME|MA|M\.Arch|M\.Tech|MBA|LLM': 'Post Graduated',
            'Class 12': 'Higher Secondary'
        }
        # Start from an all-missing column so 'New_Degree' exists even when no
        # pattern matches; unmatched degrees stay NaN and are dropped below.
        degree_level = pd.Series(np.nan, index=data.index, dtype=object)
        for pattern, category in degree_patterns.items():
            matches = data['Degree'].str.contains(pattern, regex=True, na=False)
            degree_level[matches] = category

        new_degree_map = {'Graduated': 0, 'Post Graduated': 1, 'Higher Secondary': 2}
        data['New_Degree'] = degree_level.map(new_degree_map)

        yes_no = {'Yes': 1, 'No': 0}
        for column in ('Have you ever had suicidal thoughts ?', 'Family History of Mental Illness'):
            data[column] = data[column].map(yes_no)

        data = data.dropna()

        # One-hot encode City; the fitted encoder is kept on the instance.
        self.encoder = OneHotEncoder(sparse_output=False)
        encoded_cities = self.encoder.fit_transform(data[['City']])
        city_encoded_df = pd.DataFrame(
            encoded_cities,
            columns=self.encoder.get_feature_names_out(['City']),
            index=data.index,
        )
        self.data = pd.concat([data, city_encoded_df], axis=1).drop(['City', 'Degree'], axis=1)

        print("Preprocessing complete.")

    def save_data(self, output_path):
        """Save the preprocessed dataset to a CSV file.

        Args:
            output_path: Destination path; the index is not written.

        Raises:
            ValueError: If ``self.data`` is still ``None``.
        """
        if self.data is None:
            raise ValueError("No data to save. Please preprocess the data first.")

        self.data.to_csv(output_path, index=False)
        print(f"Data saved to {output_path}.")


**Pre-process the dataset**

In [None]:
# Run the whole pipeline once: read the raw CSV, clean/encode it, and write the
# processed table into the current working directory.
raw_csv_path = '/content/Student Depression Dataset.csv'
processed_csv_path = f'{os.getcwd()}/Processed_StudentDepression.csv'

preprocessor = DataPreprocessor(file_path=raw_csv_path)
preprocessor.load_data()
preprocessor.preprocess_data()
preprocessor.save_data(output_path=processed_csv_path)

**Load the cleaned data**

In [None]:
# Reload the processed table from disk so every later cell works from the saved CSV.
df = pd.read_csv(f'{os.getcwd()}/Processed_StudentDepression.csv')

# All figures produced below are written under this directory.
FIG_PATH = "./graphics/"

# Make sure the figure directory is there, and report which case applied.
if os.path.exists(FIG_PATH):
    print(f"Directory {FIG_PATH} already exists.")
else:
    os.makedirs(FIG_PATH)
    print(f"Directory {FIG_PATH} created.")

**Perform exploratory data analysis (EDA) using a correlation matrix and heatmap**

In [None]:
# Columns that go into the correlation heatmap; the one-hot City_* columns
# are not part of this selection.
main_data = df[['Gender', 'Age', 'Academic Pressure', 'CGPA',
       'Study Satisfaction', 'Sleep Duration', 'Dietary Habits',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression',
       'New_Degree']]

# A bare `main_data.head(3)` in the middle of a cell is evaluated and thrown
# away; display() (provided by the Jupyter/IPython kernel) renders the preview.
display(main_data.head(3))

plt.figure(figsize=(20, 10))

# Pairwise correlations, annotated with a larger font so the values stay legible.
sns.heatmap(
    main_data.corr(),
    annot=True,
    cmap='coolwarm',
    annot_kws={"size": 12}  # Annotation font size
)

# Larger tick labels; x labels are rotated so the long column names do not overlap.
plt.xticks(fontsize=14, rotation=45)
plt.yticks(fontsize=14)

# Write the figure to disk before showing it.
plt.savefig(f'{FIG_PATH}mental_health_heatmap.png', bbox_inches='tight', dpi=300)
plt.show()

***Model training***

Split the data into train/test (80/20), standardize and test-run one classifier.

In [None]:
## FEATURES / TARGET
# Every column except the label is a feature; both are handed on as NumPy arrays.
y = df['Depression'].values  # Target
X = df.drop(columns=['Depression']).values  # Features

## TRAIN / TEST SPLIT (80 / 20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## STANDARDIZATION
# The scaler is fitted on the training split only and then reused on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## BASELINE CLASSIFIER: LOGISTIC REGRESSION
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

## TEST-SET ACCURACY
score = model.score(X_test_scaled, y_test)
print(f"Accuracy: {score*100:.2f}%")

**Plot confusion matrix for the single trained classifier**

In [None]:
## PLOTTING THE CONFUSION MATRIX
# Predictions of the baseline classifier on the held-out split.
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted: 0', 'Predicted: 1'],
    yticklabels=['Real: 0', 'Real: 1'],
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Real Values')

# Write the figure to disk before showing it.
fig.savefig(f'{FIG_PATH}mental_health_confusion_matrix.png', bbox_inches='tight', dpi=300)
plt.show()

**Perform grid search over the parameter spaces for each model**

In [None]:
# Silence scikit-learn ConvergenceWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Candidate estimators, keyed by the display name used in results and plots.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "LightGBM": lgb.LGBMClassifier(random_state=42, verbose=-1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Hyper-parameter grid for each estimator; keys must match `models`.
param_grids = {
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10],
        "solver": ["liblinear", "saga"],
        "max_iter": [100, 200, 300]
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.5],
        "max_depth": [3, 5, 10]
    },
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    },
    "K-Nearest Neighbors": {
        "n_neighbors": [3, 5, 10],
        "weights": ["uniform", "distance"],
        "algorithm": ["auto", "ball_tree", "kd_tree"]
    },
    "LightGBM": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.5],
        "max_depth": [3, 5, 10]
    }
}

# name -> {"accuracy": best cross-validated score, "best_params": winning grid point}
best_model_results = {}

# The loop variable is deliberately NOT called `model`: that name holds the
# fitted baseline classifier from the earlier cell and must not be overwritten
# by an unfitted candidate.
for name, estimator in models.items():
    print(f"Running GridSearchCV for {name}...")

    # 2-fold search over this estimator's grid, scored by accuracy, on all cores.
    grid_search = GridSearchCV(estimator, param_grids[name], cv=2, n_jobs=-1, scoring="accuracy")

    # Fit the grid search (ignoring warnings raised while fitting)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid_search.fit(X_train_scaled, y_train)

    # `best_score_` is the cross-validated score on the training data,
    # not the accuracy on the held-out test split.
    best_model_results[name] = {
        "accuracy": grid_search.best_score_,
        "best_params": grid_search.best_params_
    }

# Sort the models by accuracy, best first
best_model_results_sorted = dict(sorted(best_model_results.items(), key=lambda item: item[1]['accuracy'], reverse=True))

# Print sorted results
print(best_model_results_sorted)


In [None]:
# Bar chart of the best cross-validated accuracy per model, best model on top.
model_labels = list(best_model_results_sorted.keys())
model_scores = [result["accuracy"] for result in best_model_results_sorted.values()]

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(
    x=model_scores,
    y=model_labels,
    palette='Blues',
    ax=ax,
)

ax.set_xlabel('Accuracy', fontsize=14)
ax.set_title('Grid Search Model Comparison', fontsize=18)

# Larger tick labels, including the model names on the y axis
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Write each accuracy as a percentage at the end of its bar
for row, score in enumerate(model_scores):
    ax.text(score, row, f'{score*100:.2f}%', color='black', va='center', fontsize=10)

# Write the figure to disk before showing it
fig.savefig(f'{FIG_PATH}mental_health_model_comparison_grid_search.png', bbox_inches='tight', dpi=300)
plt.show()

**Next, we perform XAI on the top-performing models**

LightGBM

In [None]:
# Suppress the FutureWarnings whose message mentions force_all_finite / ensure_all_finite
warnings.filterwarnings("ignore", category=FutureWarning, message=".*force_all_finite.*")
warnings.filterwarnings("ignore", category=FutureWarning, message=".*ensure_all_finite.*")

# Models that actually took part in the grid search (taken from `models` so the
# printed list cannot drift from what was trained).
model_names_str = list(models.keys())
print(model_names_str)

# Look up the grid-search winner for LightGBM, if one was recorded
model_name = "LightGBM"
best_params = best_model_results.get(model_name, {}).get("best_params", None)

# Feature names in the same column order as the training matrix
# ('Depression' is the target column)
feature_names = df.drop('Depression', axis=1).columns

# Use the tuned parameters when available, otherwise a fixed fallback configuration
if best_params:
    lgbm_model = lgb.LGBMClassifier(random_state=42, **best_params)
else:
    lgbm_model = lgb.LGBMClassifier(random_state=42, learning_rate=0.1, max_depth=3, n_estimators=200)

# Train the model
lgbm_model.fit(X_train_scaled, y_train)

# Evaluate the model on the held-out split
y_pred = lgbm_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"LightGBM Accuracy: {accuracy*100:.2f}%")

# Visualize the feature importance
plt.figure(figsize=(12, 8))

# Importance scores as reported by the fitted model, one per feature column
importance = lgbm_model.feature_importances_

# Feature indices ordered from most to least important
indices = np.argsort(importance)[::-1]

# Select top 10 features
top_features = indices[:10]

# Horizontal bar plot of the selected features
plt.barh(range(len(top_features)), importance[top_features], color='skyblue')

# Label each bar with the actual feature name
plt.yticks(range(len(top_features)), [feature_names[i] for i in top_features])

plt.title("LightGBM Feature Importance", fontsize=18)
plt.xlabel("Importance", fontsize=14)
plt.ylabel("Features", fontsize=14)

plt.tight_layout()
plt.grid()

# Save the plot as a PNG image in the figure directory
plt.savefig(f'{FIG_PATH}LightGBM_feature_importance_horizontal.png', bbox_inches='tight', dpi=300)
plt.show()

# Print the top 10 features and their importance scores
for rank, feature_idx in enumerate(top_features, start=1):
    print(f"{rank}. {feature_names[feature_idx]}: {importance[feature_idx]:f}")


Gradient Boosting

In [None]:
# Models that took part in the grid search
model_names_str = list(models.keys())
print(model_names_str)

model_name = "Gradient Boosting"

# Grid-search winner for Gradient Boosting; an empty dict falls back to the
# estimator's own defaults
best_params = best_model_results.get(model_name, {}).get("best_params", {})

# Feature names in the same column order as the training matrix
feature_names = df.drop('Depression', axis=1).columns

# Initialize the Gradient Boosting model with the selected parameters
gb_model = GradientBoostingClassifier(random_state=42, **best_params)

# Train the model
gb_model.fit(X_train_scaled, y_train)

# Evaluate the model on the held-out split
y_pred = gb_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Gradient Boosting Accuracy: {accuracy*100:.2f}%")

# Visualize the feature importance
plt.figure(figsize=(12, 8))

# Importance scores as reported by the fitted model, one per feature column
importance = gb_model.feature_importances_

# Feature indices ordered from most to least important
indices = np.argsort(importance)[::-1]

# Select top 10 features
top_features = indices[:10]

# Horizontal bar plot of the selected features
plt.barh(range(len(top_features)), importance[top_features], color='lightgreen')

# Label each bar with the actual feature name
plt.yticks(range(len(top_features)), [feature_names[i] for i in top_features])

plt.title("Gradient Boosting Feature Importance", fontsize=18)
plt.xlabel("Importance", fontsize=14)
plt.ylabel("Features", fontsize=14)

plt.tight_layout()
plt.grid()

# Save under FIG_PATH like every other figure in this notebook (the bare
# filename used before wrote this one into the working directory instead)
plt.savefig(f'{FIG_PATH}gradient_boosting_feature_importance_horizontal.png', bbox_inches='tight', dpi=300)
plt.show()

# Print the top 10 features and their importance scores
for rank, feature_idx in enumerate(top_features, start=1):
    print(f"{rank}. {feature_names[feature_idx]}: {importance[feature_idx]:.6f}")
