# a.i. Depression Classification

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Cleaning

In [3]:
#Function to clean the data
def data_cleaning(file_path):
    df = pd.read_csv(file_path, index_col=0)
    df.dropna(inplace=True)
    
    return df

In [4]:
train_file = "../Datasets/ADS/final_training_data.csv"
test_file = "../Datasets/ADS/final_testing_data.csv"

#After data exploring we found that one of the data sample's turn was all NA. So we removed it
train_data_source = data_cleaning(train_file)
test_data = data_cleaning(test_file)

# Training

In [5]:
from sklearn.model_selection import train_test_split

#Select the features and target variables from the train set
X_tr = train_data_source.drop(['Depression', 'Gender', 'participant_id'], axis=1)
y_tr = train_data_source['Depression']

#Normalize the data using standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr)

#Split the data into training and validation set
X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=0.2, random_state=42)


In [6]:
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(10900, 88) (2725, 88) (10900,) (2725,)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None, 20, 40],  # Maximum number of levels in tree
    'min_samples_split': [5, 10]  # Minimum number of samples required to split a node
}

#Initialize the classifier
rf = RandomForestClassifier(random_state=42)

#Setup the grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)

#Fit grid_search to the data
grid_search.fit(X_train, y_train)


#Best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

#Best model from grid search
best_rf = grid_search.best_estimator_

#Predict on the validation set
y_pred = best_rf.predict(X_val)

#Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 300}
Validation Accuracy: 0.8469724770642202


In [13]:
rf_best = RandomForestClassifier(max_depth=None, max_features= 'sqrt', min_samples_split= 5, 
                                    n_estimators=300, random_state=42)
rf_best.fit(X_train, y_train)

# Performing predictions on the test data

In [22]:
#Function to aggregate turn-based decisions at the participant level
def aggregate_decisions(decisions):
    return int(decisions.mean() >= 0.5)

In [23]:
#Select the features and target variables from the test set
X_test = test_data.drop(['Depression', 'Gender', 'participant_id'], axis=1)
y_test = test_data['Depression']

#Normalize the test data using the scaler fitted on the training data
X_test = scaler.transform(X_test)

#Make predictions on the test set
y_pred_test = rf_best.predict(X_test)

In [24]:
'''
Aggregrate predictions on a participant level
'''
#Get the participant IDs and genders for the test set
participant_ids_test = test_data['participant_id']
genders_test = test_data['Gender']

#Aggregate the predictions at the participant level
participant_preds_test = {} #Have turns predictions for each participant
for participant_id, pred in zip(participant_ids_test, y_pred_test):
    if participant_id not in participant_preds_test:
        participant_preds_test[participant_id] = []
    participant_preds_test[participant_id].append(pred)

participant_final_preds_test = {} #Store aggregrated results
for participant_id, preds in participant_preds_test.items():
    participant_final_preds_test[participant_id] = aggregate_decisions(pd.Series(preds))

In [26]:
#Evaluate the predictions against the true labels
y_true_test = y_test.groupby(participant_ids_test).mean().round().astype(int)
y_pred_final_test = pd.Series(participant_final_preds_test)

In [27]:
#Calculate accuracy and balanced accuracy
accuracy_test = accuracy_score(y_true_test, y_pred_final_test)
balanced_accuracy_test = balanced_accuracy_score(y_true_test, y_pred_final_test)

#Calculate equality of opportunity (EO) for the test set
genders_test_aggregated = genders_test.groupby(participant_ids_test).first()
male_mask_test = (genders_test_aggregated == 1)
female_mask_test = (genders_test_aggregated == 0)

tpr_male_test = accuracy_score(y_true_test[male_mask_test], y_pred_final_test[male_mask_test])
tpr_female_test = accuracy_score(y_true_test[female_mask_test], y_pred_final_test[female_mask_test])

eo_test = 1 - abs(tpr_male_test - tpr_female_test)

In [31]:
#Print the evaluation metrics for the test set
print("Test Accuracy:", accuracy_test)
print("Test Balanced Accuracy:", round(balanced_accuracy_test,3))
print("Test Equality of Opportunity (EO):", round(eo_test,3))

#Print the evaluation metrics for male and female participants separately
print("Male Accuracy:", round(tpr_male_test,3))
print("Female Accuracy:", round(tpr_female_test,3))
print("Male Balanced Accuracy:", round(balanced_accuracy_score(y_true_test[male_mask_test], y_pred_final_test[male_mask_test]),3))
print("Female Balanced Accuracy:", balanced_accuracy_score(y_true_test[female_mask_test], y_pred_final_test[female_mask_test]))

Test Accuracy: 0.65
Test Balanced Accuracy: 0.464
Test Equality of Opportunity (EO): 0.542
Male Accuracy: 0.833
Female Accuracy: 0.375
Male Balanced Accuracy: 0.455
Female Balanced Accuracy: 0.5
