### Initial Setup

#### Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from os import getcwd

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### Load Dataset (`data/Student Depression Dataset.csv`)

In [None]:
# Seed reused by later cells (e.g. the train/test split) for reproducibility.
RANDOM_STATE = 513

cwd = getcwd()
data = "data"
raw_data_file = "Student Depression Dataset.csv"

# The CSV is resolved relative to the current working directory.
csv_path = "/".join((cwd, data, raw_data_file))

# "?" is read as a missing-value marker; rows with any missing value are dropped.
df = (
    pd.read_csv(csv_path, na_values="?")
    .dropna()
)

df.info()
df.head()

#### Cleaning Data

In [None]:
# The row identifier is not needed for training, so it is removed.
df = df.drop(columns="id")

# Encode the yes/no survey answers numerically: "Yes" -> 1, anything else -> 0.
yes_no_columns = (
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
)
for column in yes_no_columns:
    df[column] = df[column].map(lambda answer: 1 if answer == "Yes" else 0)

df.info()
df.head()

#### Train-Test Split

In [None]:
# The label is the "Depression" column; every other column is a feature.
target = df["Depression"]
attr = df.drop(columns="Depression")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
attr_train, attr_test, target_train, target_test = train_test_split(
    attr,
    target,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

#### Preprocessor Configuration

In [None]:
# Columns with dtype "object" are treated as categorical; all remaining
# columns are treated as numerical.
is_object = attr.dtypes == "object"
categorical_columns = attr.columns[is_object].tolist()
numerical_columns = attr.columns[~is_object].tolist()
print(f"categorical columns = {categorical_columns}")
print(f"numerical columns = {numerical_columns}")

# One-hot encode the categorical columns and min-max scale the numerical ones.
one_hot = OneHotEncoder(handle_unknown="infrequent_if_exist")
scaler = MinMaxScaler()
preprocessor = ColumnTransformer([
    ("categorical", one_hot, categorical_columns),
    ("numerical", scaler, numerical_columns),
])

### Random Forest Classification

#### Model Configuration - Random Forest

In [None]:
# Random forest stacked on the shared preprocessing step.
# The forest is built with randomness (bootstrap samples, feature subsets),
# so it is seeded with RANDOM_STATE to keep the reported metrics repeatable
# between runs, matching how the train/test split is seeded.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

#### Model Fitting & Predictions - Random Forest

In [None]:
# Train on the training split, then predict labels for the held-out split.
# Pipeline.fit returns the pipeline itself, so the two steps can be chained.
target_pred = model.fit(attr_train, target_train).predict(attr_test)

#### Metrics & Accuracy - Random Forest

In [None]:
# Score the held-out predictions: accuracy, confusion matrix, per-class report.
# `cm` is reused by the heatmap cell that follows.
acc = accuracy_score(target_test, target_pred)
cm = confusion_matrix(target_test, target_pred)
cr = classification_report(target_test, target_pred)

for result in (acc, cm, cr):
    print(result)

In [None]:
# Render the confusion matrix as an annotated heatmap with integer counts.
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="d", ax=ax, cmap="crest")
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

### CART Classification

#### Model Configuration - CART

In [None]:
# Single decision tree (CART) stacked on the shared preprocessing step.
# The tree permutes candidate features at random while searching for splits,
# so it is seeded with RANDOM_STATE to keep the reported metrics repeatable
# between runs, matching how the train/test split is seeded.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=RANDOM_STATE))
])

#### Model Fitting & Predictions - CART

In [None]:
# Fit the decision-tree pipeline and predict the held-out labels in one chain
# (Pipeline.fit returns the fitted pipeline).
target_pred = model.fit(attr_train, target_train).predict(attr_test)

#### Metrics & Accuracy - CART

In [None]:
# Evaluate the decision-tree predictions against the true test labels.
# `cm` is reused by the heatmap cell that follows.
acc = accuracy_score(target_test, target_pred)
cm = confusion_matrix(target_test, target_pred)
cr = classification_report(target_test, target_pred)

# One value per line, in the order: accuracy, confusion matrix, report.
print(acc, cm, cr, sep="\n")

In [None]:
# Annotated heatmap of the confusion matrix (cells show integer counts).
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="d", ax=ax, cmap="crest")
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

### KNN Classification

#### Model Configuration - KNN

In [None]:
# Number of neighbours passed to the classifier; change this value to experiment.
K = 5

# k-nearest-neighbours classifier stacked on the shared preprocessing step.
knn_steps = [
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier(n_neighbors=K)),
]
model = Pipeline(knn_steps)

#### Model Fitting & Predictions - KNN

In [None]:
# Fit the KNN pipeline on the training split and predict the held-out labels
# (Pipeline.fit returns the fitted pipeline, so the calls are chained).
target_pred = model.fit(attr_train, target_train).predict(attr_test)

#### Metrics & Accuracy - KNN

In [None]:
# Evaluate the KNN predictions: accuracy, confusion matrix, per-class report.
# `cm` is reused by the heatmap cell that follows.
acc = accuracy_score(target_test, target_pred)
cm = confusion_matrix(target_test, target_pred)
cr = classification_report(target_test, target_pred)

for result in (acc, cm, cr):
    print(result)

In [None]:
# Render the confusion matrix as an annotated heatmap with integer counts.
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="d", ax=ax, cmap="crest")
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

### Naive Bayes Classification

#### Model Configuration - Naive Bayes

In [None]:
# Same transformers as the default preprocessor, except the encoder returns a
# dense array (sparse_output=False); this was needed to resolve errors when
# fitting with the default sparse output.
dense_one_hot = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse_output=False)
nb_preprocessor = ColumnTransformer([
    ("categorical", dense_one_hot, categorical_columns),
    ("numerical", MinMaxScaler(), numerical_columns),
])

# Gaussian variant of naive Bayes, chosen for numerical data.
model = Pipeline([
    ("preprocessor", nb_preprocessor),
    ("classifier", GaussianNB()),
])

#### Model Fitting & Predictions - Naive Bayes

In [None]:
# Fit the naive Bayes pipeline and predict the held-out labels in one chain
# (Pipeline.fit returns the fitted pipeline).
target_pred = model.fit(attr_train, target_train).predict(attr_test)

#### Metrics & Accuracy - Naive Bayes

In [None]:
# Evaluate the naive Bayes predictions against the true test labels.
# `cm` is reused by the heatmap cell that follows.
acc = accuracy_score(target_test, target_pred)
cm = confusion_matrix(target_test, target_pred)
cr = classification_report(target_test, target_pred)

# One value per line, in the order: accuracy, confusion matrix, report.
print(acc, cm, cr, sep="\n")

In [None]:
# Annotated heatmap of the confusion matrix (cells show integer counts).
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="d", ax=ax, cmap="crest")
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

### Logistic Regression Classification

#### Model Configuration - Logistic Regression

In [None]:
# Logistic regression with default settings, fed by the dense-output
# preprocessor (`nb_preprocessor`) defined in the Naive Bayes section.
logreg_steps = [
    ("preprocessor", nb_preprocessor),
    ("classifier", LogisticRegression()),
]
model = Pipeline(logreg_steps)

#### Model Fitting & Predictions - Logistic Regression

In [None]:
# Fit the logistic-regression pipeline and predict the held-out labels
# (Pipeline.fit returns the fitted pipeline, so the calls are chained).
target_pred = model.fit(attr_train, target_train).predict(attr_test)

#### Metrics & Accuracy - Logistic Regression

In [None]:
# Evaluate the logistic-regression predictions: accuracy, confusion matrix,
# per-class report. `cm` is reused by the heatmap cell that follows.
acc = accuracy_score(target_test, target_pred)
cm = confusion_matrix(target_test, target_pred)
cr = classification_report(target_test, target_pred)

for result in (acc, cm, cr):
    print(result)

In [None]:
# Render the confusion matrix as an annotated heatmap with integer counts.
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="d", ax=ax, cmap="crest")
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

### ANN Classification

#### Model Configuration - ANN

In [None]:
# Multi-layer perceptron with a single hidden layer of 4 units, seeded with
# RANDOM_STATE and allowed up to 10000 iterations. It is fed by the
# dense-output preprocessor (`nb_preprocessor`) from the Naive Bayes section.
mlp = MLPClassifier(
    hidden_layer_sizes=(4,),
    max_iter=10000,
    random_state=RANDOM_STATE,
)
model = Pipeline([
    ("preprocessor", nb_preprocessor),
    ("classifier", mlp),
])

#### Model Fitting & Predictions - ANN

In [None]:
# Fit the neural-network pipeline and predict the held-out labels in one chain
# (Pipeline.fit returns the fitted pipeline).
target_pred = model.fit(attr_train, target_train).predict(attr_test)

#### Metrics & Accuracy - ANN

In [None]:
# Evaluate the neural-network predictions against the true test labels.
# `cm` is reused by the heatmap cell that follows.
acc = accuracy_score(target_test, target_pred)
cm = confusion_matrix(target_test, target_pred)
cr = classification_report(target_test, target_pred)

# One value per line, in the order: accuracy, confusion matrix, report.
print(acc, cm, cr, sep="\n")

In [None]:
# Annotated heatmap of the confusion matrix (cells show integer counts).
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="d", ax=ax, cmap="crest")
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

In [None]:
# Plot the training loss recorded by the fitted classifier step against its
# position in the recorded sequence.
loss_history = model.named_steps["classifier"].loss_curve_
sns.lineplot(x=range(len(loss_history)), y=loss_history)
plt.title('Loss Curve')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()