In [2]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
df = pd.read_csv("./Mental Health Dataset.csv")
df.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:31,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:50,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:39,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:59,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:36,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


In [4]:
#Capitalizing first letter of all columns
df.columns = df.columns.str.capitalize()
print(df.columns)

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'Self_employed',
       'Family_history', 'Treatment', 'Days_indoors', 'Growing_stress',
       'Changes_habits', 'Mental_health_history', 'Mood_swings',
       'Coping_struggles', 'Work_interest', 'Social_weakness',
       'Mental_health_interview', 'Care_options'],
      dtype='object')


In [5]:
#Drop Timestamp column
df.drop('Timestamp', axis=1, inplace=True)
df.head()

Unnamed: 0,Gender,Country,Occupation,Self_employed,Family_history,Treatment,Days_indoors,Growing_stress,Changes_habits,Mental_health_history,Mood_swings,Coping_struggles,Work_interest,Social_weakness,Mental_health_interview,Care_options
0,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


In [6]:
df.drop('Self_employed', axis=1, inplace=True)
df.head()

Unnamed: 0,Gender,Country,Occupation,Family_history,Treatment,Days_indoors,Growing_stress,Changes_habits,Mental_health_history,Mood_swings,Coping_struggles,Work_interest,Social_weakness,Mental_health_interview,Care_options
0,Female,United States,Corporate,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,Female,United States,Corporate,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,Female,United States,Corporate,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,Female,United States,Corporate,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,Female,United States,Corporate,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


In [7]:
#Converting Categorical variables to numerical using Label Encoding
LE = LabelEncoder()
for column in df.columns:
    df[column] = LE.fit_transform(df[column])

df.head()

Unnamed: 0,Gender,Country,Occupation,Family_history,Treatment,Days_indoors,Growing_stress,Changes_habits,Mental_health_history,Mood_swings,Coping_struggles,Work_interest,Social_weakness,Mental_health_interview,Care_options
0,0,34,1,0,1,0,3,1,2,2,0,1,2,1,1
1,0,34,1,1,1,0,3,1,2,2,0,1,2,1,0
2,0,34,1,1,1,0,3,1,2,2,0,1,2,1,2
3,0,34,1,1,1,0,3,1,2,2,0,1,2,0,2
4,0,34,1,1,1,0,3,1,2,2,0,1,2,1,2


In [8]:
#Split the dataset into features and target variable
X = df.drop('Treatment', axis=1)
Y = df['Treatment']

In [9]:
#Split the data into training and testing dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=40)

In [10]:
#Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
#Inialize and train the GradientBoost
start_time = time.time()
model =GradientBoostingClassifier(random_state=42)
model.fit(X_train, Y_train)
end_time = time.time()
time1 = end_time-start_time

In [14]:
#Make predictions on the test dataset
y_pred = model.predict(X_test)

In [15]:
#Evaluate the model
acc1 = accuracy_score(Y_test, y_pred)
print(accuracy_score(Y_test, y_pred))
print((classification_report(Y_test, y_pred)))
print(confusion_matrix(Y_test, y_pred))

0.8094688221709007
              precision    recall  f1-score   support

           0       0.83      0.52      0.64      3623
           1       0.81      0.95      0.87      7635

    accuracy                           0.81     11258
   macro avg       0.82      0.73      0.75     11258
weighted avg       0.81      0.81      0.80     11258

[[1869 1754]
 [ 391 7244]]


##**Other Commonly Used Models**

In [16]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
start_time = time.time()
model.fit(X_train, Y_train)
end_time = time.time()
y_pred = model.predict(X_test)
acc2=accuracy_score(Y_test, y_pred)
time2 = end_time-start_time
print(accuracy_score(Y_test, y_pred))

0.6922188665837626


In [17]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
start_time = time.time()
model.fit(X_train, Y_train)
end_time = time.time()
time3= end_time-start_time
y_pred = model.predict(X_test)
acc3=accuracy_score(Y_test, y_pred)
print(accuracy_score(Y_test, y_pred))

0.70811867116717


In [18]:
from sklearn import svm
model = svm.SVC()
start_time = time.time()
model.fit(X_train, Y_train)
end_time = time.time()
time4= end_time-start_time
y_pred = model.predict(X_test)
acc4=accuracy_score(Y_test, y_pred)
print(accuracy_score(Y_test, y_pred))

0.773583229703322


In [19]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
start_time = time.time()
model.fit(X_train, Y_train)
end_time = time.time()
time5= end_time-start_time
y_pred = model.predict(X_test)
acc5=accuracy_score(Y_test, y_pred)
print(accuracy_score(Y_test, y_pred))

0.766743648960739


## Using a well Defined Neural Network for around 150 epochs, we tend to achieve an atmost accuracy of 80%

In [20]:
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import numpy as np

# Split the dataset
# 60% train, 20% test, 20% validation
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Build the model
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the fit function
def fit_model(model, X_train, y_train, epochs=150, validation_data=(X_val, y_val)):
    model.fit(X_train, y_train, epochs=epochs, validation_data=validation_data)

# Define the predict function
def predict_model(model, X_test):
    return model.predict(X_test)

# Define the accuracy function
def get_accuracy(model, X_test, y_test):
    predictions = predict_model(model, X_test)
    rounded_predictions = tf.round(predictions).numpy().flatten()
    correct_predictions = np.equal(rounded_predictions, y_test)
    accuracy = np.mean(correct_predictions)
    return accuracy

# Fit the model
start_time = time.time()
fit_model(model, X_train, y_train, validation_data=(X_val, y_val))
end_time = time.time()
time6= end_time-start_time

# Get the accuracy
accuracy = get_accuracy(model, X_test, y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [25]:
data = {
    'Model': ['*Gradient Boosting Algorithm*', 'NBC', 'KNN', 'SVM', 'DecisionTree', 'ANN'],
    'Accuracy': [acc1, acc2, acc3, acc4, acc5, accuracy],
    'Time Taken': [f"{time1:.2f}s", f"{time2:.2f}s", f"{time3:.2f}s", f"{time4:.2f}s", f"{time5:.2f}s", f"{time6:.2f}s"]
}
table = pd.DataFrame(data)
table

Unnamed: 0,Model,Accuracy,Time Taken
0,*Gradient Boosting Algorithm*,0.809469,3.09s
1,NBC,0.692219,0.02s
2,KNN,0.708119,0.16s
3,SVM,0.773583,136.04s
4,DecisionTree,0.766744,0.09s
5,ANN,0.784134,863.11s
