In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Data Preprocessing

In [172]:
df=pd.read_csv('Student Depression Dataset.csv')
df

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.90,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,140685,Female,27.0,Surat,Student,5.0,0.0,5.75,5.0,0.0,5-6 hours,Unhealthy,Class 12,Yes,7.0,1.0,Yes,0
27897,140686,Male,27.0,Ludhiana,Student,2.0,0.0,9.40,3.0,0.0,Less than 5 hours,Healthy,MSc,No,0.0,3.0,Yes,0
27898,140689,Male,31.0,Faridabad,Student,3.0,0.0,6.61,4.0,0.0,5-6 hours,Unhealthy,MD,No,12.0,2.0,No,0
27899,140690,Female,18.0,Ludhiana,Student,5.0,0.0,6.88,2.0,0.0,Less than 5 hours,Healthy,Class 12,Yes,10.0,5.0,No,1


In [173]:
df_filtered=df.drop(columns=['id','Job Satisfaction','Work Pressure'])

In [174]:
df_filtered = df_filtered[df_filtered['Profession'] == 'Student']
df_filtered=df_filtered.drop(columns=['Profession'])

In [175]:
top_cities = df_filtered['City'].value_counts().nlargest(10).index
df_filtered['City'] = df_filtered['City'].apply(lambda x: x if x in top_cities else 'Other')

top_degree = df_filtered['Degree'].value_counts().nlargest(10).index
df_filtered['Degree'] = df_filtered['Degree'].apply(lambda x: x if x in top_degree else 'Other')


In [176]:
df_filtered.isna().values.any()
df_filtered=df_filtered.dropna()
df_filtered.isna().values.any()

False

In [177]:
X = df_filtered.iloc[:, 0:-1].values
y = df_filtered.iloc[:, -1].values

In [178]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
columns_to_encode = [0, -1, -4]
for col in columns_to_encode:
    X[:, col] = le.fit_transform(X[:, col])

In [179]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2,6,7,8])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [180]:
X

array([[0.0, 0.0, 0.0, ..., 3.0, 1.0, 0],
       [0.0, 0.0, 0.0, ..., 3.0, 2.0, 1],
       [0.0, 0.0, 0.0, ..., 9.0, 1.0, 1],
       ...,
       [0.0, 0.0, 0.0, ..., 12.0, 2.0, 0],
       [0.0, 0.0, 0.0, ..., 10.0, 5.0, 0],
       [0.0, 0.0, 0.0, ..., 2.0, 3.0, 1]], dtype=object)

In [181]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [182]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Logistic Regression

In [86]:
from sklearn.linear_model import LogisticRegression
classifierLR = LogisticRegression(random_state = 0)
classifierLR.fit(X_train, y_train)

In [87]:
yLR_pred=classifierLR.predict(X_test)
np.concatenate((yLR_pred.reshape(len(yLR_pred),1), y_test.reshape(len(y_test),1)),1)

array([[1, 1],
       [0, 0],
       [0, 1],
       ...,
       [1, 1],
       [1, 1],
       [0, 0]], dtype=int64)

In [88]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, yLR_pred)
print(cm)
accuracy_score(y_test, yLR_pred)

[[1797  489]
 [ 381 2907]]


0.8439181916038752

# K-NN

In [89]:
from sklearn.neighbors import KNeighborsClassifier
classifierKNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifierKNN.fit(X_train, y_train)

In [90]:
yKNN_pred = classifierKNN.predict(X_test)
np.concatenate((yKNN_pred.reshape(len(yKNN_pred),1), y_test.reshape(len(y_test),1)),1)

array([[1, 1],
       [0, 0],
       [0, 1],
       ...,
       [1, 1],
       [1, 1],
       [0, 0]], dtype=int64)

In [91]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, yKNN_pred)
print(cm)
accuracy_score(y_test, yKNN_pred)

[[1521  765]
 [ 439 2849]]


0.7839971295299605

# SVM

In [95]:
from sklearn.svm import SVC
classifierSVM = SVC(kernel = 'linear', random_state = 0)
classifierSVM.fit(X_train, y_train)

In [97]:
ySVM_pred = classifierSVM.predict(X_test)
np.concatenate((ySVM_pred.reshape(len(ySVM_pred),1), y_test.reshape(len(y_test),1)),1)

array([[1, 1],
       [0, 0],
       [0, 1],
       ...,
       [1, 1],
       [1, 1],
       [0, 0]], dtype=int64)

In [98]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, ySVM_pred)
print(cm)
accuracy_score(y_test, ySVM_pred)

[[1795  491]
 [ 371 2917]]


0.8453534266236096

# Naive Bayes

In [100]:
from sklearn.naive_bayes import GaussianNB
classifierNB = GaussianNB()
classifierNB.fit(X_train, y_train)

In [101]:
yNB_pred = classifierNB.predict(X_test)
np.concatenate((yNB_pred.reshape(len(yNB_pred),1), y_test.reshape(len(y_test),1)),1)

array([[1, 1],
       [0, 0],
       [0, 1],
       ...,
       [1, 1],
       [1, 1],
       [0, 0]], dtype=int64)

In [102]:
cm = confusion_matrix(y_test, yNB_pred)
print(cm)
accuracy_score(y_test, yNB_pred)

[[1787  499]
 [ 504 2784]]


0.8200574094007894

# Decision Tree Classification

In [103]:
from sklearn.tree import DecisionTreeClassifier
classifierDT = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifierDT.fit(X_train, y_train)

In [104]:
yDT_pred = classifierDT.predict(X_test)
np.concatenate((yDT_pred.reshape(len(yDT_pred),1), y_test.reshape(len(y_test),1)),1)

array([[1, 1],
       [0, 0],
       [1, 1],
       ...,
       [1, 1],
       [1, 1],
       [0, 0]], dtype=int64)

In [105]:
cm = confusion_matrix(y_test, yDT_pred)
print(cm)
accuracy_score(y_test, yDT_pred)

[[1663  623]
 [ 641 2647]]


0.7732328668819519

# Random Forest Classification

In [124]:
from sklearn.ensemble import RandomForestClassifier
classifierRF = RandomForestClassifier(n_estimators = 8, criterion = 'entropy', random_state = 0)#8 estimators is the best number connecting with entropy
classifierRF.fit(X_train, y_train)

In [125]:
yRF_pred = classifierRF.predict(X_test)
np.concatenate((yRF_pred.reshape(len(yRF_pred),1), y_test.reshape(len(y_test),1)),1)

array([[1, 1],
       [0, 0],
       [0, 1],
       ...,
       [1, 1],
       [1, 1],
       [0, 0]], dtype=int64)

In [126]:
cm = confusion_matrix(y_test, yRF_pred)
print(cm)
accuracy_score(y_test, yRF_pred)

[[1829  457]
 [ 552 2736]]


0.8189809831359885

# Building the ANN

In [222]:
ann = tf.keras.models.Sequential()

In [223]:
ann.add(tf.keras.layers.Dense(units=8, activation='relu'))

In [224]:
ann.add(tf.keras.layers.Dense(units=8, activation='relu'))

In [225]:
ann.add(tf.keras.layers.Dense(units=6, activation='sigmoid'))

In [226]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [227]:
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [228]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=7)
ann.fit(X_train, y_train, batch_size=32, epochs=150, callbacks=[early_stopping])

Epoch 1/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 642us/step - accuracy: 0.7143 - loss: 0.5781
Epoch 2/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 652us/step - accuracy: 0.8462 - loss: 0.3758
Epoch 3/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 660us/step - accuracy: 0.8494 - loss: 0.3670
Epoch 4/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 673us/step - accuracy: 0.8514 - loss: 0.3533
Epoch 5/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 668us/step - accuracy: 0.8522 - loss: 0.3552
Epoch 6/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 652us/step - accuracy: 0.8496 - loss: 0.3508
Epoch 7/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 656us/step - accuracy: 0.8495 - loss: 0.3523
Epoch 8/150
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 661us/step - accuracy: 0.8452 - loss: 0.3544
Epoch 9/150
[1m

<keras.src.callbacks.history.History at 0x242cbdf66f0>

In [229]:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)
f1=f1_score(y_test, y_pred)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
print(f1)

[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 725us/step
[[1 1]
 [0 0]
 [0 1]
 ...
 [1 1]
 [1 1]
 [0 0]]
0.8679415750639964


In [230]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[1815  471]
 [ 406 2882]]


0.8426623609616075

In [145]:
#2 hidden layers with relu and 6 x 2 Neurons, optimizer = 'adam', loss = 'binary_crossentropy', batch_size = 32, epochs = 100 ACCURACY = 84.3%
#2 hidden layers with relu and 8 x 2 Neurons, optimizer = 'adam', loss = 'binary_crossentropy', batch_size = 32, epochs = 44 ACCURACY = 84.4%

Name: tensorflow
Version: 2.17.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: C:\Users\kamil\anaconda3\envs\A1\Lib\site-packages
Requires: tensorflow-intel
Required-by: 
Note: you may need to restart the kernel to use updated packages.


# Data Preprocessing V2

In [142]:
df2=pd.read_csv('Student Depression Dataset.csv')

In [143]:
df2_filtered=df2.drop(columns=['id','Job Satisfaction','Work Pressure','City'])
df2_filtered = df2_filtered[df2_filtered['Profession'] == 'Student']
df2_filtered=df2_filtered.drop(columns=['Profession'])

top_degree = df2_filtered['Degree'].value_counts().nlargest(10).index
df2_filtered['Degree'] = df2_filtered['Degree'].apply(lambda x: x if x in top_degree else 'Other')

In [144]:
df2_filtered.isna().values.any()
df2_filtered=df2_filtered.dropna()
df2_filtered.isna().values.any()

False

In [145]:
X = df2_filtered.iloc[:, 0:-1].values
y = df2_filtered.iloc[:, -1].values
df2_filtered

Unnamed: 0,Gender,Age,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,5.0,8.97,2.0,5-6 hours,Healthy,Other,Yes,3.0,1.0,No,1
1,Female,24.0,2.0,5.90,5.0,5-6 hours,Moderate,Other,No,3.0,2.0,Yes,0
2,Male,31.0,3.0,7.03,5.0,Less than 5 hours,Healthy,Other,No,9.0,1.0,Yes,0
3,Female,28.0,3.0,5.59,2.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,Female,25.0,4.0,8.13,3.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,Female,27.0,5.0,5.75,5.0,5-6 hours,Unhealthy,Class 12,Yes,7.0,1.0,Yes,0
27897,Male,27.0,2.0,9.40,3.0,Less than 5 hours,Healthy,MSc,No,0.0,3.0,Yes,0
27898,Male,31.0,3.0,6.61,4.0,5-6 hours,Unhealthy,Other,No,12.0,2.0,No,0
27899,Female,18.0,5.0,6.88,2.0,Less than 5 hours,Healthy,Class 12,Yes,10.0,5.0,No,1


In [146]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
columns_to_encode = [0, -1, -4]
for col in columns_to_encode:
    X[:, col] = le.fit_transform(X[:, col])

In [147]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [5,6,7])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [148]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [149]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [150]:
#######

In [239]:
ann = tf.keras.models.Sequential()

In [240]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=4, activation='sigmoid'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [241]:
ann.compile(optimizer='Nadam', loss='binary_crossentropy', metrics=['accuracy'])

In [242]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=7)
ann.fit(X_train, y_train, batch_size=32, epochs=128, callbacks=[early_stopping])

Epoch 1/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 681us/step - accuracy: 0.5352 - loss: 0.7361
Epoch 2/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 662us/step - accuracy: 0.8473 - loss: 0.4869
Epoch 3/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 641us/step - accuracy: 0.8457 - loss: 0.4115
Epoch 4/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 642us/step - accuracy: 0.8527 - loss: 0.3773
Epoch 5/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 667us/step - accuracy: 0.8498 - loss: 0.3714
Epoch 6/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 668us/step - accuracy: 0.8449 - loss: 0.3701
Epoch 7/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step - accuracy: 0.8455 - loss: 0.3626
Epoch 8/128
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 669us/step - accuracy: 0.8465 - loss: 0.3597
Epoch 9/128
[1m

<keras.src.callbacks.history.History at 0x24b1ea30f50>

In [243]:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)
f1=f1_score(y_test, y_pred)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
print(f1)

[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 671us/step
[[1 1]
 [0 0]
 [0 1]
 ...
 [1 1]
 [1 1]
 [0 0]]
0.8675716440422323


In [244]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[1820  466]
 [ 412 2876]]


0.8424829565841406

In [46]:
#Without city in dataset and having ANN with 8 x 2 relu hidden and output on sigmoid during 31 st epochs got a result on 84,6% with F-Score: 0,87