In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide settings: render every column of a DataFrame, and keep one
# seed value for the steps that accept a random state.
pd.options.display.max_columns = None
SEED = 42

In [None]:
# The raw survey export is tab-separated.
data = pd.read_csv("data.csv", sep="\t")
data

In [None]:
# (rows, columns) of the raw table loaded above.
data.shape

### Remove these features: they are extra information collected with the survey and have no effect on whether a person is depressed
* QxE
* QxI
* introelapse
* testelapse
* surveyelapse
* engnat
* hand
* orientation
* voted
* screensize
* uniquenetworklocation
* source
* VCLx
* country

In [None]:
# Columns to discard: the QxE / QxI columns of the 42 questions, the 16 VCL
# columns, and the survey-metadata columns listed in the markdown above.
removedFeatures = (
    [f"Q{number}E" for number in range(1, 43)]
    + [f"Q{number}I" for number in range(1, 43)]
    + [f"VCL{number}" for number in range(1, 17)]
    + [
        "source",
        "introelapse",
        "testelapse",
        "surveyelapse",
        "engnat",
        "hand",
        "orientation",
        "voted",
        "country",
        "screensize",
        "uniquenetworklocation",
    ]
)

# Persist the reduced table so the following sections can start from it.
depression = data.drop(columns=removedFeatures)
depression.to_csv("depression.csv", index=False)

## MODIFIED DATASET

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Start from the reduced table written by the previous section.
depression = pd.read_csv("depression.csv")
depression.head()

## NULL VALUES

Only the "major" column has null values.

In [None]:
plt.style.use("seaborn-v0_8")

# Count nulls per column and keep only the columns that actually have some.
missing_values = depression.isna().sum()
missing_values_nonzero = missing_values.loc[missing_values.gt(0)]

plt.bar(x=missing_values_nonzero.index, height=missing_values_nonzero.values)
plt.title("Missing Values in Columns")
plt.ylabel("Number of Missing Values")
plt.show()

## Education

Analysis 

In [None]:
# Fold code 0 into code 1; codes 1-4 are kept as they are.
# Series.map with a dict turns any code that is not a key into NaN.
depression["education"] = depression["education"].map({0: 1, 1: 1, 2: 2, 3: 3, 4: 4})


def changeEducationTitle(title) -> str:
    """Return the display label for an education code.

    Codes 0 and 1 share the label "Less than high school"; 2, 3 and 4 map to
    "High school", "University degree" and "Graduate degree". Any other value
    is handed back unchanged, so the result is not always a string.
    """
    labels_by_code = (
        (0, "Less than high school"),
        (1, "Less than high school"),
        (2, "High school"),
        (3, "University degree"),
        (4, "Graduate degree"),
    )
    for code, label in labels_by_code:
        if title == code:
            return label
    return title


# Text label for every row's education code; used only as the hue of the plot.
education_string = depression["education"].apply(changeEducationTitle)

# Number of respondents per education code.
plt.figure(figsize=(10, 5))
sns.countplot(x=depression["education"], hue=education_string)
plt.show()

### The major column has null values and has no effect on whether a person is depressed

In [None]:
# Remove the "major" column from the working table (modifies it in place).
depression.drop(columns=["major"], inplace=True)

## Urban

Analysis

In [None]:
# Fold code 0 into code 3; codes 1-3 are kept as they are.
# Series.map with a dict turns any code that is not a key into NaN.
depression["urban"] = depression["urban"].map({0: 3, 1: 1, 2: 2, 3: 3})


def changeUrbanValues(value):
    """Return the display label for an urban code (1, 2 or 3).

    Values other than 1, 2 and 3 are returned unchanged.
    """
    if value == 1:
        label = "Rural (country side)"
    elif value == 2:
        label = "Suburban"
    elif value == 3:
        label = "Urban (town, city)"
    else:
        label = value
    return label


# Text label for every row's urban code; used only as the hue of the plot.
urban = depression["urban"].apply(changeUrbanValues)

# Number of respondents per urban code.
plt.figure(figsize=(10, 5))
sns.countplot(x=depression["urban"], hue=urban)
plt.show()

## Gender

Analysis


In [None]:
# Fold code 0 into code 2; codes 1-3 are kept as they are.
# Series.map with a dict turns any code that is not a key into NaN.
depression["gender"] = depression["gender"].map({0: 2, 1: 1, 2: 2, 3: 3})


def changeGenderValue(value):
    """Return "Male" for 1, "Female" for 2 or 0, and "Other" for anything else."""
    if value == 1:
        return "Male"
    return "Female" if value in (2, 0) else "Other"


# Text label for every row's gender code; used only as the hue of the plot.
gender = depression["gender"].apply(changeGenderValue)

# Number of respondents per gender code.
plt.figure(figsize=(10, 5))
sns.countplot(x=depression["gender"], hue=gender)
plt.show()

## Religion 

Analysis

In [None]:
def updateEducationValue(value):
    """Return 12 when ``value`` is 0, otherwise return ``value`` unchanged.

    Despite its name, this notebook applies it to the "religion" column.
    """
    return 12 if value == 0 else value


# Replace religion code 0 with 12 (the code labelled "Other" below); other codes are untouched.
depression["religion"] = depression["religion"].apply(updateEducationValue)


def changeReliginValues(value) -> str:
    """Return the display label for a religion code from 0 to 12.

    Codes 0 and 12 are both labelled "Other". A value outside 0-12 is
    returned unchanged, so the result is not always a string.
    """
    # Position in the tuple is the religion code.
    labels = (
        "Other",
        "Agnostic",
        "Atheist",
        "Buddhist",
        "Christian (Catholic)",
        "Christian (Mormon)",
        "Christian (Protestant)",
        "Christian (Other)",
        "Hindu",
        "Jewish",
        "Muslim",
        "Sikh",
        "Other",
    )
    for code, label in enumerate(labels):
        if value == code:
            return label
    return value


# Text label for every row's religion code; used only as the hue of the plot.
religin = depression["religion"].apply(changeReliginValues)
# Row count per religion code.
display(depression["religion"].value_counts())

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["religion"], hue=religin)
plt.show()

## Race

Analysis

In [None]:
# Bring race codes from the raw scale down to 1-7, the codes changeRaceValues
# labels (raw codes are presumably 10, 20, ..., 70 — confirm against the codebook).
# Only values still on the raw scale (>= 10) are divided, so re-running this
# cell does not shrink the column a second time. On a first run the result is
# the same as dividing every value by 10.
# The former mid-cell `depression["race"].head()` was removed: it is not the
# last expression of the cell, so it never displayed anything.
depression["race"] = depression["race"].where(
    depression["race"] < 10, depression["race"] / 10
)


def changeRaceValues(value) -> str:
    """Return the display label for a race code from 1 to 7.

    A value outside 1-7 is returned unchanged, so the result is not always a
    string.
    """
    # Position in the tuple, counted from 1, is the race code.
    labels = (
        "Asian",
        "Arab",
        "Black",
        "Indigenous Australian",
        "Native American",
        "White",
        "Other",
    )
    for code, label in enumerate(labels, start=1):
        if value == code:
            return label
    return value


# Text label for every row's race code; used only as the hue of the plot.
race = depression["race"].apply(changeRaceValues)

# Row count per race code.
display(depression["race"].value_counts())

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["race"], hue=race)
plt.show()

## Personality 

Analysis


#### Extraverted and Enthusiastic

In [None]:
def changeFromToinTIPI(value, From, to):
    """Return ``to`` when ``value`` equals ``From``; otherwise return ``value``."""
    return to if value == From else value


# Replace answer 0 in TIPI1 with 5 (0 presumably marks a missing answer — TODO confirm).
# NOTE(review): the replacement value differs between TIPI columns in this
# notebook (5, 6 or 7); confirm each one was chosen deliberately.
depression["TIPI1"] = depression["TIPI1"].apply(
    lambda value: changeFromToinTIPI(value, 0, 5)
)


def changeTIPIValues(value):
    """Return the agreement label for a TIPI answer from 1 to 7.

    1 is "Disagree strongly" and 7 is "Agree strongly". A value outside 1-7
    is returned unchanged.
    """
    # Position in the tuple, counted from 1, is the answer code.
    scale = (
        "Disagree strongly",
        "Disagree moderately",
        "Disagree a little",
        "Neither agree nor disagree",
        "Agree a little",
        "Agree moderately",
        "Agree strongly",
    )
    for answer, label in enumerate(scale, start=1):
        if value == answer:
            return label
    return value


# Agreement label for every TIPI1 answer; used only as the hue of the plot.
tipi = depression["TIPI1"].apply(changeTIPIValues)


# Number of respondents per TIPI1 answer.
plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI1"], hue=tipi)

plt.show()

### Critical and Quarrelsome

In [None]:
# Replace answer 0 in TIPI2 with 5 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI2"] = depression["TIPI2"].apply(
    lambda value: changeFromToinTIPI(value, 0, 5)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI2"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI2"], hue=tipi)
plt.show()

### Dependable and Self-disciplined.

In [None]:
# Replace answer 0 in TIPI3 with 6 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI3"] = depression["TIPI3"].apply(
    lambda value: changeFromToinTIPI(value, 0, 6)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI3"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI3"], hue=tipi)
plt.show()

### Anxious and Easily Upset

In [None]:
# Replace answer 0 in TIPI4 with 6 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI4"] = depression["TIPI4"].apply(
    lambda value: changeFromToinTIPI(value, 0, 6)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI4"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI4"], hue=tipi)
plt.show()

### Open to new experiences and complex

In [None]:
# Replace answer 0 in TIPI5 with 6 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI5"] = depression["TIPI5"].apply(
    lambda value: changeFromToinTIPI(value, 0, 6)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI5"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI5"], hue=tipi)
plt.show()

### Reserved and quiet

In [None]:
# Replace answer 0 in TIPI6 with 7 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI6"] = depression["TIPI6"].apply(
    lambda value: changeFromToinTIPI(value, 0, 7)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI6"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI6"], hue=tipi)
plt.show()

### Sympathetic and Warm

In [None]:
# Replace answer 0 in TIPI7 with 7 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI7"] = depression["TIPI7"].apply(
    lambda value: changeFromToinTIPI(value, 0, 7)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI7"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI7"], hue=tipi)
plt.show()

### Disorganized and Careless

In [None]:
# Replace answer 0 in TIPI8 with 7 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI8"] = depression["TIPI8"].apply(
    lambda value: changeFromToinTIPI(value, 0, 7)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI8"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI8"], hue=tipi)
plt.show()

### Calm and Emotionally stable

In [None]:
# Replace answer 0 in TIPI9 with 7 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI9"] = depression["TIPI9"].apply(
    lambda value: changeFromToinTIPI(value, 0, 7)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI9"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI9"], hue=tipi)
plt.show()

### Conventional and Uncreative

In [None]:
# Replace answer 0 in TIPI10 with 7 (0 presumably marks a missing answer — TODO confirm).
depression["TIPI10"] = depression["TIPI10"].apply(
    lambda value: changeFromToinTIPI(value, 0, 7)
)

# Agreement labels, used only as the hue of the count plot.
tipi = depression["TIPI10"].apply(changeTIPIValues)

plt.figure(figsize=(10, 5))
sns.countplot(x=depression["TIPI10"], hue=tipi)
plt.show()

### Family Size 

* Analysis
* It seems that there are outliers, so we'll remove records that have a familysize of more than 13

In [None]:
# Distribution of the reported family size before any rows are removed.
plt.figure(figsize=(10, 5))
sns.histplot(x=depression["familysize"])
plt.show()

In [None]:
# Index labels of the rows whose family size exceeds 13.
indexes = depression.loc[depression["familysize"] > 13].index

# Drop those rows and report the row count on both sides of the drop.
print(f"Depression size before: {depression.shape[0]}")
depression = depression.drop(index=indexes)
print(f"Depression size after: {depression.shape[0]}")

# Distribution of family size after the drop.
plt.figure(figsize=(10, 5))
sns.histplot(x=depression["familysize"])
plt.show()

## Married

Analysis


In [None]:
def change0to1inMarried(value):
    """Return 1 when ``value`` is 0; otherwise return ``value`` unchanged."""
    return 1 if value == 0 else value


# Replace married code 0 with 1 (the code labelled "Never married" below); other codes are untouched.
depression["married"] = depression["married"].apply(change0to1inMarried)


def changeMarriedValueToString(value):
    """Return the display label for a married code (1, 2 or 3).

    Values other than 1, 2 and 3 are returned unchanged.
    """
    if value == 1:
        text = "Never married"
    elif value == 2:
        text = "Currently married"
    elif value == 3:
        text = "Previously married"
    else:
        text = value
    return text


# Text label for every row's married code; used only as the hue of the plot.
married = depression["married"].apply(changeMarriedValueToString)

# Number of respondents per married code.
plt.figure(figsize=(10, 5))
sns.countplot(x=depression["married"], hue=married)
plt.show()

## Age

Analysis


In [None]:
# Row count per distinct age, followed by a histogram of the same column.
display(depression["age"].value_counts())
plt.figure(figsize=(10, 5))
sns.histplot(x=depression["age"])
plt.show()

In [None]:
# Index labels of the rows whose age is above 80.
age_indexes = depression.loc[depression["age"] > 80].index

display(age_indexes)

# Drop those rows in place and report the row count on both sides of the drop.
print(f"Depression size before: {depression.shape[0]}")
depression.drop(index=age_indexes, inplace=True)
print(f"Depression size after: {depression.shape[0]}")

# Distribution of age after the drop.
sns.histplot(x=depression["age"])
plt.show()

In [None]:
def makeAgeGroup(value):
    """Return the name of the age bracket that ``value`` falls into.

    Brackets by inclusive upper bound: 10 "Under 10", 16 "Primary Children",
    21 "Secondary Children", 35 "Adults", 48 "Elder Adults"; anything above 48
    is "Older People". Whole-number ages get the same label as before. The
    brackets are now contiguous, so a fractional age such as 16.5 is assigned
    to the next bracket up instead of silently yielding None.

    Returns None only when ``value`` compares false against every bound
    (e.g. NaN).
    """
    brackets = (
        (10, "Under 10"),
        (16, "Primary Children"),
        (21, "Secondary Children"),
        (35, "Adults"),
        (48, "Elder Adults"),
    )
    for upper_bound, name in brackets:
        if value <= upper_bound:
            return name
    # Explicit comparison (rather than a bare fallback) keeps NaN out of this bracket.
    if value > 48:
        return "Older People"
    return None


# Age-bracket name for every row; used only as the hue of the plot.
age = depression["age"].apply(makeAgeGroup)

# Number of respondents per age, coloured by bracket.
plt.figure(figsize=(18, 7))
sns.countplot(x=depression["age"], hue=age)
plt.show()

In [None]:
def makeAgeGroupFeature(value):
    """Return the numeric age-bracket code (1-6) that ``value`` falls into.

    Codes by inclusive upper bound: 10 -> 1, 16 -> 2, 21 -> 3, 35 -> 4,
    48 -> 5; anything above 48 -> 6. Whole-number ages get the same code as
    before. The brackets are now contiguous, so a fractional age such as 16.5
    is assigned to the next bracket up instead of silently yielding None.

    Returns None only when ``value`` compares false against every bound
    (e.g. NaN).
    """
    upper_bounds = (10, 16, 21, 35, 48)
    for code, upper_bound in enumerate(upper_bounds, start=1):
        if value <= upper_bound:
            return code
    # Explicit comparison (rather than a bare fallback) keeps NaN out of this bracket.
    if value > 48:
        return 6
    return None


# Store the bracket code as a new column, then remove the raw age column in place.
depression["age_group"] = depression["age"].apply(makeAgeGroupFeature)

depression.drop("age", axis=1, inplace=True)

## DEPENDENT VARIABLE 

In [None]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Row-wise sum of the columns currently in the frame. Results of an earlier run
# of this section ("total_count" and the string-valued "target") are left out
# of the sum, so the cell can be re-run without double-counting or failing on
# the string column. On a first run the result is the same as before.
# NOTE(review): the sum covers every remaining column, including the
# demographic and TIPI columns, not only the Q*A answers. The label derived
# from it is therefore a function of the columns later used as model features.
# Confirm this is intended.
depression["total_count"] = depression.drop(
    columns=["total_count", "target"], errors="ignore"
).sum(axis=1)
depression["total_count"].describe()

In [None]:
# Distribution of the per-row total, followed by a preview of the table.
plt.figure(figsize=(12, 6))
sns.histplot(x=depression["total_count"])
plt.show()
depression.head()

In [None]:
# Summary statistics of the per-row total.
depression["total_count"].describe()

In [None]:
# Summary statistics restricted to rows with a total below 170.
depression[depression["total_count"] < 170]["total_count"].describe()

In [None]:
# Summary statistics restricted to rows with a total below 147.
depression[depression["total_count"] < 147]["total_count"].describe()

In [None]:
# Summary statistics restricted to rows with a total above 170.
depression[depression["total_count"] > 170]["total_count"].describe()

In [None]:
def buildTargetMove15Steps(value):
    """Return the severity label for a total score.

    Labels by inclusive upper bound: 143 "Normal", 157 "Mild", 180 "Moderate",
    204 "Severe"; anything above 204 is "Extremely Severe". Returns None when
    ``value`` compares false against every bound (e.g. NaN).
    """
    bands = (
        (143, "Normal"),
        (157, "Mild"),
        (180, "Moderate"),
        (204, "Severe"),
    )
    for upper_bound, label in bands:
        if value <= upper_bound:
            return label
    if value > 204:
        return "Extremely Severe"
    return None


# Turn every row's total into its severity label and store it as the target column.
depression["target"] = depression["total_count"].apply(buildTargetMove15Steps)

# Number of rows per severity label.
plt.figure(figsize=(10, 5))
sns.countplot(x=depression["target"])
plt.show()

In [None]:
# Bare last expression so the notebook renders the frame as a table;
# print() falls back to a plain-text dump that wraps wide frames.
depression.head()

In [None]:
# Persist the prepared table (features, total_count and target) without the index column.
depression.to_csv("final.csv", index=False)

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Reload the prepared table written to final.csv.
depression = pd.read_csv("final.csv")
depression

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,education,urban,gender,religion,race,married,familysize,age_group,total_count,target
0,4,4,2,4,4,4,4,4,2,1,...,2,3,2,12,1.0,1,2,2,216.0,Extremely Severe
1,4,1,2,3,4,4,3,4,3,2,...,2,3,2,7,7.0,1,4,2,189.0,Severe
2,3,1,4,1,4,3,1,3,2,4,...,2,3,2,4,6.0,1,3,3,171.0,Moderate
3,2,3,2,1,3,3,4,2,3,3,...,1,3,2,4,7.0,1,5,2,153.0,Mild
4,2,2,3,4,4,2,4,4,4,3,...,3,2,2,10,1.0,1,4,3,212.0,Extremely Severe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39728,2,1,3,2,3,2,1,3,1,4,...,2,2,1,2,6.0,1,2,2,163.0,Moderate
39729,3,4,3,4,3,4,4,4,3,4,...,3,2,1,10,1.0,1,4,3,223.0,Extremely Severe
39730,2,1,2,1,1,1,1,1,2,1,...,3,2,2,7,3.0,2,3,5,128.0,Normal
39731,3,1,2,2,3,3,3,4,3,1,...,3,2,2,6,6.0,1,2,3,171.0,Moderate


In [3]:
# Keep the label aside, then strip it and the score it was derived from out of
# the feature table (in place).
target = depression["target"]
depression.drop(columns=["target", "total_count"], inplace=True)

In [4]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split, and every accuracy computed from it, is the
# same on each run. Previously the split was re-drawn at random every time.
SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    depression, target, test_size=0.2, random_state=SEED
)

print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"x_test: {x_test.shape}, y_test: {y_test.shape}")

x_train: (31786, 60), y_train: (31786,)
x_test: (7947, 60), y_test: (7947,)


In [5]:
# NOTE(review): StandardScaler is imported but not used in this cell.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()

# Fit the scaler on the training split only, then apply the same fitted scaler to the test split.
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [10]:
# Seed passed as random_state to the models below.
SEED = 42

# random forest

In [11]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline forest: 100 trees, all cores. random_state=SEED makes the fitted
# model and its reported accuracy repeatable, as for the other models in this
# notebook.
rnd_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=SEED)
rnd_clf.fit(x_train_scaled, y_train)

y_pred = rnd_clf.predict(x_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.4f}")
# Save the fitted model to disk.
joblib.dump(rnd_clf, "random_forest.pkl")

Accuracy on the test set: 0.7871


['random_forest.pkl']

# Improved Random Forest

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
# NOTE(review): `random` is imported but not used in this cell.
import random


# 3 x 3 x 3 = 27 parameter combinations, each scored with 5-fold cross-validation.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
}

rnd_clf = RandomForestClassifier(n_jobs=-1, random_state=SEED)

# The search only sees the training split; the test split is used once, below.
grid_search = GridSearchCV(rnd_clf, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(x_train_scaled, y_train)

# Best parameter combination found by the search.
best_rnd_clf = grid_search.best_estimator_

y_pred = best_rnd_clf.predict(x_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.4f}")

# Save the selected model to disk.
joblib.dump(best_rnd_clf, "improved_random_forest_model.pkl")

Accuracy on the test set: 0.7897


['improved_random_forest_model.pkl']

# XG BOOST


In [16]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Convert the string class labels to integer codes for this model.
# The encoder is fitted on the training labels and reused for the test labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

xgb_clf = xgb.XGBClassifier(
    n_estimators=100, max_depth=3, learning_rate=0.1, random_state=SEED
)
xgb_clf.fit(x_train_scaled, y_train_encoded)

# Predictions are integer codes, so they are compared with the encoded test labels.
y_pred_encoded = xgb_clf.predict(x_test_scaled)
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"Accuracy on the test set: {accuracy:.4f}")

# Save the fitted model to disk.
model_filename = "xgb_model.pkl"
joblib.dump(xgb_clf, model_filename)
print(f"Model saved as '{model_filename}'")

Accuracy on the test set: 0.7545
Model saved as 'xgb_model.pkl'


# SVC

In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import joblib

SEED = 42

# Model fitted on the whole training split; this is the instance that is saved below.
svc_clf = SVC(random_state=SEED)
svc_clf.fit(x_train_scaled, y_train)

# 5-fold cross-validation on the training split, averaged into one score.
cross_score = cross_val_score(svc_clf, x_train_scaled, y_train, cv=5)
mean_cross_score = cross_score.mean()

# Accuracy of the fitted model on the held-out test split.
y_pred = svc_clf.predict(x_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Mean Cross Validation Score: {mean_cross_score:.4f}")
print(f"Accuracy on the Test Set: {test_accuracy:.4f}")

# Save the fitted model to disk.
joblib.dump(svc_clf, "svm_model.pkl")

Mean Cross Validation Score: 0.9661
Accuracy on the Test Set: 0.9684


['svm_model.pkl']