In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, r2_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report

#from google.colab import drive
#import warnings
#warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams.update({'figure.max_open_warning': 0}) # Turn off matplotlib's "too many open figures" warning (0 disables the threshold)

In [2]:
# Read data.csv from the project folder into a DataFrame.
# NOTE(review): `path` is an absolute, machine-specific location; it must be
# adjusted before the notebook can run anywhere else.
path = "C:/Users/pc/Desktop/1/my project/depression prediction/"
df = pd.read_csv(f"{path}data.csv")

In [4]:
# Remove fully duplicated rows (first occurrence is kept; the index is not reset).
df = df.drop_duplicates()

In [7]:
new_col = ['age','address','schooling','stud_hr','employed','h_disab',
           'ment_cond','social_hr','fit_hr','wind','dry_mouth',
           'positive','breath_diff','initiate','tremb','worry','look_fwd',
           'down','enthus','life_mean','scared','outcome']
df.columns= new_col

In [8]:
# Collapse the lower-case spelling of the label into the capitalised one so
# both count as a single class. The trailing space is part of both labels.
#
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on `df.outcome` operates on an intermediate
# Series (chained assignment). pandas warns about that pattern, and with
# Copy-on-Write enabled it leaves `df` unchanged.
df["outcome"] = df["outcome"].replace(
    "high signs of depression ", "High signs of depression "
)

In [116]:
# Separate the target column from the features; `df` itself is left untouched.
y = df["outcome"].copy()
X = df.drop(columns="outcome")

In [117]:
def first_char_of_last_range_part(hours: pd.Series) -> pd.Series:
    """Reduce each hour-range string to a single character.

    For every value: keep the text after the last "-", strip any leading or
    trailing characters belonging to the set {h, o, u, r, s}, then keep the
    first remaining character. The result is still a string column.

    Missing values, and values with nothing left after stripping, come back
    as NaN instead of raising.

    NOTE(review): ``strip("hours")`` removes a *set of characters*, not the
    word "hours", and only one character is kept, so a two-digit bound would
    be truncated to its first digit. Confirm this matches the raw values.
    """
    return hours.str.split("-").str[-1].str.strip("hours").str[0]


# A named helper replaces the former lambda bound to `i`, a name that later
# cells reuse as a loop index.
for hour_col in ("social_hr", "fit_hr"):
    X[hour_col] = first_char_of_last_range_part(X[hour_col])

In [140]:
# Discard the two features that are not used for modelling.
X = X.drop(columns=["address", "schooling"])

In [141]:
# Nominal features: string (object) columns with more than two distinct values.
# (The misspelled variable names are kept because later cells refer to them.)
mominal_cloumns_mask = X.select_dtypes("object").nunique() > 2

# The hour-range columns are also object columns with more than two values,
# but they are encoded as ordinal features instead. They are excluded by
# name rather than by slicing off the last two positions, so the selection
# does not depend on column order.
mominal_cloumns = pd.Index(
    [
        column
        for column in mominal_cloumns_mask.index[mominal_cloumns_mask]
        if column not in ("social_hr", "fit_hr")
    ]
)

In [142]:

# Ordinal features: numeric columns that take exactly four distinct values.
# This has to run before those columns are cast to "category", otherwise
# select_dtypes no longer sees them as numeric.
ordinal_columns_mask = X.select_dtypes(["int64", "float64"]).nunique() == 4
ordinal_columns = ordinal_columns_mask.index[ordinal_columns_mask]

# The two hour-range columns are ordinal as well. They are added by name
# instead of by position (3 and 4) in the nominal mask, so the result no
# longer depends on column order. Each is inserted at the front, which gives
# the order fit_hr, social_hr, <numeric ordinal columns>.
for hour_col in ("social_hr", "fit_hr"):
    ordinal_columns = ordinal_columns.insert(0, hour_col)

In [143]:
# Numeric columns with more than four distinct values are treated as
# continuous features.
numeric_unique_counts = X.select_dtypes(["int64", "float64"]).nunique()
numerical_columns_mask = numeric_unique_counts > 4
numerical_columns = numerical_columns_mask[numerical_columns_mask].index

In [144]:
# Cast every ordinal feature to the pandas "category" dtype in one call.
X = X.astype({column: "category" for column in ordinal_columns})

In [145]:
# (step name, transformer, columns) triples consumed by ColumnTransformer:
#   - ordinal features   -> integer codes
#   - nominal features   -> one-hot indicator columns
#   - numerical features -> standardised (zero mean, unit variance)
precessor = list(
    zip(
        ("ordinal columns", "nominal columns", "numerical columns"),
        (OrdinalEncoder(), OneHotEncoder(), StandardScaler()),
        (ordinal_columns, mominal_cloumns, numerical_columns),
    )
)

In [146]:
# Apply each encoder to its column group; columns not listed in any group
# are passed through unchanged.
transformer = ColumnTransformer(transformers=precessor, remainder="passthrough")

In [158]:
# Fixed seed so the stochastic estimators give the same scores on every run.
SEED = 42

# Display names, index-aligned with `classfiers`. They double as pipeline
# step names and dictionary keys later on, so the spellings are left as is.
classfier_names = [
    "RandomForest",
    "AdaBoost",
    "GradientBosst",
    "Logisticregression",
    "DecisionTree",
    "GaussianNB",
]

# Candidate models. Every estimator with internal randomness receives the
# seed; LogisticRegression (default solver) and GaussianNB are deterministic.
classfiers = [
    RandomForestClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GradientBoostingClassifier(random_state=SEED),
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(random_state=SEED),
    GaussianNB(),
]

In [159]:
def _make_pipeline(step_name, estimator):
    """Chain the shared column transformer with one classifier named `step_name`."""
    return Pipeline(steps=[("transformer", transformer), (step_name, estimator)])


# One preprocessing + model pipeline per classifier, in `classfier_names` order.
pipelines = [
    _make_pipeline(name, model) for name, model in zip(classfier_names, classfiers)
]

In [160]:
# 4-fold cross-validation of every pipeline, scored on accuracy.
scoring = {"acc": "accuracy"}

# Mean training accuracy (in %) per classifier, in `classfier_names` order.
training_scores = []
# Classifier name -> one-element list holding the full cross_validate result
# (fit/score times, fitted fold estimators, train and test accuracy).
cv_scores = {}

for classfier_name, pipeline in zip(classfier_names, pipelines):
    cv_score = cross_validate(
        estimator=pipeline,
        X=X,
        y=y,
        cv=4,
        scoring=scoring,
        return_train_score=True,
        return_estimator=True,
        error_score='raise',  # surface fitting errors instead of recording NaN
    )
    training_score = np.mean(cv_score["train_acc"]) * 100
    training_scores.append(training_score)
    cv_scores[classfier_name] = [cv_score]

In [161]:
# Mean training-fold accuracy (in %) for each classifier, in `classfier_names` order.
training_scores

[97.25274725274726,
 65.84249084249085,
 97.25274725274726,
 83.25752116735724,
 97.25274725274726,
 67.48333633579536]

In [172]:
# For each classifier, keep the fold pipeline that reached the highest test
# accuracy, wrapped in a one-element list.
best_estimators = {}
for key, model in cv_scores.items():
    fold_results = model[0]  # the single cross_validate result stored per classifier
    # Despite its name, `best_score` is the index of the best fold, not a score.
    best_score = np.argmax(fold_results["test_acc"])
    best_estimator = fold_results["estimator"][best_score]
    best_estimators[key] = [best_estimator]

In [173]:
# Classifier name -> one-element list with the fold pipeline that scored best on its test fold.
best_estimators

{'RandomForest': [Pipeline(steps=[('transformer',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('ordinal columns',
                                                    OrdinalEncoder(),
                                                    Index(['fit_hr', 'social_hr', 'wind', 'dry_mouth', 'positive', 'breath_diff',
         'initiate', 'tremb', 'worry', 'look_fwd', 'down', 'enthus', 'life_mean',
         'scared'],
        dtype='object')),
                                                   ('nominal columns',
                                                    OneHotEncoder(),
                                                    Index(['employed', 'h_disab', 'ment_cond'], dtype='object')),
                                                   ('numerical columns',
                                                    StandardScaler(),
                                                    Index(['age', 'stud_hr'], dtype='object'

In [168]:
# `cv_score` is whatever the cross-validation loop assigned last, i.e. the
# cross_validate result of the final classifier it iterated over.
cv_score

{'fit_time': array([0.03355455, 0.03219652, 0.03155661, 0.03323078]),
 'score_time': array([0.01652193, 0.01688838, 0.01590323, 0.0336535 ]),
 'estimator': [Pipeline(steps=[('transformer',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('ordinal columns',
                                                    OrdinalEncoder(),
                                                    Index(['fit_hr', 'social_hr', 'wind', 'dry_mouth', 'positive', 'breath_diff',
         'initiate', 'tremb', 'worry', 'look_fwd', 'down', 'enthus', 'life_mean',
         'scared'],
        dtype='object')),
                                                   ('nominal columns',
                                                    OneHotEncoder(),
                                                    Index(['employed', 'h_disab', 'ment_cond'], dtype='object')),
                                                   ('numerical columns',
                       