**Importing Libraries**

In [213]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [214]:
from sklearn.model_selection import train_test_split, GridSearchCV , cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler , LabelEncoder , OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score

In [215]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

**Importing dataset**

In [216]:

df=pd.read_csv('cleaned_survey2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,self_employed,family_history,treatment,work_interfere,remote_work,tech_company,benefits,care_options,...,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,Cleaned_Gender,no_employees
0,0,37,No,No,Yes,Often,No,Yes,Yes,Not sure,...,No,No,Some of them,Yes,No,Maybe,Yes,No,Female,15
1,1,44,No,No,No,Rarely,No,No,Don't know,No,...,Maybe,No,No,No,No,No,Don't know,No,Male,1000
2,2,32,No,No,No,Rarely,No,Yes,No,No,...,No,No,Yes,Yes,Yes,Yes,No,No,Male,15
3,3,31,Yes,Yes,Yes,Often,No,Yes,No,Yes,...,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,Male,63
4,4,31,Yes,No,No,Never,Yes,Yes,Yes,No,...,No,No,Some of them,Yes,Yes,Yes,Don't know,No,Male,300


In [217]:
df.drop('Unnamed: 0',axis=1,inplace=True)

**Train Test Split**

In [218]:
X=df.drop('treatment',axis=1)
y=df['treatment']

**preparing for encoding data**

In [219]:
binary=['Yes','No']

In [220]:
work_int=['Often','Sometimes','Rarely','Never','Dont know']

In [221]:
bene=['Yes',"No", "Don't know"]

In [222]:
care_opt=['Yes','No','Not sure']

In [223]:
well_pro=['Yes','No',"Don't know"]
seek=['Yes','No',"Don't know"]

In [224]:
anoy_mity=['Yes','No',"Don't know"]
ment_vs_phy=['Yes','No',"Don't know"]

In [225]:
lea_ve=['Very easy','Somewhat easy','Somewhat difficult','Very difficult',"Don't know"]

In [226]:
men_hel_cons=['Yes','Maybe','No']
phy_hel_cons=['Yes','Maybe','No']

In [227]:
co_workers=['Yes','Some of them','No']
super_visor=['Yes','Some of them','No']

In [228]:
men_heal_inter=['Yes','Maybe','No']
phy_heal_inter=['Yes','Maybe','No']

In [229]:
cln_gen=['Male','Female']

In [230]:
binary_columns = ['self_employed',
 'family_history',
 'remote_work',
 'tech_company',
 'obs_consequence']


In [231]:
df.columns

Index(['Age', 'self_employed', 'family_history', 'treatment', 'work_interfere',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'Cleaned_Gender',
       'no_employees'],
      dtype='object')

In [232]:
preprocessor=ColumnTransformer(transformers=[('trf1',OrdinalEncoder(categories=[binary] * len(binary_columns)),binary_columns),
                                             ('trf2',OrdinalEncoder(categories=[work_int]),["work_interfere"]),
                                             ('trf3',OrdinalEncoder(categories=[bene]),["benefits"]),
                                             ('trf4',OrdinalEncoder(categories=[care_opt]),["care_options"]),
                                             ('trf5',OrdinalEncoder(categories=[well_pro]),["wellness_program"]),
                                             ('trf6',OrdinalEncoder(categories=[seek]),["seek_help"]),
                                             ('trf7',OrdinalEncoder(categories=[anoy_mity]),["anonymity"]),
                                             ('trf8',OrdinalEncoder(categories=[ment_vs_phy]),["mental_vs_physical"]),
                                             ('trf9',OrdinalEncoder(categories=[lea_ve]),["leave"]),
                                             ('trf10',OrdinalEncoder(categories=[men_hel_cons]),["mental_health_consequence"]),
                                             ('trf11',OrdinalEncoder(categories=[phy_hel_cons]),["phys_health_consequence"]),
                                             ('trf12',OrdinalEncoder(categories=[co_workers]),["coworkers"]),
                                             ('trf13',OrdinalEncoder(categories=[super_visor]),["supervisor"]),
                                             ('trf14',OrdinalEncoder(categories=[men_heal_inter]),["mental_health_interview"]),
                                             ('trf15',OrdinalEncoder(categories=[phy_heal_inter]),["phys_health_interview"]),
                                             ('trf16',OrdinalEncoder(categories=[cln_gen]),["Cleaned_Gender"]),
                                             ('trf17',StandardScaler(),['Age']),
                                             ('trf18',StandardScaler(),['no_employees'])

                                             ],remainder='passthrough')

In [233]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [234]:
X_train_encoded=preprocessor.fit_transform(X_train)
X_test_encoded=preprocessor.fit_transform(X_test)

**Logistic Regression**

In [235]:
logistic_model=LogisticRegression()
logistic_model.fit(X_train_encoded,y_train)
logistic_pred=logistic_model.predict(X_test_encoded)

In [236]:
accuracy_score(y_test,logistic_pred)

0.8095238095238095

In [237]:
print(classification_report(y_test,logistic_pred))

              precision    recall  f1-score   support

          No       0.85      0.76      0.80       129
         Yes       0.77      0.86      0.82       123

    accuracy                           0.81       252
   macro avg       0.81      0.81      0.81       252
weighted avg       0.81      0.81      0.81       252



In [238]:
print(confusion_matrix(y_test,logistic_pred))

[[ 98  31]
 [ 17 106]]


In [239]:

param_grid = {

    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga', 'lbfgs'],
    'max_iter': [100, 200, 500],


}


In [240]:
grid = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_encoded, y_train)

print("Best Params:", grid.best_params_)

Best Params: {'C': 0.01, 'max_iter': 100, 'solver': 'saga'}


In [241]:
grid.best_score_

np.float64(0.8470370917688784)

**Random Forest**

In [242]:
rf_model=RandomForestClassifier(n_estimators=100,random_state=42)
rf_model.fit(X_train_encoded,y_train)
rf_pred=rf_model.predict(X_test_encoded)

In [243]:
accuracy_score(y_test,rf_pred)

0.8055555555555556

In [244]:
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

          No       0.84      0.76      0.80       129
         Yes       0.77      0.85      0.81       123

    accuracy                           0.81       252
   macro avg       0.81      0.81      0.81       252
weighted avg       0.81      0.81      0.81       252



In [245]:
print(confusion_matrix(y_test,rf_pred))

[[ 98  31]
 [ 18 105]]


In [246]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2']

}

In [247]:
grid = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                    cv=5, n_jobs=-1, scoring='accuracy', verbose=1)

grid.fit(X_train_encoded, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [248]:
grid.best_params_

{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}

In [249]:
grid.best_score_

np.float64(0.8341461011772818)

**XG boost**

In [250]:
from xgboost import XGBClassifier

In [251]:
y_train_encoded = y_train.replace({'Yes': 1, 'No': 0})
y_test_encoded = y_test.replace({'Yes': 1, 'No': 0})

In [252]:
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_encoded, y_train_encoded)

In [253]:
xgb_pred = xgb_model.predict(X_test_encoded)

In [254]:
accuracy_score(y_test_encoded, xgb_pred)

0.7817460317460317

In [255]:
print(classification_report(y_test_encoded, xgb_pred))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78       129
           1       0.77      0.79      0.78       123

    accuracy                           0.78       252
   macro avg       0.78      0.78      0.78       252
weighted avg       0.78      0.78      0.78       252



In [256]:
print(confusion_matrix(y_test_encoded, xgb_pred))

[[100  29]
 [ 26  97]]


In [257]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}

In [258]:
grid = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                    cv=5, n_jobs=-1, scoring='accuracy', verbose=1)

grid.fit(X_train_encoded, y_train_encoded)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [259]:
grid.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}

In [260]:
grid.best_score_

np.float64(0.8401261021624551)

**SVM**

In [261]:
y_train_encoded

Unnamed: 0,treatment
243,0
514,1
966,1
199,1
270,0
...,...
1044,1
1095,1
1130,1
860,1


In [262]:
svc_model=SVC(C=0.1,gamma='scale',kernel='linear')
svc_model.fit(X_train_encoded,y_train_encoded)
svc_pred=svc_model.predict(X_test_encoded)

In [263]:
accuracy_score(y_test_encoded,svc_pred)

0.8055555555555556

In [264]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}


In [265]:
grid=GridSearchCV(svc_model,param_grid,cv=5,scoring='accuracy',n_jobs=-1)
grid.fit(X_train_encoded,y_train)

In [266]:
grid.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [267]:
grid.best_score_

np.float64(0.8500369439929066)

In [268]:
print(classification_report(y_test_encoded,svc_pred))

              precision    recall  f1-score   support

           0       0.88      0.72      0.79       129
           1       0.75      0.89      0.82       123

    accuracy                           0.81       252
   macro avg       0.82      0.81      0.80       252
weighted avg       0.82      0.81      0.80       252



In [269]:
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svc_model', SVC(C=0.1, gamma='scale', kernel='linear'))
])

In [270]:
full_pipeline.fit(X_train, y_train)

In [271]:
predictions=full_pipeline.predict(X_test)

In [272]:
predictions

array(['No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes',
       'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes',
       'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No',
       'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No',
       'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No',
       'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No',
       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
       'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes',
       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No',
       'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
       'Yes', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No',
       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
       'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No

In [273]:
import joblib

In [274]:
joblib.dump(full_pipeline,'pipeline.pkl')

['pipeline.pkl']