In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

## Data Prep

In [4]:
df = pd.read_excel('recruiting_zeta-disease_training-data_take-home-challenge (1).xlsx')

In [5]:
df.head()

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking,zeta_disease
0,54.0,189.0,27.1,80.0,0.0,1.5038,0.0,10.0,0.0
1,23.0,150.0,38.5,68.0,71.0,0.3868,55.0,2.0,0.0
2,47.0,186.0,29.9,90.0,0.0,0.2728,0.0,7.0,0.0
3,18.0,150.0,30.8,70.0,1033.0,0.6598,56.0,0.0,0.0
4,24.0,160.0,32.4,74.0,125.0,0.7608,59.0,2.0,0.0


In [8]:
df_x = df[['age','weight', 'bmi', 'blood_pressure', 'insulin_test', 'liver_stress_test', 'cardio_stress_test', 'years_smoking']]
df_y = df['zeta_disease']

In [19]:
df_x.shape

(800, 8)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df_x,df_y,stratify=df_y, test_size=0.20)

In [31]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(640, 8)
(160, 8)
(640,)
(160,)


## Model Testing

In [32]:
# Logistic Regression

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_train)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:', accuracy_score(y_test,logreg.predict(X_test)))

Train accuracy score: 0.790625
Test accuracy score: 0.73125


In [35]:
# K Nearest Neighbors

knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
y_pred = knn.predict(X_train)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:',accuracy_score(y_test,knn.predict(X_test)))

Train accuracy score: 0.8171875
Test accuracy score: 0.6625


In [44]:
# Random Forest

rfc_b = RFC(n_estimators=10)
rfc_b.fit(X_train,y_train)
y_pred = rfc_b.predict(X_train)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:', accuracy_score(y_test,rfc_b.predict(X_test)))

Train accuracy score: 0.9734375
Test accuracy score: 0.7125


In [58]:
# XGBoost

df_x = scale(df_x)

xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)
y_pred = xgb_cl.predict(X_train)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:', accuracy_score(y_test,xgb_cl.predict(X_test)))

Train accuracy score: 1.0
Test accuracy score: 0.7


In [54]:
# tune XGBoost

param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

xgb_cl = xgb.XGBClassifier(objective="binary:logistic")
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")
_ = grid_cv.fit(df_x, df_y)
grid_cv.best_score_

0.8386190655475089

In [55]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 1,
 'learning_rate': 0.01,
 'max_depth': 4,
 'reg_lambda': 0,
 'scale_pos_weight': 1,
 'subsample': 0.8}

In [70]:
final_cl = xgb.XGBClassifier(
    **grid_cv.best_params_,
    objective="binary:logistic"
)
final_cl.fit(X_train, y_train)
y_pred = final_cl.predict(X_train)
print('Train accuracy score:',accuracy_score(y_train,y_pred))
print('Test accuracy score:', accuracy_score(y_test,final_cl.predict(X_test)))

Train accuracy score: 0.865625
Test accuracy score: 0.75625


## Final Predictions

In [64]:
df_final = pd.read_excel('recruiting_zeta-disease_prediction-data_take-home-challenge.xlsx')
df_final = df_final.drop(columns = 'zeta_disease')
df_final.head(20)

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking
0,24.0,151.0,39.5,69.0,72.0,1.3968,56.0,4.0
1,27.0,179.0,35.5,89.0,156.0,1.6608,43.0,6.0
2,34.0,147.0,26.9,76.0,74.0,1.6958,53.0,2.0
3,35.0,206.0,32.4,73.0,127.0,1.4608,61.0,6.0
4,60.0,193.0,29.8,62.0,192.0,1.7798,65.0,9.0
5,45.0,120.0,36.5,108.0,50.0,1.2978,54.0,12.0
6,20.0,139.0,38.2,61.0,77.0,1.5818,68.0,3.0
7,23.0,137.0,31.2,70.0,73.0,1.4168,59.0,7.0
8,36.0,195.0,30.5,59.0,141.0,1.4498,59.0,6.0
9,19.0,193.0,25.8,84.0,66.0,1.7938,50.0,3.0


In [76]:
knn_preds = knn.predict(df_final)
print(knn_preds)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1.]
