In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

# Load the train and test CSVs; the literal "#NULL!" cell marker is parsed as NaN
train_df, test_df = (
    pd.read_csv(csv_path, na_values=["#NULL!"])
    for csv_path in ("./data/health-diagnostics-train.csv",
                     "./data/health-diagnostics-test.csv")
)

def replace_nulls(df):
    """Replace nulls with the -999 sentinel and cast columns to int, in place.

    Every column whose dtype is not already the default integer dtype has its
    missing values filled with -999 and is then converted with ``astype(int)``.
    Columns that already have that dtype are left untouched.

    The filled column is assigned back through ``df[col] = ...`` instead of
    calling ``fillna(..., inplace=True)`` on ``df[col]``: that call operates on
    an intermediate Series and does not reliably write through to ``df``
    (pandas warns about it, and it has no effect under copy-on-write).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    None
    """
    for col in df.columns:
        if df[col].dtype != "int":
            # -999 is the sentinel later declared to xgboost as the missing value
            df[col] = df[col].fillna(-999).astype(int)

# Fill nulls with the -999 sentinel and cast to int (both frames, in place)
replace_nulls(test_df)
replace_nulls(train_df)

# Stack the train features on top of the test frame (outer keys 0 / 1) so that
# one-hot encoding produces the same dummy columns for both. The target is
# dropped by name, so this does not rely on it being the last column.
temp_df = pd.get_dummies(
    pd.concat([train_df.drop(columns="target"), test_df], keys=[0, 1], sort=False),
    columns=['env', 'lifestyle'],
    drop_first=True,
)

# Split the encoded frame back apart via the outer level of the MultiIndex
train_dummies, test_dummies = temp_df.xs(0).copy(), temp_df.xs(1).copy()

# Re-attach the target to the encoded train frame
train_dummies['target'] = train_df['target']

# Features and target, again selected by name rather than by position
X, y = train_dummies.drop(columns='target'), train_dummies['target']

# Hold out 20% of the training rows for out-of-sample checks
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# DMatrix over the full training data.
# NOTE(review): built without `missing`, so -999 is an ordinary value here; any
# matrix scored by a model trained on this one has to be built the same way.
train_set_dmatrix = xgb.DMatrix(X, label=y)

# DMatrix over the training split. The -999 sentinel is declared here because
# `missing` is an argument of DMatrix; the native xgboost API does not pick it
# up from the booster parameter dict.
train_dmatrix = xgb.DMatrix(X_train, label=y_train, missing=-999)

# scale_pos_weight = number of negatives / number of positives
weight = (train_df['target'] == 0).sum() / (train_df['target'] == 1).sum()

# Stratified 12-fold splitter used for cross-validation
kfold = StratifiedKFold(n_splits=12, shuffle=True, random_state=42)

# Tuning stage 1: search max_depth and min_child_weight first, since they have
# the larger impact; every other setting is held fixed in ind_params
cv_params = {
    "max_depth": [5, 6, 7],           # nodes from root to farthest leaf (~3-10)
    "min_child_weight": [1, 2, 3],    # min sum of weights needed to create a new node
}
ind_params = {
    "learning_rate": 0.01,            # step-size shrinkage to prevent overfitting (~0-1)
    "n_estimators": 90,               # number of trees
    "seed": 42,                       # for reproducibility
    "subsample": 0.6,                 # fraction of samples used per tree
    "colsample_bytree": 0.6,          # fraction of features used per tree
    "objective": "binary:logistic",   # returns predicted probability
    "scale_pos_weight": weight,       # compensates for the high class imbalance
    "missing": -999,                  # sentinel to treat as a missing value
}

# Grid search cross-validates every combination in cv_params and keeps the best
opt_gbm = GridSearchCV(
    estimator=xgb.XGBClassifier(**ind_params),
    param_grid=cv_params,
    scoring="f1",                     # suited to imbalanced data with very few positives
    cv=kfold,
    n_jobs=-1,                        # run in parallel on all available cores
)

opt_gbm.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the fitted search object
# (y_pred is not evaluated in this cell)
y_pred = opt_gbm.predict(X_test)

# Best combination found by the search; displayed as the cell output
opt_gbm.best_params_                            # finding params for next step

In [None]:
# Tuning stage 2: search learning rate, row subsampling and column subsampling,
# with the tree-shape settings from the previous stage held fixed
cv_params = {
    "learning_rate": [0.01, 0.001],
    "subsample": [0.5, 0.6, 0.7],
    "colsample_bytree": [0.5, 0.6, 0.7],
}
ind_params = {
    "n_estimators": 90,
    "seed": 42,
    "objective": "binary:logistic",
    "max_depth": 6,                   # chosen as best param
    "min_child_weight": 1,            # chosen as best param
    "scale_pos_weight": weight,
    "missing": -999,
}

opt_gbm = GridSearchCV(
    estimator=xgb.XGBClassifier(**ind_params),
    param_grid=cv_params,
    scoring="f1",
    cv=kfold,
    n_jobs=-1,
)

opt_gbm.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the fitted search object
# (y_pred is not evaluated in this cell)
y_pred = opt_gbm.predict(X_test)

# Best combination found by the search; displayed as the cell output
opt_gbm.best_params_                            # finding params for next step

In [None]:
# Parameters settled on by the grid searches, using native-API names
our_params = {
    "eta": 0.001,                     # learning rate - chosen as best param
    "seed": 42,
    "subsample": 0.5,                 # chosen as best param
    "colsample_bytree": 0.6,          # chosen as best param
    "objective": "binary:logistic",
    "max_depth": 6,
    "min_child_weight": 1,
    "scale_pos_weight": weight,
    # No "missing" entry: it is not a booster parameter, so xgb.cv / xgb.train
    # do not act on it. The -999 sentinel has to be declared on the DMatrix.
}

# Cross-validation estimates how this one parameter set performs on unseen data
cv_xgb = xgb.cv(
    our_params,
    train_dmatrix,
    num_boost_round=300,              # upper bound on trees - aka n_estimators
    nfold=12,
    metrics=["error"],                # binary classification error rate (0.5 threshold)
    stratified=True,
    early_stopping_rounds=30,         # stop once the metric has not improved for 30 rounds
)

# Last rows of the CV history, used to pick the final number of boosting rounds
cv_xgb.tail(5)

In [None]:
# Fit a classifier with the tuned settings on the training split
cl_xgb = xgb.XGBClassifier(
    objective="binary:logistic",
    learning_rate=0.001,
    n_estimators=115,                 # taken from the last round of the CV step
    max_depth=6,
    min_child_weight=1,
    subsample=0.5,
    colsample_bytree=0.6,
    scale_pos_weight=weight,
    missing=-999,
    seed=42,
)

cl_xgb.fit(X_train, y_train)
y_pred = cl_xgb.predict(X_test)

# Confusion matrix: how the tuned parameters perform on the held-out split
display(pd.crosstab(y_test, y_pred, rownames=["True"], colnames=["Predicted"], margins=True))

# Accuracy: share of held-out rows predicted correctly, printed as a percentage
accuracy = float((y_pred == y_test).sum()) / len(y_test)
print("accuracy: %f" % (accuracy*100))

# Train a native-API booster on the training split with the chosen parameters
current_xgb = xgb.train(params=our_params, dtrain=train_dmatrix, num_boost_round=115)

# Feature importance of that booster
xgb.plot_importance(current_xgb)
plt.show()

In [None]:
# %matplotlib inline

# # Plot decision tree
# fig, ax = plt.subplots(figsize=(30, 30))
# xgb.plot_tree(final_xgb, num_trees=4, ax=ax)

# fig = plt.gcf()
# fig.set_size_inches(30, 30)
# fig.savefig('tree.png')
# plt.savefig("temp.png")
# plt.show()

In [None]:
# Final model: train on the entire training set. Both matrices are built in
# this cell with missing=-999, so the sentinel written by replace_nulls is
# treated as missing in the same way when fitting and when predicting
# (`missing` is a DMatrix argument, not a booster parameter).
final_train_dmatrix = xgb.DMatrix(X, label=y, missing=-999)
final_xgb = xgb.train(our_params,
                      final_train_dmatrix,
                      num_boost_round=115)

# Encoded test features as a DMatrix, with the same missing-value sentinel
test_set_dmatrix = xgb.DMatrix(test_dummies, missing=-999)

# Predicted probabilities for the test set (objective is binary:logistic)
y_prob = final_xgb.predict(test_set_dmatrix)

# Convert probabilities to class labels at the 0.5 threshold
y_pred = np.where(y_prob > 0.5, 1, 0)

# Save results to csv for submission
results_df = pd.DataFrame({"index": test_df.index, "target": y_pred})
results_df.to_csv("./data/xgboost_feat.csv", index=False)