In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline

# Read files into dataframe
# The literal string "#NULL!" in either CSV is parsed as a missing value (NaN).
train_df = pd.read_csv("./data/health-diagnostics-train.csv", na_values=["#NULL!"])
test_df = pd.read_csv("./data/health-diagnostics-test.csv", na_values=["#NULL!"])
# test_sample = pd.read_csv("./data/health-diagnostics-test-sample-solution.csv")

# Names of the predictor columns; this global is read by replace_nulls below
# and reassigned by several later cells.
feature_cols = ['income', 'maternal', 'fam-history', 'mat-illness-past', 'suppl', 'mat-illness', 'meds', 'env', 'lifestyle']

def replace_nulls(df, cols=None):
    """Fill missing values with each column's mode and cast the column to int.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to clean.
    cols : iterable of str, optional
        Columns to process. Defaults to the notebook-level ``feature_cols``.

    Raises
    ------
    ValueError
        If a column has no non-null values, so there is no mode to fill with.
    """
    if cols is None:
        cols = feature_cols
    for f in cols:
        values = df[f].astype(float)
        modes = values.mode()
        if modes.empty:
            raise ValueError("column {!r} has no non-null values to take a mode from".format(f))
        # Assign the filled result back to the frame. Calling
        # fillna(..., inplace=True) on the df[f] selection is a chained
        # in-place write, which pandas warns about and which does not update
        # the frame under Copy-on-Write.
        df[f] = values.fillna(modes[0]).astype(int)

# Clean both frames in place. Each frame is filled from its own column modes,
# because replace_nulls computes the mode on the frame it is given.
replace_nulls(test_df)
replace_nulls(train_df)

**Domain:**
- income - an annual per capita income of a patient
- maternal - maternal delivery age
- fam-history - a family history
- mat-illness-past - a previous maternal illness history
- suppl - nutrition and folic acid supplementation
- mat-illness - a maternal illness
- meds - medication use
- env - environmental exposure to risk factors
- lifestyle - an unhealthy lifestyle
- target - a congenital disorder

In [None]:
# train_df.shape
# train_df.dtypes
# train_df.isnull().sum()
# train_df.describe()

In [None]:
# for f in feature_cols:
# #     display(f, train_df[f].value_counts())
#     train_df[f].sort_values().value_counts(sort=False).plot(kind="bar")
#     plt.show();

- income
    - even distribution
- maternal
    - large mode of 0
- fam-history
    - large mode of 0
- mat-illness-past
    - large mode of 0
- suppl
    - left skewed
- mat-illness
- meds
- env
    - has a distribution
- lifestyle
    - has a distribution, may need to create dummies

In [None]:
# Count how many training rows fall into each target class.
train_df['target'].value_counts()

- the target is binary (0 or 1), so this is a binary classification problem; logistic regression is a natural first model
- shows there is a class imbalance, very few 1's

In [None]:
# Find train to test ratio (share of all rows that are in the training file)
train_share = train_df.shape[0] / (train_df.shape[0] + test_df.shape[0])

# Find percentage of 1's
positive_pct = round(train_df['target'].sum() / train_df['target'].count() * 100, 2)

# A cell only echoes its last expression, so the ratio computed first was
# silently discarded. Show both values explicitly.
display(train_share, positive_pct)

- the train : test split is roughly 90 : 10
- therefore, the test set probably contains very few positive (target = 1) rows
- only about 0.18% of training rows have target = 1

In [None]:
# Heatmap of the pairwise correlations between all columns of train_df
# (features and target).
sns.heatmap(train_df.corr())

In [None]:
# Boolean mask selecting the rows whose target is 1
target_mask = train_df['target'] == 1

feature_cols = [
    'income', 'maternal', 'fam-history', 'mat-illness-past', 'suppl',
    'mat-illness', 'meds', 'env', 'lifestyle',
]

# For every feature show: its name, the per-category count among positive rows,
# the per-category count among all rows, and the positive share (in %) per category.
for f in feature_cols:
    positives = train_df[target_mask][f].value_counts(sort=False)
    totals = train_df[f].value_counts(sort=False)
    display(f, positives, totals, positives / totals * 100)

In [None]:
# def train_test_rmse(df, feature_cols):
#     X = df[feature_cols]
#     y = df.target
    
#     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
#     linreg = LinearRegression()
#     linreg.fit(X_train, y_train)
    
#     y_pred = linreg.predict(X_test)
#     return np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# print(train_test_rmse(train_df, feature_cols, ['target']))

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

feature_cols = ['income', 'maternal', 'fam-history', 'mat-illness-past', 'suppl', 'mat-illness', 'meds', 'env', 'lifestyle']

X = train_df[feature_cols]
y = train_df.target

# Split X and y into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Displaying confusion matrix
display(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

# Recall on the hold-out split. The matrix is built here from this cell's own
# predictions; the previous code read `cnf_matrix_tra`, which is only created in
# a later cell and therefore raised NameError on a fresh kernel.
# labels=[0, 1] keeps the 2x2 layout even if one class is never predicted.
cnf_matrix_logreg = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
actual_pos = cnf_matrix_logreg[1, 0] + cnf_matrix_logreg[1, 1]
recall_metric = round(100 * cnf_matrix_logreg[1, 1] / actual_pos, 0) if actual_pos else float("nan")
print("Recall metric in the testing dataset: {0}%".format(recall_metric))

# Finding the f1 score
display(metrics.f1_score(y_test, y_pred, average="binary"))

# Predicting on test data
test_df['target'] = logreg.predict(test_df[feature_cols])

# Saving to csv for submission
# header=True writes the Series name ("target") as the column header; the
# documented types for `header` are bool or list of str.
test_df['target'].to_csv("./data/logreg.csv", index_label="index", header=True)

- 0.58 AUC

**Applying feature engineering**

In [None]:
from sklearn.linear_model import LogisticRegression

# Feature engineering
train_df['low_income'] = train_df['income'] == 1  # higher chance of target
train_df['low_suppl'] = train_df['suppl'] < 2  # higher chance of target
train_df['mat_ill_cat'] = train_df['mat-illness'] == 2   # higher chance of target

# One-hot encode lifestyle, dropping the first category as the baseline.
lx_dummies = pd.get_dummies(train_df['lifestyle'], prefix="lx", drop_first=True)
# Drop any lx_* columns left by a previous run of this cell before attaching the
# fresh ones, so re-running the cell does not create duplicate column names.
train_df = pd.concat([train_df.drop(columns=lx_dummies.columns, errors="ignore"), lx_dummies], axis=1)

# Take the dummy column names from the data instead of hard-coding lx_1..lx_8,
# so the list always matches the categories actually present.
feature_cols = ['low_income', 'maternal', 'fam-history', 'mat-illness-past', 'low_suppl', 'mat_ill_cat',
                'meds'] + list(lx_dummies.columns)

X = train_df[feature_cols]
y = train_df.target

# Split X and y into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Displaying confusion matrix
display(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

# Recall on the hold-out split, computed from this cell's own predictions.
# The previous code read `cnf_matrix_tra`, which is only created in a later
# cell and therefore raised NameError on a fresh kernel.
cnf_matrix_fe = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
actual_pos = cnf_matrix_fe[1, 0] + cnf_matrix_fe[1, 1]
recall_metric = round(100 * cnf_matrix_fe[1, 1] / actual_pos, 0) if actual_pos else float("nan")
print("Recall metric in the testing dataset: {0}%".format(recall_metric))

metrics.f1_score(y_test, y_pred, average="binary")

- no improvement

binary classification with unbalanced data

https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-14-106
https://www.marcoaltini.com/blog/dealing-with-imbalanced-data-undersampling-oversampling-and-proper-cross-validation

**K-nearest neighbour**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

feature_cols = [
    'income', 'maternal', 'fam-history', 'mat-illness-past', 'suppl',
    'mat-illness', 'meds', 'env', 'lifestyle',
]

# Predictors and label
X, y = train_df[feature_cols], train_df.target

# Hold out part of the labelled data for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 5-nearest-neighbour classifier fitted on the training split
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Confusion matrix (true labels in rows, predictions in columns, with totals)
display(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
# recall_metric = round(100*cnf_matrix_tra[1,1]/(cnf_matrix_tra[1,0]+cnf_matrix_tra[1,1]),0)
# print("Recall metric in the train dataset: {0}%".format(recall_metric))

# F1 score of the positive class; as the last expression it is the cell output
metrics.f1_score(y_test, y_pred, average="binary")

# Disabled experiment: sweep k from 1 to 9 and collect the F1 score for each k
# scores = []

# for k in range(1,10):
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train, y_train)
#     y_pred = knn.predict(X_test)
#     scores.append([k, metrics.f1_score(y_test, y_pred, average="binary")])

# print(scores)

**Synthetic Minority Over-sampling Technique**

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report

feature_cols = ['income', 'maternal', 'fam-history', 'mat-illness-past', 'suppl', 'mat-illness', 'meds', 'env', 'lifestyle']

X = train_df[feature_cols]
y = train_df.target

# Split X and y into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)
print("Before oversampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before oversampling, counts of label '0': {} \n".format(sum(y_train==0)))

# Oversample the minority class on the training split only; the hold-out split
# keeps its original class balance.
sm = SMOTE(random_state=42)
# fit_resample is the current name of this method (fit_sample was removed from
# imbalanced-learn). The labels are passed as a plain ndarray via to_numpy()
# instead of the deprecated Series.ravel(); an ndarray is what the later
# y_train_res.ravel() calls expect.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After oversampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After oversampling, the shape of train_y: {} \n'.format(y_train_res.shape))
print("After oversampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After oversampling, counts of label '0': {}".format(sum(y_train_res==0)))

# Grid-search the inverse regularisation strength C over 1..10.
# NOTE(review): the folds are cut from already-resampled data, so validation
# folds contain synthetic points; the CV score is likely optimistic.
parameters = {
    'C': np.linspace(1, 10, 10)
             }
lr = LogisticRegression()
clf = GridSearchCV(lr, parameters, cv=5, verbose=5, n_jobs=3)
clf.fit(X_train_res, y_train_res.ravel())

clf.best_params_

In [None]:
# L1-penalised logistic regression trained on the SMOTE-resampled data.
# The solver is set explicitly: the default solver in current scikit-learn
# (lbfgs) rejects penalty='l1', while liblinear supports it.
# NOTE(review): C=2 is hard-coded rather than read from clf.best_params_ — confirm it matches the grid-search result.
lr1 = LogisticRegression(C=2, penalty='l1', solver='liblinear', verbose=5)
lr1.fit(X_train_res, y_train_res.ravel())

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current pyplot figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix with true labels in rows and predicted labels
        in columns.
    classes : sequence
        Tick labels for both axes, in matrix order.
    normalize : bool, default False
        If True, each row is divided by its total so cells show the share of
        each true class; cell text is then printed with two decimals.
        Previously this flag was accepted but ignored.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the cell colours.
    """
    if normalize:
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        row_totals[row_totals == 0] = 1.0  # leave all-zero rows as zeros instead of dividing by 0
        cm = cm / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Predict on the original (non-resampled) training split with the SMOTE-trained model
y_train_pre = lr1.predict(X_train)

cnf_matrix_tra = confusion_matrix(y_train, y_train_pre)

# Row 1 holds the actual positives: column 0 = missed, column 1 = correctly found
missed_pos, found_pos = cnf_matrix_tra[1,0], cnf_matrix_tra[1,1]
print("Recall metric in the train dataset: {}%".format(100*found_pos/(missed_pos+found_pos)))

# Draw the raw-count matrix on a fresh figure
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_tra, classes=class_names, title='Confusion matrix')
plt.show()

In [None]:
# Predict on the hold-out split with the SMOTE-trained model
y_pre = lr1.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pre)

# Row 1 holds the actual positives: column 0 = missed, column 1 = correctly found
missed_pos, found_pos = cnf_matrix[1,0], cnf_matrix[1,1]
print("Recall metric in the testing dataset: {}%".format(100*found_pos/(missed_pos+found_pos)))
#print("Precision metric in the testing dataset: {}%".format(100*cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[1,0])))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

In [None]:
# Fit lr1 on the resampled training data again and keep fit()'s return value as `tmp`.
# NOTE(review): scikit-learn estimators conventionally return self from fit(), so
# `tmp` is presumably the same object as `lr1` — confirm.
tmp = lr1.fit(X_train_res, y_train_res.ravel())

In [None]:
# Continuous decision scores for the hold-out split (not hard 0/1 labels),
# so the ROC curve can sweep over thresholds.
y_pred_sample_score = tmp.decision_function(X_test)

# False/true positive rate at each score threshold
fpr, tpr, thresholds = roc_curve(y_test, y_pred_sample_score)

# Area under the ROC curve
roc_auc = auc(fpr,tpr)

# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc)
plt.legend(loc='lower right')
# Dashed red diagonal from (0,0) to (1,1) for reference
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Fit the L1 logistic regression on the SMOTE-resampled training data
lr1.fit(X_train_res, y_train_res.ravel())

# Label the competition test set and store the labels on the frame
submission_labels = lr1.predict(test_df[feature_cols])
test_df['target'] = submission_labels

# Write the predicted labels, keyed by row index, as the submission file
test_df['target'].to_csv("./data/logreg_smote.csv", index_label="index", header="target")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

feature_cols = ['income', 'maternal', 'fam-history', 'mat-illness-past', 'suppl', 'mat-illness', 'meds', 'env', 'lifestyle']

X = train_df[feature_cols]
y = train_df.target

# Split X and y into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)
print("Before oversampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before oversampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=42)
# fit_resample is the current name of this method (fit_sample was removed from
# imbalanced-learn); labels are passed as a plain ndarray via to_numpy()
# instead of the deprecated Series.ravel().
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After oversampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After oversampling, the shape of train_y: {} \n'.format(y_train_res.shape))
print("After oversampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After oversampling, counts of label '0': {}".format(sum(y_train_res==0)))

# Tune the KNN classifier this cell is about. The grid search used to be run on
# `lr`, a LogisticRegression left over from an earlier cell, with a grid over
# its C parameter, while `knn` was fitted on the un-resampled data and never used.
parameters = {
    'n_neighbors': list(range(1, 10))
             }
knn = KNeighborsClassifier(n_neighbors=5)

# GridSearchCV fits clones of `knn` on the resampled data, so no separate
# knn.fit call is needed here.
clf = GridSearchCV(knn, parameters, cv=5, verbose=5, n_jobs=3)
clf.fit(X_train_res, y_train_res.ravel())

clf.best_params_

**XGBoost**

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

# Read files into dataframe
# Both frames are re-read from disk, so the mode imputation and the columns
# added to train_df/test_df by earlier cells are discarded from here on.
train_df = pd.read_csv("./data/health-diagnostics-train.csv", na_values=["#NULL!"])
test_df = pd.read_csv("./data/health-diagnostics-test.csv", na_values=["#NULL!"])

def replace_nulls(df):
    """Replace missing values with the sentinel -999 and cast columns to int.

    Every column whose dtype is not already the platform integer type is
    filled and converted; the frame is modified in place. The sentinel is used
    instead of the mode because xgboost can be told to treat a given value as
    missing.

    This definition replaces the mode-imputing helper of the same name defined
    earlier in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean in place.
    """
    for f in df.columns:
        if df[f].dtype != "int":
            # Assign the result back to the frame. fillna(..., inplace=True) on
            # the df[f] selection is a chained in-place write, which pandas
            # warns about and which does not update the frame under
            # Copy-on-Write.
            df[f] = df[f].fillna(-999).astype(int)

# Sentinel-fill (-999) and int-cast both frames in place.
replace_nulls(test_df)
replace_nulls(train_df)

# Split features and target into X and y
# Positional split: all columns but the last become X, the last column becomes y.
# NOTE(review): assumes 'target' is the last column of the training CSV — confirm.
X, y = train_df.iloc[:,:-1].values, train_df.iloc[:,-1].values

# Split X and y into train and test sets (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert train_df into Dmatrix
# NOTE(review): no `missing=` argument is passed to DMatrix here or below, while
# XGBClassifier is configured with missing=-999; the native-API models may treat
# -999 as an ordinary value — confirm this is intended.
train_set_dmatrix = xgb.DMatrix(X, label=y)

# Convert X_train and y_train into Dmatrix
train_dmatrix = xgb.DMatrix(X_train, label=y_train)

# Scale pos weight is sum of neg divided by sum of pos
# (computed on the full train_df, not only on the y_train split)
weight = (sum(train_df['target'] == 0)) / (sum(train_df['target'] == 1))

# # Setting kfolds for cv
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Hyperparameters - find max depth and min child weight first as they have higher impact
cv_params = {"max_depth": [3, 5, 7],            # how deep each tree can grow during any boosting round (~3-10)
             "min_child_weight": [1, 3, 5]}     # minimum sum of weights (smaller for highly imbalanced class)
ind_params = {"learning_rate": 0.1,             # step size shrinkage to prevent overfitting (~0-1)
              "n_estimators": 1000,              # number of trees
              "seed": 0,                        # for reproducibility
              "subsample": 0.8,                 # fraction of samples per tree (~0.5-0.9)
              "colsample_bytree": 0.8,          # fraction of features per tree (~0.5-0.9)
              "objective": "binary:logistic",   # returns predicted probability (or "reg: logistic")
              "scale_pos_weight": weight,       # high class imbalance
              "missing": -999}                  # xgb can handle missing values

# GridSearch evaluates a model with varying parameters to find the best possible combination
opt_gbm = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                       cv_params, 
                       scoring="f1",            # f1 is good in highly imbalanced data with very few true values
                       cv=5, 
                       n_jobs=-1) # -1 asks for all available processors

# Fit the 3 x 3 grid with 5-fold CV on the training split.
opt_gbm.fit(X_train, y_train)

In [None]:
# Predict the hold-out split with the fitted grid search
# (y_pred is not evaluated in this cell).
y_pred = opt_gbm.predict(X_test)

opt_gbm.best_params_                            # finding params for next step

In [None]:
# Second tuning stage: search learning rate and row subsampling while holding
# the tree-shape parameters fixed at the values picked in the first stage.
cv_params = dict(
    learning_rate=[0.1, 0.01],
    subsample=[0.7, 0.8, 0.9],
)
ind_params = dict(
    n_estimators=1000,
    seed=0,
    colsample_bytree=0.8,
    objective="binary:logistic",
    max_depth=5,                   # chosen as best param
    min_child_weight=5,            # chosen as best param
    scale_pos_weight=weight,
    missing=-999,
)

# 5-fold grid search scored by F1, using all available processors
opt_gbm = GridSearchCV(
    estimator=xgb.XGBClassifier(**ind_params),
    param_grid=cv_params,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)

opt_gbm.fit(X_train, y_train)

In [None]:
# # Hyperparameters - find learning rate and subsample next
# cv_params = {"subsample": [0.6, 0.7, 0.8, 0.9], 
#              "colsample_bytree": [0.6, 0.7, 0.8, 0.9]}
# ind_params = {"n_estimators": 100, 
#               "seed": 0, 
#               "colsample_bytree": 0.8, 
#               "objective": "binary:logistic", 
#               "max_depth": 3,                   # chosen as best param
#               "min_child_weight": 3,            # chosen as best param
#               "scale_pos_weight": weight,
#               "missing": -999}

# opt_gbm = GridSearchCV(xgb.XGBClassifier(**ind_params), 
#                        cv_params, 
#                        scoring="f1", 
#                        cv=kfold, 
#                        n_jobs=-1)

# opt_gbm.fit(X_train, y_train)

In [None]:
# Predict the hold-out split with the most recently fitted grid search
# (y_pred is not evaluated in this cell).
y_pred = opt_gbm.predict(X_test)

opt_gbm.best_params_                            # finding params for next step

In [None]:
# Booster parameters for the native xgboost API (xgb.cv / xgb.train).
# NOTE(review): "missing" is an argument of DMatrix / XGBClassifier; it is
# presumably not a booster parameter and may be ignored here — confirm.
our_params = {"eta": 0.01,                      # chosen as best param - aka learning rate
              "seed": 0, 
              "subsample": 0.7,                 # chosen as best param
              "colsample_bytree": 0.7,          # chosen as best param
              "objective": "binary:logistic", 
              "max_depth": 5, 
              "min_child_weight": 5, 
              "scale_pos_weight": weight,
              "missing": -999}

# CV estimates the performance of one set of parameters on unseen data
# NOTE(review): "error" is the plain misclassification rate, whereas the grid
# searches above were scored with F1 — confirm this is the intended metric for
# early stopping on such imbalanced data.
cv_xgb = xgb.cv(our_params, 
                train_dmatrix, 
                num_boost_round=1000,           # number of trees - aka n_estimators
                nfold=5,
                metrics=["error"],              # binary classification error rate (0.5 threshold)
                stratified=True,
                early_stopping_rounds=100)      # finish early if it does not improve for n rounds

In [None]:
# Determine final boost round
# Shows the last five rows of the CV history returned by xgb.cv.
cv_xgb.tail(5)

In [None]:
# Classifier configured with the tuned settings
xg_cl = xgb.XGBClassifier(
    n_estimators=54,      # last round from cv
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective="binary:logistic",
    scale_pos_weight=weight,
    missing=-999,
    seed=0,
)

xg_cl.fit(X_train, y_train)
y_pred = xg_cl.predict(X_test)

# Check the tuned model against the held-out split
# Confusion matrix
display(pd.crosstab(y_test, y_pred, rownames=["True"], colnames=["Predicted"], margins=True))

# Accuracy: share of hold-out rows predicted correctly
n_correct = np.sum(y_pred==y_test)
accuracy = float(n_correct) / y_test.shape[0]
print("accuracy: %f" % (accuracy*100))

# Native-API booster on the same training split, used for the importance plot
final_xgb = xgb.train(our_params, train_dmatrix, num_boost_round=54)       # last round from cv

# Plot feature importance
xgb.plot_importance(final_xgb);

In [None]:
# For our test prediction, we will train our model on entire train dataset
# train_set_dmatrix was built from all labelled rows (X, y); train_dmatrix,
# which was used here before, only holds the 70% training split and so
# contradicted the stated intent.
# NOTE(review): 144 rounds differs from the 54 rounds used for evaluation
# above — confirm which value the CV run actually supports.
final_xgb = xgb.train(our_params, 
                      train_set_dmatrix, 
                      num_boost_round=144)

# Convert dataframe into Dmatrix
test_set_dmatrix = xgb.DMatrix(test_df.values)

# Using our test dataset for prediction (predicted probabilities of class 1)
y_pred = final_xgb.predict(test_set_dmatrix)

# Threshold for converting probability into a 0/1 label
y_pred = np.where(y_pred > 0.5, 1, 0)

# Save results to csv for submission
results_df = pd.DataFrame({"index": test_df.index, "target": y_pred})
results_df.to_csv("./data/xgboost.csv", index=False)

In [None]:
# References
# (kept as comments: bare URLs in a code cell raise SyntaxError on Run All)
# https://cambridgespark.com/getting-started-with-xgboost/
# https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f
# https://machinelearningmastery.com/tune-learning-rate-for-gradient-boosting-with-xgboost-in-python/
# https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e
# https://medium.com/@mateini_12893/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde