# IHUB GCP Pilot training module

## 1. Get data from BigQuery  
Let's install the BigQuery package first.

In [None]:
%pip install google-cloud-bigquery

### 1.1 Using the Jupyter cell magic to get data from BigQuery

In [None]:
# Load the BigQuery IPython extension so the `%%bigquery` cell magic used in the next cell is available.
%load_ext google.cloud.bigquery

In [None]:
%%bigquery loan
-- Fetch 10 rows from the Lending Club loan table; the magic binds the result to `loan`.
SELECT *
FROM `hsbc-9553155-ihubhk-dev.public_dataset_dev.lending_club_loan`
LIMIT 10

In [None]:
# Preview the first rows of the result bound to `loan` by the `%%bigquery` cell.
loan.head()

### 1.2 Using the old-fashioned way

In [None]:
from google.cloud import bigquery
# No project or credentials are passed here, so the client relies on whatever
# defaults the runtime environment supplies.
client = bigquery.Client()

In [None]:
# Same table as in 1.1, this time queried through the Python client:
# ten loans restricted to grade 'A'.
sql = """
SELECT *
FROM `hsbc-9553155-ihubhk-dev.public_dataset_dev.lending_club_loan`
WHERE grade = 'A'
LIMIT 10
"""
# Submit the query, then materialise its result as a DataFrame.
grade_a_job = client.query(sql)
df = grade_a_job.to_dataframe()
df.head()

## 2. Now let's build a simple model to predict the interest rate of the loan

### 2.1 GitHub – clone the forked repository

In [None]:
#!git clone https://alm-github.systems.uk.hsbc/IHUBHK/pilot_training.git

### 2.2 Xgboost modeling

In [None]:
%pip install xgboost pandas

In [None]:
import xgboost as xgb
import pandas as pd

In [None]:
# Build the modelling frame: three loan/income features, one derived ratio,
# and the target column `int_rate` (last column).
# NOTE(review): the ratio divides by annual_inc -- confirm the table has no zero incomes.
sql = """
SELECT 
    loan_amnt,
    installment * 12 AS annual_loan,
    annual_inc,
    installment * 12 / annual_inc AS loan_to_income_ratio,
    int_rate

FROM `hsbc-9553155-ihubhk-dev.public_dataset_dev.lending_club_loan`
LIMIT 1000
"""
# Submit the query, then materialise its result as a DataFrame.
features_job = client.query(sql)
df = features_job.to_dataframe()
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Select the features by name (the first four columns of the query) rather than by
# position, so the split does not silently change if the column order does.
feature_columns = ['loan_amnt', 'annual_loan', 'annual_inc', 'loan_to_income_ratio']

# A fixed random_state makes the 70/30 split -- and every score derived from it --
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[feature_columns],
    df.int_rate,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Wrap each split in XGBoost's DMatrix container.
# The target is divided by 100 before being attached as the label
# (presumably converting a percentage into a fraction -- confirm against the table).
D_train = xgb.DMatrix(data=X_train, label=Y_train / 100)
D_test = xgb.DMatrix(data=X_test, label=Y_test / 100)

In [None]:
# Booster settings for the native `xgb.train` API.
# The target (int_rate / 100) is continuous and the model is scored with R2 / MSE,
# so train with a regression objective and regression metrics rather than the
# binary-classification setup ('binary:logistic' with AUC / AMS).
param = {
    'eta': 0.9,        # learning rate -- NOTE(review): very aggressive, consider lowering
    'max_depth': 50,   # NOTE(review): very deep for ~700 training rows, likely to overfit
    'objective': 'reg:squarederror',
    'nthread': 4,
    'eval_metric': ['rmse', 'mae'],
}

steps = 50  # The number of training iterations

In [None]:
# Fit the booster on the training DMatrix for `steps` boosting rounds.
model = xgb.train(params=param, dtrain=D_train, num_boost_round=steps)

### 2.3 Moment of truth

In [None]:
import numpy as np
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

# Score the held-out split. The model was trained on labels divided by 100,
# so these predictions are on that scaled range as well.
preds = model.predict(D_test)

#ref: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html
#ref: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [None]:
# The model was trained on int_rate / 100 while Y_test still holds the unscaled
# int_rate, so bring the predictions back to the original scale before scoring;
# comparing the two scales directly makes every metric below meaningless.
preds_rescaled = preds * 100

print("R2 Score = {}".format(r2_score(Y_test, preds_rescaled)))
print("Mean Squared Error = {}".format(mean_squared_error(Y_test, preds_rescaled)))
print("Explained variance score = {}".format(explained_variance_score(Y_test, preds_rescaled)))

### 2.4 Let's try more

In [None]:
from sklearn.model_selection import GridSearchCV

# Various hyper-parameters to tune
# XGBRegressor (gradient-boosted trees) is the right estimator here: learning_rate and
# n_estimators are boosting knobs, whereas the random-forest wrapper XGBRFRegressor
# expects learning_rate to stay at 1, so tuning it there only shrinks the predictions.
xgb1 = xgb.XGBRegressor()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],  # current name of the deprecated 'reg:linear'
              'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30], #so called `eta` value
              'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
              'min_child_weight': [1, 3, 5, 7 ],
              'verbosity': [1],  # replaces the removed `silent` flag (silent=0 == not silent)
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

# 6 x 8 x 4 = 192 candidate settings, each fitted on 2 folds.
xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=False)

# The target is divided by 100, matching the scaling used for the DMatrix labels above.
xgb_grid.fit(X_train, Y_train/100)

In [None]:
# Report the best cross-validated score found by the grid search and the
# parameter combination that achieved it.
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

### 2.3 Let's wrap things up  
Save the model artifact using pickle.

In [None]:
import pickle

In [None]:
# save the model to disk
filename = 'finalized_model.sav'
# Use a context manager so the file handle is closed (and the pickle fully flushed)
# as soon as the dump finishes, instead of leaking the handle until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_grid.best_estimator_, model_file)


### 2.4 Revive the model for CI/CD

In [None]:
def ml_score(data, model_path='finalized_model.sav'):
    """Score `data` with the pickled model stored at `model_path`.

    Args:
        data: Feature rows, passed unchanged to the loaded model's ``predict``.
        model_path: Location of the pickled model. Defaults to
            'finalized_model.sav', the file name used when the model was saved.

    Returns:
        Whatever the loaded model's ``predict`` returns for `data`.

    Raises:
        FileNotFoundError: If `model_path` does not exist.

    Note:
        Unpickling can execute arbitrary code -- only load model files that
        come from a trusted source.
    """
    # The context manager closes the file handle even if unpickling fails.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    return loaded_model.predict(data)

In [None]:
# Smoke-test the reloaded model by scoring the held-out features.
ml_score(X_test)

### 2.5 Don't forget to git push

In [None]:
#!cd /opt/jupyter/notebook/pilot_training/ && git add . && git commit -m "update" && git push

### 3.0 References:   
https://towardsdatascience.com/a-beginners-guide-to-xgboost-87f5d4c30ed7  
https://www.kaggle.com/jayatou/xgbregressor-with-gridsearchcv   
https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/