# Chicago taxi tips Regression Pipeline local

In [None]:
from sklearn.model_selection import train_test_split

#### Create datasets compatible with xgboost model

In [None]:
def create_sets(data,label_column):
    ### Exercise template: every `...` below is a placeholder to fill in before running.
    ### Goal: separate the label from the features, split both into train/test
    ### parts, and return the two parts wrapped as xgboost DMatrix objects.
    ### `data` is presumably a pandas DataFrame and `label_column` the name of
    ### one of its columns -- confirm against the caller.
    ### NOTE: `xgboost` is imported in a later cell, so that cell must have run
    ### before this function is called.
    ### get the label by selecting only the label column
    label=...
    ### get the features by dropping the label column from the input
    data=...
    ### use the imported train_test_split function with a suitable test_size and a fixed random state
    X_train, X_test, y_train, y_test = train_test_split...
    ### Prepare the DMatrix objects that xgboost trains and evaluates on
    dtrain = xgboost.DMatrix(...)
    dtest = xgboost.DMatrix(...)
    return dtrain, dtest 
    

#### Create callback to use xgboost with tensorboard

In [None]:
# install custom python client to use tensorboard with non tensorflow models
!pip install tensorboardX

In [None]:
# import the tensorboard custom client dependency
from tensorboardX import SummaryWriter

In [None]:
# This function builds a callback that writes xgboost evaluation metrics to a TensorBoard summary
def TensorBoardCallback(training_log_path):
    """Build an xgboost training callback that logs metrics to TensorBoard.

    Args:
        training_log_path: Path handed to ``SummaryWriter``; the TensorBoard
            event files are written there.

    Returns:
        A function taking the xgboost callback environment ``env``. For every
        ``(name, value)`` pair in ``env.evaluation_result_list`` it prints the
        pair and records it as a scalar at step ``env.iteration``.
    """
    writer = SummaryWriter(training_log_path)

    def callback(env):
        for k, v in env.evaluation_result_list:
            print(k,v)
            writer.add_scalar(k, v, env.iteration)
        # Nothing else ever flushes or closes this writer, so push the events
        # to disk after each boosting round; otherwise the latest metrics can
        # sit in the writer's buffer and be missing from TensorBoard.
        writer.flush()

    return callback

#### Create the main train function

In [None]:
### install xgboost package
!pip install xgboost==1.1.0

In [None]:
import xgboost

In [None]:
def xgboost_train(data,label,num_iterations,training_log_path,booster_params):
    """Train an xgboost model and log its evaluation metrics for TensorBoard.

    Args:
        data: Input dataset; presumably holds both the features and the label
            column (confirm once the placeholder below is filled in).
        label: Name of the label column.
        num_iterations: Number of boosting rounds; a falsy value means 20.
        training_log_path: Path passed to ``TensorBoardCallback`` for the
            training logs.
        booster_params: Booster parameters, or a falsy value for none. Keys
            that are missing receive the defaults below. The caller's dict is
            left untouched.

    Returns:
        The model returned by ``xgboost.train``.
    """
    ### create the train and test objects with the function defined above
    dtrain, dtest = ...

    # Start from the defaults and let the caller's values override them.
    # Working on a fresh dict avoids writing the defaults back into the
    # dictionary object the caller passed in.
    params = {
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'learning_rate': 0.3,
        'min_split_loss': 0,
        'max_depth': 6,
    }
    params.update(booster_params or {})

    num_iterations = num_iterations or 20

    ### train the model, evaluating on both sets at every round so the
    ### callback receives train and test metrics
    model = xgboost.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_iterations,
        evals=[(dtrain, 'train'), (dtest, 'test')],
        callbacks=[TensorBoardCallback(training_log_path)]
    )
    return model

### Test the training

##### Create minio client to get data

In [None]:
# installing dependencies: pyarrow to read & process the parquet format
!pip install pyarrow

In [None]:
from minio import Minio
import urllib3
from io import BytesIO
import pandas as pd
import pyarrow
import datetime
import os

In [None]:
## Create a MinIO client with the access key and the secret key you were given;
## refer to the previous technical sessions if you need help.
## The `...` below is a placeholder for the client arguments.
client = Minio(
    ...
)

In [None]:
bucket_name = ''  # TODO: set your bucket name (presumably formatted as firstname-name)
object_name = 'datasets/chicago/trips.parquet'  # name of the parquet object to fetch from the bucket

In [None]:
# Get data from minio using get_object, wrap the returned bytes with BytesIO and read the parquet result with pandas; refer to the previous technical sessions for help
# `response` starts as None so the cleanup below can tell whether get_object
# ever returned; assign the get_object result to `response` in the try body.
response = None
try:
    ...
    # Read data from response.
    ...
    data = ...
finally:
    # Release the connection only if a response was actually obtained.
    # Without this guard a failing get_object would trigger a NameError here
    # and hide the real error.
    if response is not None:
        response.close()
        response.release_conn()

In [None]:
# Preview the first rows to check that the dataset loaded as expected
data.head()

In [None]:
# Remove the trip_start_timestamp column, then discard every row that still
# contains a missing value.
data = data.drop(columns=["trip_start_timestamp"]).dropna()

In [None]:
# Launch a training run; replace the `...` placeholder with the prepared data first.
xgboost_train(
    data=...,
    label='tips',
    num_iterations=20,    
    # The timestamp suffix gives every run its own log directory.
    training_log_path=f"training/fit/taxi-trips/{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}",
    # An empty dict means the defaults set inside xgboost_train are used.
    booster_params={}
)

### Now connect to your tensorboard and check the loss

1. Go to the Kubeflow interface and click on 'tensorboard'

![menu_tenso](./images/menu_tenso.png)

2. click on 'new' 

![new](./images/new.png)

3. link it to your lab, where you persist your training logs

![board](./images/board.png)

4. Click `connect`; you should now be able to see your training metrics

![tenso](./images/tenso.png)