# xgboost

In [None]:
# Run this in Jupyter's Terminal
# pip install xgboost

In [None]:
import pandas as pd
import numpy as np

import mlflow

import xgboost as xgb
from sklearn.metrics import f1_score

from data_utils import get_train_test_split_for_stock
from config import *

# Retrieve data

In [None]:
# Load the train/test partitions from the data file configured in config.py.
X_train, X_test, y_train, y_test = get_train_test_split_for_stock(
    PATH_TO_DATA_FILE
)

# Show the shape of every partition as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Set experiment

In [None]:
# Make "SP_EXP_Modelling" the active MLflow experiment for the runs started below.
mlflow.set_experiment("SP_EXP_Modelling")
# Turn on MLflow's automatic logging for xgboost training calls.
# NOTE(review): exactly what gets logged (params, metrics, model artifacts)
# depends on the installed MLflow version - confirm against its docs.
mlflow.xgboost.autolog()

# Run the experiment

In [None]:
# Wrap both partitions in xgboost's DMatrix container (features + labels).
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Scores strictly above this cut-off are mapped to class 1, the rest to 0.
threshold = 0.5

with mlflow.start_run(run_name='xgboost') as run:

    # Train with the library's default parameters (empty params dict).
    model = xgb.train(params={}, dtrain=dtrain)

    # Continuous scores for the test partition, binarised at the threshold.
    preds = model.predict(dtest)
    y_bin = np.where(preds > threshold, 1., 0.).tolist()
    f1 = f1_score(y_test, y_bin)

    # Tag the run and record the test-set F1 explicitly.
    mlflow.set_tag("Model_name", "xgboost")
    mlflow.log_metric(key="testing_f1_score", value=f1)

In [None]:
# Display the F1 score computed on the test partition in the run above.
f1

# Show
- conda.yaml for the model including xgboost
- feature importance

## Predict

In [None]:
import mlflow
# Artifact location of a previously logged model.
# NOTE(review): this path embeds a specific experiment/run id, so it only
# resolves on the machine that produced that run - update it before re-running.
logged_model = '/data/artifacts/3/2649944bc9b04632ba26bcca5e968112/artifacts/model'

# Load model as a PyFuncModel
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Score the held-out features with the reloaded model.
y_hat = loaded_model.predict(X_test)

# Binarise at 0.5 and show the first ten predicted labels.
# NOTE(review): 0.5 duplicates the `threshold` variable set in the training cell.
np.where(y_hat > 0.5, 1, 0)[:10]

In [None]:
#y_hat[:8]

In [None]:
#X_test.iloc[:8]

## Predict with data that violates the model signature

In [None]:
# Lag-feature column names "t-10" ... "t-1", followed by the label column.
cols = [f"t-{lag}" for lag in range(10, 0, -1)]
cols.append("target")
print(cols)

In [None]:
# Feature columns only: everything except the trailing "target" entry.
cols[:-1]

In [None]:
# Create empty dataframe
input_df = pd.DataFrame(columns=cols[:-1])

# One element (t-8) is string, not float
input_vector = {'t-10': 1, 't-9': 1, 't-8': "a", 't-7': 0, 't-6': 0, 't-5': 0, 't-4': 1, 't-3': 1, 't-2': 1, 't-1': 1}
input_df = input_df.append(input_vector, ignore_index = True)

input_df

In [None]:
# Inference on the malformed row. This is expected to fail with
# MlflowException: Incompatible input types for column t-8. Can not safely convert object to int64.
scores = loaded_model.predict(input_df)
np.where(scores > 0.5, 1, 0)

## Predict with data that matches the model signature

In [None]:
# Create empty dataframe
input_df = pd.DataFrame(columns=cols[:-1])

# All vector elements are correct
input_vector = {'t-10': 1, 't-9': 1, 't-8': 1, 't-7': 0, 't-6': 0, 't-5': 0, 't-4': 1, 't-3': 1, 't-2': 1, 't-1': 1}
input_df = input_df.append(input_vector, ignore_index = True)

input_df

In [None]:
# Inference: score the well-formed row, then binarise the score at 0.5.
scores = loaded_model.predict(input_df)
np.where(scores > 0.5, 1, 0)