In [202]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


# Get Data from FastAPI

In [203]:
URL = 'http://127.0.0.1:8000/get_data'
REQUEST_TIMEOUT_S = 10  # seconds; without a timeout requests can wait forever

# Fetch the dataset from the local API. Fail fast on a non-2xx reply so that an
# error payload is never parsed and turned into a DataFrame by mistake.
response = requests.get(URL, timeout=REQUEST_TIMEOUT_S)
response.raise_for_status()
data = response.json()

df = pd.DataFrame(data)

Unnamed: 0,prices,osv,sma_30,sma_5,z,time_diffs,time_last_event
0,1.07835,-0.705245,1.077967,1.078258,0,14,2
1,1.07720,-0.705245,1.077094,1.077066,1,55,57
2,1.07358,-0.705245,1.073065,1.072548,1,160,27
3,1.07200,-0.705245,1.071405,1.071736,1,63,15
4,1.07003,-0.705245,1.069878,1.069642,0,462,59
...,...,...,...,...,...,...,...
163,1.08675,-0.705245,1.086372,1.086652,0,79,13
164,1.08210,-0.705245,1.081534,1.081994,0,95,58
165,1.08362,-0.705245,1.083246,1.083546,1,199,110
166,1.07882,-0.705245,1.078110,1.078658,0,56,78


# Prepare data for training

In [204]:
# Build the modelling frame from the fetched payload:
#   * the payload's 'z' column is the target and is renamed to 'label';
#   * 'z' and 'z_2' are then re-created as the label from one and two rows earlier;
#   * rows containing any NaN are dropped (this includes the leading rows, which
#     have no earlier label), after which both lag columns are cast to int.
df = (
    pd.DataFrame(data)
    .rename(columns={'z': 'label'})
    .assign(
        z=lambda frame: frame['label'].shift(1),
        z_2=lambda frame: frame['label'].shift(2),
    )
    .dropna()
    .astype({'z': int, 'z_2': int})
)

# Target vector, and the feature matrix without the target, 'osv' and 'prices'.
y = df['label']
X = df.drop(columns=['label', 'osv', 'prices'])

# Hold out the last 20% of rows; shuffle=False keeps the original row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=.2)

Unnamed: 0,sma_30,sma_5,time_diffs,time_last_event,z,z_2
134,1.08443,1.08318,98,31,1,0
135,1.08361,1.081104,10,8,1,1
136,1.078702,1.078992,175,94,0,1
137,1.086927,1.082334,185,41,0,0
138,1.08278,1.080408,16,4,0,0
139,1.079693,1.078918,33,5,1,0
140,1.075324,1.075726,87,13,1,1
141,1.073007,1.073506,119,73,1,1
142,1.073148,1.073552,70,86,1,1
143,1.075004,1.075296,108,39,1,1


# Fit XGBoost Classifier

In [205]:
# Feature matrix: every column of df except the target, 'osv' and 'prices'.
X = df.drop(columns=['label', 'osv', 'prices'])
# Target vector.
y = df['label']

In [None]:

# Hold out the last 20% of rows; shuffle=False keeps the original row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=False)

RANDOM_STATE = 42  # fixed seed so repeated runs are comparable
N_CV_SPLITS = 5    # same number of folds GridSearchCV uses by default

bst = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_STATE,
)

# Hyper-parameter grid searched exhaustively (3 * 3 * 3 * 3 = 81 candidates).
# 'subsample' and 'colsample_bytree' can be added for a wider, slower search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.2]
}

# The features include lagged labels and the hold-out split is order-preserving,
# so cross-validation must respect row order as well. TimeSeriesSplit always
# validates on rows that come after the rows it trained on; the default
# stratified k-fold would train on later rows to score earlier ones.
grid_search = GridSearchCV(
    estimator=bst,
    param_grid=param_grid,
    scoring='accuracy',
    cv=TimeSeriesSplit(n_splits=N_CV_SPLITS),
)
grid_search.fit(X_train, y_train)

# Define the best model

In [None]:
# Score the held-out rows with the grid search's refit winner. GridSearchCV's
# predict_proba delegates to best_estimator_, so this is the same call as
# best_model.predict_proba(X_test).
pred_probs = grid_search.predict_proba(X_test)

# Keep a direct handle on the winning estimator; the save cell pickles it.
best_model = grid_search.best_estimator_


# Save model

In [208]:
FILENAME = "xgboost_classifier.pkl"

# Serialize the selected model to disk. The context manager closes the file
# handle (flushing buffered bytes) even if pickling raises, instead of leaving
# an anonymous open() handle for the garbage collector to clean up.
with open(FILENAME, "wb") as model_file:
    pickle.dump(best_model, model_file)