In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mplfinance as mpf
import talib as ta
from talib import abstract
import random

import warnings
warnings.simplefilter(action='ignore')

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

In [2]:
def create_target(df, auto=True, upper=None, lower=None, quantile=0.75):
    
    def map_target(pip):
        if pip > upper:
            return "buy"
        elif pip < lower:
            return "sell"
        else:
            return "hold"
        
    def up_down(pip):
        if pip > 0:
            return "buy"
        else:
            return "sell"
        
    row_random = random.choice([i for i in range(0, len(df))])
    check_decimals = len(str(df.loc[row_random, "Close"]).split(".")[1])
    
    #currency and commodaties have different decimal places for calculating pips
    if check_decimals == 2:
        pip_multiplier = 10
    else:
        pip_multiplier = 10**4
    
    df["pips"] = df["Close"].diff().shift(-1) * pip_multiplier
    df.dropna(inplace=True)
    
    if auto == True:
        upper = df["pips"].quantile(quantile)
        lower = df["pips"].quantile(1-quantile)
    
    df["Target"] = df["pips"].apply(up_down)
    df.drop(columns="pips", inplace=True)
    
    #print(f"upper: {upper}")
    #print(f"lower: {lower}")
    return df.set_index("Date")

In [3]:
data = pd.read_csv("src/currency_exchange_updated.csv", date_parser=["Date"])
data["Date"] = pd.to_datetime(data["Date"])

#get XAUUSD data
gold = data[data["Currency"] == "XAU_USD"]
print(f"Row : {gold.shape[0]}")
gold.drop(columns="Currency",inplace=True)
gold.sort_values(by="Date", inplace=True)

gold_df = create_target(gold, quantile=0.75)

Row : 4627


In [4]:
le = LabelEncoder() #re-classing the target
gold_df["Target"] = le.fit_transform(gold_df["Target"])

mapping = {}
for each_class in le.classes_:
    mapping[int(le.transform([each_class]))] = each_class

mapping

{0: 'buy', 1: 'sell'}

## Feature Engineering

In [5]:
def create_technical_indicators(df_original, window=3):
    df = df_original.copy()
    
    #Bollinger Bands
    df['upper_band'], df['middle_band'], df['lower_band'] = ta.BBANDS(df['Close'], timeperiod =20)
    
    #Stochastic Oscillators
    df['slowk'], df['slowd'] = ta.STOCH(df['High'], df['Low'], df['Close'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)

    #RSI
    df['rsi_14'] = ta.RSI(df['Close'], 14)

    #Absolute Price Osicillator
    df["apo"] = ta.APO(df['Close'], fastperiod=12, slowperiod=26, matype=0)
    
    #Exponential Moving Average
    df["ema_7"] = ta.EMA(df['Close'], timeperiod=7)
    
    #ratios
    df["ema_midband"] = df["middle_band"] / df["ema_7"]
    df["close_ema"] = df["Close"] / df["ema_7"]
    df["close_upband"] = df["Close"] / df["upper_band"]
    df["close_lowband"] = df["Close"] / df["lower_band"]
    df["close_midband"] = df["Close"] / df["middle_band"]
    df["slowk_slowd"] = df['slowk'] / df['slowd']
    df["rsi_pct_change"] = df["rsi_14"].pct_change()
    df["rsi_stoch"] = df["rsi_14"] / ((df['slowk'] + df['slowd'])/2)
    
    df["close_open"] = df["Close"] - df["Open"]
    
    for i in range(1, window+1):
        for col in df.loc[:, "upper_band":"close_open"].columns:
            df[col+ "_lag_" + str(i)] = df[col].shift(i)  

    return df

featured_df = create_technical_indicators(gold_df)
featured_df.dropna(inplace=True)

### Candlesticks

In [6]:
def create_candlesticks(df):
    df.rename(columns={'Open': 'open', 'High': 'high','Low': 'low','Close': 'close'}, inplace= True)
    
    #candlestick_features
    candlesticks = ta.get_function_groups()["Pattern Recognition"]
    for indicator in candlesticks:
        df[str(indicator)] = getattr(abstract, indicator)(df)

    #remove less-common candlestick
    removed = []
    for candle in candlesticks:
        non_detected = df[candle].value_counts()[0]
        if non_detected > 4000:
            removed.append(candle)
            df.drop(columns=candle, inplace=True)
    
    attr_cat = list(set(candlesticks) - set(removed))
    return df, attr_cat

#engineered_features, attr_cat = create_candlesticks(featured_df)
#new_predictors = list(set(featured_df.columns) - set(["open", "low", "high", "close", "Target"]))
#attr_num = list(set(new_predictors)-set(attr_cat))

## Model

In [6]:
## 1 - categorical pipeline
#cat_pipeline = Pipeline(steps = [
#    ('encode', OneHotEncoder(handle_unknown='ignore'))
#])

num_pipeline = Pipeline([
    ('scale', StandardScaler())
])

preprocessing_pipeline = ColumnTransformer([
    #('cat', cat_pipeline, attr_cat),
    ('num', num_pipeline, attr_num)
])

new_predictors = list(set(featured_df.columns) - set(["open", "low", "high", "close", "Target"]))

train = preprocessing_pipeline.fit_transform(featured_df[new_predictors])
y_train = featured_df["Target"]

NameError: name 'attr_num' is not defined

### Validation

In [25]:
def val_score(model, X_train, y_train, n_splits=10):
    kf = TimeSeriesSplit(n_splits=n_splits, gap=0)
    cv_results = cross_val_score(model, X_train, y_train, cv=kf, scoring= "accuracy")
    print(f" Accuracy Score : {round(cv_results.mean(), 5)} +- ({round(cv_results.std(), 5)}) {cv_results}")
    
val_score(model, train, y_train)

 Accuracy Score : 0.50512 +- (0.02469) [0.54634146 0.54146341 0.47317073 0.52682927 0.5        0.49268293
 0.4902439  0.50487805 0.47073171 0.50487805]


In [33]:
model = XGBClassifier(random_state=51, verbosity = 0)
model.fit(train, y_train)
predictions = model.predict(test)
results =pd.DataFrame({"y_true": y_test, "preds": predictions})
accuracy_score(y_test,  predictions)

0.5309734513274337

In [34]:
featured_df.tail()

Unnamed: 0_level_0,Close,Open,High,Low,Target,upper_band,middle_band,lower_band,slowk,slowd,...,ema_7_lag_3,ema_midband_lag_3,close_ema_lag_3,close_upband_lag_3,close_lowband_lag_3,close_midband_lag_3,slowk_slowd_lag_3,rsi_pct_change_lag_3,rsi_stoch_lag_3,close_open_lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-09-29,1660.29,1659.88,1665.12,1641.05,1,1739.970148,1681.0135,1622.056852,27.400534,15.380393,...,1656.974801,1.019994,0.978633,0.92931,0.991611,0.95945,0.574134,-0.120122,2.386164,-23.83
2022-09-30,1659.67,1661.13,1675.57,1658.93,0,1736.347704,1678.425,1620.502296,37.876336,26.749291,...,1649.931101,1.021477,0.987193,0.933922,1.001297,0.966437,0.400974,0.112453,4.565716,6.81
2022-10-03,1699.22,1661.3,1701.8,1659.1,0,1734.78122,1677.8755,1620.96978,55.830532,40.369134,...,1652.308325,1.018456,1.004316,0.952985,1.021634,0.986116,1.663251,0.371509,3.440455,30.18
2022-10-04,1726.24,1699.4,1729.73,1694.73,1,1739.058711,1679.13,1619.201289,75.502047,56.402971,...,1654.303744,1.016146,1.003619,0.954206,1.023571,0.987672,1.781524,0.006768,1.940887,0.41
2022-10-05,1715.81,1726.42,1728.1,1700.15,0,1738.727916,1679.036,1619.344084,92.010541,74.447707,...,1655.645308,1.013759,1.002431,0.95584,1.02417,0.988826,1.415975,-0.003717,1.280054,-1.46


### Model Save

In [12]:
import pickle

output_file = f'model/xgb_model.bin'

with open(output_file, 'wb') as f_out: #wb write and binary
    pickle.dump((preprocessing_pipeline, model), f_out)

### Model Load

In [1]:
import pickle
input_file = f'model/xgb_model.bin'

with open(input_file, "rb") as f_in:
    preprocess, model = pickle.load(f_in)

In [9]:
test = featured_df[featured_df.index >= "2022-05-01"][new_predictors]
test.head()

Unnamed: 0_level_0,apo_lag_1,ema_midband_lag_2,apo_lag_2,close_open_lag_3,CDLSHORTLINE,middle_band_lag_2,slowk_lag_2,close_lowband_lag_2,slowk_slowd,ema_7_lag_3,...,ema_midband,upper_band_lag_3,middle_band_lag_1,rsi_stoch_lag_2,upper_band_lag_1,slowd_lag_3,close_ema_lag_1,slowk,lower_band_lag_1,rsi_14
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-05-02,-3.495256,1.013721,0.932179,-19.71,0,1938.9855,11.575012,1.00597,1.080105,1918.874318,...,1.019462,1993.158724,1937.5905,3.527376,1996.201075,14.162963,0.993579,14.085376,1878.979925,33.012576
2022-05-03,-9.084808,1.01516,-3.495256,8.2,0,1937.5905,13.461867,1.009271,0.919241,1912.740739,...,1.021951,1994.872989,1934.107,3.259199,2001.196316,10.572145,0.98186,12.169865,1867.017684,34.793673
2022-05-04,-15.849487,1.019462,-9.084808,1.97,0,1934.107,14.085376,0.997725,0.979506,1908.655554,...,1.02195,1996.201075,1931.32,2.434006,2004.30713,10.943022,0.988335,12.728167,1858.33287,39.430112
2022-05-05,-22.496218,1.021951,-15.849487,-33.32,0,1931.32,12.169865,1.005089,1.211505,1897.184165,...,1.021933,2001.196316,1929.1145,2.738699,2005.289737,13.040752,0.996573,16.865628,1852.939263,38.507287
2022-05-06,-26.409615,1.02195,-22.496218,4.74,0,1929.1145,12.728167,1.015257,1.251485,1889.835624,...,1.020543,2004.30713,1926.3565,3.065791,2005.813085,13.239036,0.995755,21.181504,1846.899915,40.627261


In [11]:
X = preprocess.transform(test)
model.predict(X)

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0])