In [1]:
import pandas as pd
import datetime as dt
import pickle
import random

import yfinance as yf
from pandas_datareader import data as pdr
yf.pdr_override()
import talib as ta
from talib import abstract

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

In [2]:
class ModelTrain:
    """
    Model Train methods
    """
    
    def __init__(self, model, data):
        self.model = model
        self.data = data
        self.x_train = None
        self.y_train = None
        self.preprocess = None
        
    def create_features(self, window=3):
        df = self.data.copy()

        #Bollinger Bands
        df['upper_band'], df['middle_band'], df['lower_band'] = ta.BBANDS(df['Close'], timeperiod =20)

        #Stochastic Oscillators
        df['slowk'], df['slowd'] = ta.STOCH(df['High'], df['Low'], df['Close'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)

        #RSI
        df['rsi_14'] = ta.RSI(df['Close'], 14)

        #Absolute Price Osicillator
        df["apo"] = ta.APO(df['Close'], fastperiod=12, slowperiod=26, matype=0)

        #Exponential Moving Average
        df["ema_7"] = ta.EMA(df['Close'], timeperiod=7)

        #ratios
        df["ema_midband"] = df["middle_band"] / df["ema_7"]
        df["close_ema"] = df["Close"] / df["ema_7"]
        df["close_upband"] = df["Close"] / df["upper_band"]
        df["close_lowband"] = df["Close"] / df["lower_band"]
        df["close_midband"] = df["Close"] / df["middle_band"]
        df["slowk_slowd"] = df['slowk'] / df['slowd']
        df["rsi_pct_change"] = df["rsi_14"].pct_change()
        df["rsi_stoch"] = df["rsi_14"] / ((df['slowk'] + df['slowd'])/2)

        df["close_open"] = df["Close"] - df["Open"]
        df["close_high"] = df["Close"] - df["High"]
        df["close_low"] = df["Close"] - df["Low"]

        for i in range(1, window+1):
            for col in df.loc[:, "upper_band":"close_low"].columns:
                df[col+ "_lag_" + str(i)] = df[col].shift(i)  
        
        df.drop(columns=["Close", "Open", "High", "Low"], inplace=True)
        
        attr_num = list(set(df.columns) - set(["open", "low", "high", "close", "Target"]))
        
        num_pipeline = Pipeline([
                       ('scale', StandardScaler())
                       ])

        preprocessing_pipeline = ColumnTransformer([
                                 ('num', num_pipeline, attr_num)
                                 ])

        new_predictors = list(set(df.columns) - set(["Target"]))
        
        self.data = df
        self.x_train = preprocessing_pipeline.fit_transform(df[new_predictors])
        self.y_train = df["Target"]
        self.preprocess = preprocessing_pipeline
    
    def evaluate(self, n_splits=10, gap=0, scoring="accuracy"):
        print("Time Series Validation in progress!!")
        kf = TimeSeriesSplit(n_splits=n_splits, gap=gap)
        cv_results = cross_val_score(self.model, self.x_train, self.y_train, cv=kf, scoring= scoring)
        print(f" Accuracy Score : {round(cv_results.mean(), 5)} +- ({round(cv_results.std(), 5)}) {cv_results}")
        
    def fit(self):
        self.model.fit(self.x_train, self.y_train)
        
    def model_save(self, output_file):
        with open(output_file, 'wb') as f_out: #wb write and binary
            pickle.dump((self.preprocess, self.model), f_out)
            
    def get_xdata(self):
        return self.data

In [3]:
#fetch data from yahoo finance
def get_data(stocks, start, end, interval="1d"):
    """
    fetch data from yahoo finance and drop unnessary columns
    """
    data = pdr.get_data_yahoo(stocks, start=start, end=end, interval=interval)
    cleaned = data.drop(columns=["Volume", "Adj Close"])
    cleaned = cleaned.fillna(method= 'ffill')
    cleaned.sort_index(inplace=True) 
    return cleaned.reset_index()

def create_target(df, class_label =2, auto=True, upper=None, lower=None, quantile=0.75):
    """ 
    Create classification label based on quantiles
    """
    def map_target(pip):
        if pip > upper:
            return "buy"
        elif pip < lower:
            return "sell"
        else:
            return "hold"
        
    def up_down(pip):
        if pip > 0:
            return "buy"
        else:
            return "sell"
        
    row_random = random.choice([i for i in range(0, len(df))])
    check_decimals = len(str(df.loc[row_random, "Close"]).split(".")[1])
    
    #currency and commodaties have different decimal places for calculating pips
    if check_decimals == 2:
        pip_multiplier = 10
    else:
        pip_multiplier = 10**4
    
    df["pips"] = df["Close"].diff().shift(-1) * pip_multiplier
    df.dropna(inplace=True)
    
    if auto == True:
        upper = df["pips"].quantile(quantile)
        lower = df["pips"].quantile(1-quantile)
    
    if class_label == 2:
        df["Target"] = df["pips"].apply(up_down)
    else: 
        df["Target"] = df["pips"].apply(map_target)
        print(f"upper: {upper}")
        print(f"lower: {lower}")
    
    df.drop(columns="pips", inplace=True)
    return df.set_index("Date")

## Data Loading

In [4]:
start = dt.datetime(2004, 1, 1)
end = dt.date.today()
print(f"Start Date:{start}")
print(f"End Date:{end}")

Start Date:2004-01-01 00:00:00
End Date:2022-10-09


In [121]:
end.weekday()

6

In [13]:
myr = get_data("GBPMYR=X", start=start, end=end, interval='1d')
usd = get_data("GBPUSD=X", start=start, end=end, interval='1d')
usd = create_target(usd)
myr = create_target(myr)
currencies = [myr, usd]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [8]:
le = LabelEncoder() #re-classing the target
usd["Target"] = le.fit_transform(usd["Target"])

mapping = {}
for each_class in le.classes_:
    mapping[int(le.transform([each_class]))] = each_class

mapping

{0: 'buy', 1: 'sell'}

In [25]:
print(str(usd.iloc[-1].name).split()[0])

2022-10-07


In [15]:
str("gelo" +"h")

'geloh'

### GBPUSD

In [67]:
model_xgb = XGBClassifier(random_state= 42, verbosity = 0, use_label_encoder=False)
model_lgbm = LGBMClassifier(random_state= 42)

xgb_usd = ModelTrain(model_xgb, usd)
xgb_usd.create_features()
xgb_usd.evaluate()

Time Series Validation in progress!!
 Accuracy Score : 0.66072 +- (0.11492) [0.52370203 0.4717833  0.48984199 0.64785553 0.7268623  0.69300226
 0.75620767 0.78329571 0.75846501 0.75620767]


In [73]:
lgbm_usd = ModelTrain(model_lgbm, usd)
lgbm_usd.create_features()
lgbm_usd.evaluate()

Time Series Validation in progress!!
 Accuracy Score : 0.66366 +- (0.13175) [0.52370203 0.42889391 0.5282167  0.55530474 0.75620767 0.74040632
 0.72234763 0.79683973 0.78781038 0.79683973]


In [82]:
lgbm_usd.fit()
lgbm_usd.model_save("model/lgbm_usd.bin")

### GBPMYR

In [88]:
myr["Target"] = le.transform(myr["Target"])

xgb_myr = ModelTrain(model_xgb, myr)
xgb_myr.create_features()
xgb_myr.evaluate()

Time Series Validation in progress!!
 Accuracy Score : 0.66599 +- (0.10287) [0.51473923 0.51927438 0.51020408 0.67573696 0.75283447 0.75963719
 0.72562358 0.75283447 0.76190476 0.68707483]


In [89]:
lgbm_myr = ModelTrain(model_lgbm, myr)
lgbm_myr.create_features()
lgbm_myr.evaluate()

Time Series Validation in progress!!
 Accuracy Score : 0.67256 +- (0.1021) [0.53968254 0.52154195 0.52154195 0.63038549 0.7414966  0.72562358
 0.75510204 0.76643991 0.76643991 0.75736961]


In [90]:
lgbm_myr.fit()
lgbm_myr.model_save("model/lgbm_myr.bin")

### Model Load

In [10]:
model_lgbm = LGBMClassifier(random_state= 42)
input_file_usd = f'model/lgbm_usd.bin'

with open(input_file_usd, "rb") as f_in:
    preprocess_usd, model_usd = pickle.load(f_in)

lgbm_usd = ModelTrain(model_lgbm, usd)
lgbm_usd.create_features()

lgbm_data = lgbm_usd.get_xdata()
lgbm_data = pd.DataFrame(lgbm_data.iloc[-1,:]).T


In [12]:
preprocess_usd(lgbm_data)

TypeError: 'ColumnTransformer' object is not callable

In [11]:
lgbm_data

Unnamed: 0,Target,upper_band,middle_band,lower_band,slowk,slowd,rsi_14,apo,ema_7,ema_midband,...,close_ema_lag_3,close_upband_lag_3,close_lowband_lag_3,close_midband_lag_3,slowk_slowd_lag_3,rsi_pct_change_lag_3,rsi_stoch_lag_3,close_open_lag_3,close_high_lag_3,close_low_lag_3
2022-10-07,1.0,1.184497,1.127515,1.070532,82.642156,76.817133,43.962496,-0.022783,1.119416,1.007235,...,1.007996,0.936162,1.039709,0.985222,1.287761,-0.021309,0.904543,0.000409,-0.019177,0.005262


In [117]:
X = lgbm_p.transform(pd.DataFrame(X[1:]).T)
lgbm_only.predict(X)

array([1])

In [118]:
y = lgbm_only.predict(X)
y[0]

1