## Import Packages

In [1]:
import statistics
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

In [2]:
# Mount Google Drive at /gdrive; the CSV paths used below live under
# /gdrive/MyDrive. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


## Load Dataset

In [3]:
# prepare x_train, y_train, x_valid, y_valid
def prepare_dataset(name1, name2, start_year, end_year, validation_ratio, history_window, seed=None):
    """Build shuffled sliding-window samples of the price ratio of two stocks.

    The two CSV files are inner-joined on ``Date`` and cut down to the rows from
    the first match of ``start_year`` to the last match of ``end_year``.  The
    ratio ``Close_x / Close_y`` is then sliced into overlapping windows of
    ``history_window + 1`` consecutive values (stride 1).  Every window is
    z-scored on its own; its first ``history_window`` values become the feature
    columns ``x_1 .. x_n`` and its last value becomes the target column ``y``.

    Args:
        name1: File stem of the first stock (numerator of the ratio).
        name2: File stem of the second stock (denominator of the ratio).
        start_year: String searched for in ``Date``; the first matching row
            starts the range.  Matching is a substring/regex search
            (``str.contains``), not a parsed-date comparison.
        end_year: String searched for in ``Date``; the last matching row ends
            the range (inclusive).
        validation_ratio: Fraction of the shuffled windows kept for validation.
        history_window: Number of past ratio values used as features.
        seed: Optional seed for the row shuffle.  ``None`` (default) keeps the
            previous behaviour of shuffling with NumPy's global random state.

    Returns:
        ``(x_all, y_all, x_train, y_train, x_valid, y_valid)``.  ``y_all`` is a
        Series; ``y_train`` and ``y_valid`` are single-column DataFrames.

    Raises:
        IndexError: If ``start_year`` or ``end_year`` matches no row.
        ValueError: If the selected range is shorter than one window.
    """

    # intersect two stocks price on date
    x = pd.read_csv("/gdrive/MyDrive/專題/data/dataset1/{}.csv".format(name1))
    y = pd.read_csv("/gdrive/MyDrive/專題/data/dataset1/{}.csv".format(name2))
    x_y = pd.merge(left=x, right=y, how="inner", on="Date")

    # select part of dataframe in date range
    # (the merged frame has a fresh 0..n-1 index, so the labels double as positions)
    start_mask = x_y["Date"].str.contains(start_year)
    first_row = x_y[start_mask].index[0]
    end_mask = x_y["Date"].str.contains(end_year)
    last_row = x_y[end_mask].index[-1]
    x_y = x_y.iloc[first_row:last_row+1, :]

    # select close price
    x = np.array(x_y["Close_x"])
    y = np.array(x_y["Close_y"])
    spread = x/y
    print("mean of spread: ", np.mean(spread))

    window_size = history_window + 1
    n_windows = len(spread) - window_size + 1
    if n_windows < 1:
        raise ValueError(
            "date range has {} rows but one sample needs {}".format(len(spread), window_size)
        )

    # create new dataframe
    # NOTE(review): mean/std are taken over the whole row, target included,
    # whereas Environment.testing normalises the history window only.
    # A constant window has std 0 and produces nan/inf values here.
    values = []
    for start in range(n_windows):
        row = spread[start:start+window_size]
        row = (row - np.mean(row)) / np.std(row)
        values.append(row)
    values = np.array(values)

    # shuffle row
    # NOTE(review): neighbouring windows share all but one value, so after
    # shuffling the validation rows overlap heavily with training rows and the
    # validation error is an optimistic estimate.
    if seed is None:
        np.random.shuffle(values)
    else:
        np.random.default_rng(seed).shuffle(values)

    column_name = ["x_" + str(i+1) for i in range(history_window)]
    column_name.append("y")

    df = pd.DataFrame(data=values, columns=column_name)

    # split dataframe into x_train, y_train, x_valid, y_valid
    idx = int(len(df)*(1-validation_ratio))
    train_df = df.iloc[:idx, :]
    valid_df = df.iloc[idx:, :]

    # report the sizes of the actual splits (they always add up to the total)
    print("total length: ", len(df))
    print("training length: ", len(train_df))
    print("validation length: ", len(valid_df))

    x_all = df[column_name[:-1]]
    y_all = df[column_name[-1]]

    x_train = train_df[column_name[:-1]]
    y_train = train_df[[column_name[-1]]]

    x_valid = valid_df[column_name[:-1]]
    y_valid = valid_df[[column_name[-1]]]

    return x_all, y_all, x_train, y_train, x_valid, y_valid

In [16]:
# Build the training data: MA / CMCSA close-price ratio over the rows matched by
# "2011" through "2016", 120 past values per sample, 15% of the shuffled
# windows held out for validation.
x_all, y_all, x_train, y_train, x_valid, y_valid = prepare_dataset(
    name1="MA",
    name2="CMCSA",
    start_year="2011",
    end_year="2016",
    validation_ratio=0.15,
    history_window=120
)

mean of spread:  3.1538585112567152
total length:  1390
training length:  1181
validation length:  208


## Train Model

In [17]:
# Fit the regressor on the training windows, monitoring (x_valid, y_valid) and
# stopping once the monitored score has not improved for 5 boosting rounds.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds in the
# constructor rather than in fit() — confirm against the installed version.
model = XGBRegressor(n_estimators=350, learning_rate=0.04, max_depth=3)
model.fit(x_train, y_train, 
            early_stopping_rounds=5,
            eval_set=[(x_valid, y_valid)], 
            verbose=False)
predictions = model.predict(x_valid)
# Mean absolute error on the validation windows, in z-score units (the targets
# are z-scored by prepare_dataset). The documented argument order is
# (y_true, y_pred); MAE is symmetric, so the swapped order gives the same value.
# NOTE(review): the same split also drives early stopping, and its rows overlap
# with shuffled training windows, so this figure is presumably optimistic.
mean_absolute_error(predictions, y_valid)



0.2496813848912078

In [18]:
# Refit the same estimator on every window (x_all / y_all contain the training
# and the validation rows); no eval_set or early stopping is passed here.
model.fit(x_all, y_all)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.04, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=350,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

## Testing Environment

In [19]:
class Environment():
    """Backtest of a model-driven pairs-trading rule on two stocks.

    For every simulated day the last ``estimation_win`` values of the price
    ratio ``x / y`` are z-scored and passed to ``model.predict``.  A prediction
    of at least 1 sells one share of x and buys ``ratio`` shares of y, a
    prediction of at most -1 does the opposite, anything in between leaves the
    positions untouched.  A proportional transaction cost is charged on every
    leg that is traded.
    """

    def __init__(
        self,

        # model
        model,

        # date range
        start_year,
        end_year,

        # name of two stocks
        x_stock_name,
        y_stock_name,

        # estimation info
        estimation_win,

        # trade info
        original_pv=1000,
        transaction_cost=0.001425

    ):
        """Load the test prices and store the backtest settings.

        Args:
            model: Object with a ``predict(DataFrame)`` method that accepts one
                row with the columns ``x_1 .. x_<estimation_win>``.
            start_year: String searched for in ``Date``; first match starts the range.
            end_year: String searched for in ``Date``; last match ends the range.
            x_stock_name: File stem of the first stock (numerator of the ratio).
            y_stock_name: File stem of the second stock (denominator of the ratio).
            estimation_win: Number of past ratio values fed to the model.
            original_pv: Starting cash.
            transaction_cost: Cost charged per trade, as a fraction of traded value.
        """

        # model
        self.model = model

        # prepare testing data
        self.x_test, self.y_test = self.prepare_data(x_stock_name, y_stock_name, start_year, end_year)
        self.ratio_test = self.x_test / self.y_test
        print("Mean of Spread Ratio: ", np.mean(self.ratio_test))

        # estimation info
        self.estimation_win = estimation_win

        # trade info
        self.original_pv = original_pv
        self.transaction_cost = transaction_cost

    # prepare testing data
    def prepare_data(self, name1, name2, start_year, end_year):
        """Return the close prices of both stocks on their common dates.

        The two CSV files are inner-joined on ``Date`` and restricted to the
        rows from the first match of ``start_year`` to the last match of
        ``end_year`` (substring/regex match via ``str.contains``).

        Returns:
            Tuple ``(x, y)`` of NumPy arrays with the ``Close`` prices of
            ``name1`` and ``name2``.

        Raises:
            IndexError: If ``start_year`` or ``end_year`` matches no row.
        """

        # intersect two stocks price on date
        x = pd.read_csv("/gdrive/MyDrive/專題/data/dataset1/{}.csv".format(name1))
        y = pd.read_csv("/gdrive/MyDrive/專題/data/dataset1/{}.csv".format(name2))
        x_y = pd.merge(left=x, right=y, how="inner", on="Date")

        # select part of dataframe in date range
        # (the merged frame has a fresh 0..n-1 index, so the labels double as positions)
        start_mask = x_y["Date"].str.contains(start_year)
        first_row = x_y[start_mask].index[0]
        end_mask = x_y["Date"].str.contains(end_year)
        last_row = x_y[end_mask].index[-1]
        x_y = x_y.iloc[first_row:last_row+1, :]

        # select close price
        x = np.array(x_y["Close_x"])
        y = np.array(x_y["Close_y"])

        return x, y

    # testing
    def testing(self):
        """Run the backtest over the loaded price series.

        Returns:
            Tuple ``(pv_history, action_history, spread_history)`` of lists
            with one entry per simulated day: the portfolio value before that
            day's trade, the action taken (0 = buy x / sell y, 1 = sell x /
            buy y, 2 = hold) and the rounded price ratio.  All three lists are
            empty when the series is not longer than ``estimation_win``.
        """

        # info
        print("Number of day: ", len(self.x_test))

        # trade info initialization
        cash = self.original_pv
        x_count = 0
        y_count = 0

        # begin / end index
        begin_idx = self.estimation_win
        end_idx = len(self.ratio_test)

        # log history
        pv_history = []
        action_history = []
        spread_history = []

        # column name for model input
        column_name = ["x_" + str(i+1) for i in range(self.estimation_win)]

        # trading loop
        # A bounded range simply yields no iterations when the series is shorter
        # than the window, instead of running past the end of the data.
        # "Today" is row current_idx-1, so the final row is never traded on.
        for current_idx in range(begin_idx, end_idx):

            # today info
            x_price = round(self.x_test[current_idx-1], 6)
            y_price = round(self.y_test[current_idx-1], 6)
            multiplier = round(self.ratio_test[current_idx-1], 6)
            spread_history.append(multiplier)

            # calculate pv (mark-to-market before today's trade)
            pv = cash + (x_price * x_count) + (y_price * y_count)
            pv_history.append(pv)

            # get action: z-score the window that ends today and ask the model
            temp_x = self.x_test[current_idx-self.estimation_win:current_idx]
            temp_y = self.y_test[current_idx-self.estimation_win:current_idx]
            temp_sp = temp_x / temp_y
            temp_sp = (temp_sp - np.mean(temp_sp)) / np.std(temp_sp)
            temp_sp = temp_sp.reshape(1, -1)
            df = pd.DataFrame(data=temp_sp, columns=column_name)
            prediction = self.model.predict(df)[0]

            if prediction >= 1:
                action = 1
            elif prediction <= -1:
                action = 0
            else:
                action = 2

            action_history.append(action)

            # execute action
            # buy 1 x sell ratio y
            if action == 0:

                # close an opposite position (short x / long y) first
                if x_count < 0 and y_count > 0:
                    cash += x_count * x_price
                    cash -= abs(x_count * x_price) * self.transaction_cost
                    x_count = 0

                    cash += y_count * y_price
                    cash -= y_count * y_price * self.transaction_cost
                    y_count = 0

                # buy 1 x
                cash -= x_price
                cash -= x_price * self.transaction_cost
                x_count += 1

                # sell ratio y
                cash += multiplier * y_price
                cash -= (multiplier * y_price) * self.transaction_cost
                y_count -= multiplier

            # sell 1 x buy ratio y
            elif action == 1:

                # close an opposite position (long x / short y) first
                if x_count > 0 and y_count < 0:
                    cash += x_count * x_price
                    cash -= x_count * x_price * self.transaction_cost
                    x_count = 0

                    cash += y_count * y_price
                    cash -= abs(y_count * y_price) * self.transaction_cost
                    y_count = 0

                # sell 1 x
                cash += x_price
                cash -= x_price * self.transaction_cost
                x_count -= 1

                # buy ratio y
                cash -= multiplier * y_price
                cash -= (multiplier * y_price) * self.transaction_cost
                y_count += multiplier

        return pv_history, action_history, spread_history

## Test

In [21]:
# Backtest setup: MA / CMCSA over the rows matched by "2017", using the model
# fitted above and the same 120-value window that it was trained with.
env = Environment(
    # model
    model=model,

    # date range
    start_year="2017",
    end_year="2017",

    # name of two stocks
    x_stock_name="MA",
    y_stock_name="CMCSA",

    # estimation / episode info
    estimation_win=120,

    # trade info
    original_pv=1000,
    transaction_cost=0.001425
)

Mean of Spread Ratio:  3.5301483357304857


In [22]:
# Run the backtest; each returned list has one entry per simulated day.
pv_history, action_history, spread_history = env.testing()

Number of day:  251


In [23]:
def pv_sharpe_ratio(pv):
    """Return the Sharpe-style ratio of a portfolio-value series.

    The ratio is the mean of the period-over-period relative changes divided
    by their sample standard deviation, scaled by ``sqrt(len(pv))``.
    """
    # Pair each value with its predecessor; the leading 1 is only a
    # placeholder predecessor for the very first value.
    daily_changes = [
        (current - previous) / previous
        for previous, current in zip([1, *pv], pv)
    ]

    # Discard the change that was measured against the placeholder.
    del daily_changes[0]

    mean_change = sum(daily_changes) / len(daily_changes)
    volatility = statistics.stdev(daily_changes)

    # convert into annual sharpe ratio
    return (mean_change / volatility) * (len(pv) ** 0.5)

def pv_mdd(pv):
    """Return the maximum drawdown of a portfolio-value series.

    The drawdown at each point is the relative drop from the highest value
    seen so far, ``(peak - value) / peak``; the largest such drop is returned.
    A series that never falls below an earlier value yields ``0.0``.  The
    result exceeds 1 when the value falls below zero.

    Args:
        pv: Sequence of portfolio values, assumed to start positive so that
            every running peak is positive.

    Returns:
        The maximum drawdown as a float.

    Raises:
        ValueError: If ``pv`` is empty.
    """
    arr = np.asarray(pv, dtype=float)
    if arr.size == 0:
        raise ValueError("pv must contain at least one value")

    # Highest value reached up to and including each position.
    running_peak = np.maximum.accumulate(arr)
    drawdowns = (running_peak - arr) / running_peak
    return float(drawdowns.max())

def pv_return(pv):
    """Return the total relative change from the first to the last value of ``pv``."""
    final_value = pv[-1]
    initial_value = pv[0]
    gain = final_value - initial_value
    return gain / initial_value

In [24]:
# Sharpe-style ratio of the backtest's portfolio values.
pv_sharpe_ratio(pv_history)

1.1543497140049666

In [25]:
# Drawdown of the backtest's portfolio values; a result above 1 means the
# portfolio value fell below zero at some point.
pv_mdd(pv_history)

2.441231250921115

In [26]:
# Relative change between the first and the last recorded portfolio value.
pv_return(pv_history)

-0.6482945979558026

In [27]:
# Save the portfolio-value history to xgboost.pickle in the current working
# directory.
with open('xgboost.pickle', 'wb') as f:
    pickle.dump(pv_history, f)