# Hyperparameter tuning

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Date window to download; the earliest possible date is 2017-06-17
from_date = '2017-09-01'
until_date = '2017-12-31'

# Folder settings -- neither path may end in "/"
local_data_folder = 'data/raw'
output_folder = 'data/processed'

# Location of the generated bash script that performs the actual download
download_script = './download_data.sh'

# Every calendar day between from_date and until_date as a 'YYYY-MM-DD' string
dates = pd.date_range(from_date, until_date, freq='D').strftime('%Y-%m-%d').tolist()

# Make sure the local download root exists before the script is generated
! mkdir -p {local_data_folder}

# We found it was more reliable to generate a bash script and run it, rather than
# run the commands in a python for-loop

# Write one guarded download stanza per date into the script
with open(download_script, 'w') as f:
    f.write("#!/bin/bash\n")
    # Abort the generated script on the first failing command / unset variable
    f.write("\nset -euo pipefail\n")
    f.write("\n# This script was generated to download data for multiple days\n")
    for date in dates:
        # Marker file: the stanza below only touches it after `aws s3 sync` has run,
        # and skips the download entirely when the marker already exists
        success_file =  os.path.join(local_data_folder, date, 'success')

        f.write("""
if [ ! -f {success_file} ]; then

    echo "Getting PDS dataset for date {date}"        
    mkdir -p {local_data_folder}/{date}
    aws s3 sync s3://deutsche-boerse-xetra-pds/{date} {local_data_folder}/{date} --no-sign-request
    touch {success_file}            
else
    echo "PDS dataset for date {date} already exists"
fi\n""".format(success_file=success_file, date=date, local_data_folder=local_data_folder))

        
# Make the generated script executable and preview its first lines
! chmod +x {download_script}     
! head -n 15 {download_script} 

#!/bin/bash

set -euo pipefail

# This script was generated to download data for multiple days

if [ ! -f data/raw/2017-09-01/success ]; then

    echo "Getting PDS dataset for date 2017-09-01"        
    mkdir -p data/raw/2017-09-01
    aws s3 sync s3://deutsche-boerse-xetra-pds/2017-09-01 data/raw/2017-09-01 --no-sign-request
    touch data/raw/2017-09-01/success            
else
    echo "PDS dataset for date 2017-09-01 already exists"
fi


In [6]:
# execute the download script to retrieve the data
!  {download_script}

PDS dataset for date 2017-09-01 already exists
PDS dataset for date 2017-09-02 already exists
PDS dataset for date 2017-09-03 already exists
PDS dataset for date 2017-09-04 already exists
PDS dataset for date 2017-09-05 already exists
PDS dataset for date 2017-09-06 already exists
PDS dataset for date 2017-09-07 already exists
PDS dataset for date 2017-09-08 already exists
PDS dataset for date 2017-09-09 already exists
PDS dataset for date 2017-09-10 already exists
PDS dataset for date 2017-09-11 already exists
PDS dataset for date 2017-09-12 already exists
PDS dataset for date 2017-09-13 already exists
PDS dataset for date 2017-09-14 already exists
PDS dataset for date 2017-09-15 already exists
PDS dataset for date 2017-09-16 already exists
PDS dataset for date 2017-09-17 already exists
PDS dataset for date 2017-09-18 already exists
PDS dataset for date 2017-09-19 already exists
PDS dataset for date 2017-09-20 already exists
PDS dataset for date 2017-09-21 already exists
PDS dataset f

In [3]:
import glob
from datetime import datetime

In [4]:
%%time
def load_csv_dirs(data_dirs):
    """Read every ``*.csv`` file found directly inside the given directories.

    Parameters
    ----------
    data_dirs : iterable of str
        Directories to scan (non-recursively). A one-shot iterator is fine;
        it is consumed exactly once.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise. Directories are processed in the
        order given and files within a directory in sorted name order, so the
        row order is reproducible across machines.

    Raises
    ------
    ValueError
        If no CSV file is found in any of the directories.
    """
    files = []
    for data_dir in data_dirs:
        # glob returns files in arbitrary, OS-dependent order; sort for determinism
        files.extend(sorted(glob.glob(os.path.join(data_dir, '*.csv'))))

    if not files:
        # pd.concat would fail with an opaque "No objects to concatenate"
        raise ValueError("No CSV files found in the given data directories")

    return pd.concat([pd.read_csv(path) for path in files], sort=False)

# One sub-directory per requested day; the paths are produced lazily and
# consumed once by the loader
data_dir = local_data_folder + '/'
data_subdirs = (data_dir + day for day in dates)
unprocessed_df = load_csv_dirs(data_subdirs)

CPU times: user 33.9 s, sys: 5.56 s, total: 39.5 s
Wall time: 1min 3s


In [5]:
# CalcTime pins every row to the dummy date 1900-01-01 so that only the time of
# day is compared against the trading-hour bounds below; CalcDateTime is the
# full timestamp built from the Date and Time columns.
unprocessed_df["CalcTime"] = pd.to_datetime("1900-01-01 " + unprocessed_df["Time"])
unprocessed_df["CalcDateTime"] = pd.to_datetime(unprocessed_df["Date"] + " " + unprocessed_df["Time"])

# Filter common stock
# Filter between trading hours 08:00 and 20:00 (both bounds inclusive)
# Exclude auctions (those are with TradedVolume == 0)
only_common_stock = unprocessed_df[unprocessed_df.SecurityType == 'Common stock']
time_fmt = "%H:%M"
opening_hours_str = "08:00"
closing_hours_str = "20:00"
# Parsed with pandas rather than datetime.strptime: a later cell rebinds the name
# `datetime` to the module, which made this cell fail when it was re-run.
# A time-only format yields the same 1900-01-01 date that CalcTime uses.
opening_hours = pd.to_datetime(opening_hours_str, format=time_fmt)
closing_hours = pd.to_datetime(closing_hours_str, format=time_fmt)

has_volume = only_common_stock.TradedVolume > 0
within_trading_hours = only_common_stock.CalcTime.between(opening_hours, closing_hours)
cleaned_common_stock = only_common_stock[has_volume & within_trading_hours]

In [6]:
# Rank the mnemonics by total traded volume and keep the most traded ones
number_of_stocks = 100
bymnemonic = cleaned_common_stock.groupby('Mnemonic')[['TradedVolume']].sum()
top = bymnemonic.sort_values('TradedVolume', ascending=False).head(number_of_stocks)
top_k_stocks = top.index.tolist()

# Restrict the trades to the selected stocks and index them by (stock, timestamp)
is_top_stock = cleaned_common_stock.Mnemonic.isin(top_k_stocks)
cleaned_common_stock = cleaned_common_stock[is_top_stock]
sorted_by_index = cleaned_common_stock.set_index(['Mnemonic', 'CalcDateTime']).sort_index()

# Days that still have at least one trade after filtering, in ascending order
non_empty_days = sorted(cleaned_common_stock['Date'].unique())

In [7]:
import datetime
def build_index(non_empty_days, from_time, to_time):
    """Build a minute-by-minute timestamp grid for the given days.

    Parameters
    ----------
    non_empty_days : iterable of str
        Days formatted as 'YYYY-MM-DD'.
    from_time, to_time : str
        First and last minute of each day as 'HH:MM' (both inclusive).

    Returns
    -------
    pandas.DataFrame
        A single column ``OrganizedDateTime`` holding one timestamp per minute
        per day, which is also used as the frame's index. Empty when
        ``non_empty_days`` is empty.
    """
    # Timestamps are built with pandas only, so the function works no matter
    # whether the name `datetime` is currently bound to the module or the class.
    date_ranges = [
        pd.DataFrame({
            "OrganizedDateTime": pd.date_range(
                "{} {}".format(day, from_time),
                "{} {}".format(day, to_time),
                freq='1min',
            ).values
        })
        for day in non_empty_days
    ]

    if date_ranges:
        agg = pd.concat(date_ranges, axis=0)
    else:
        # pd.concat refuses an empty list; return an empty grid instead
        agg = pd.DataFrame({"OrganizedDateTime": pd.DatetimeIndex([])})

    agg.index = agg["OrganizedDateTime"]
    return agg
# Flat array of every trading minute on the days that have trades
new_datetime_index = build_index(
    non_empty_days,
    from_time=opening_hours_str,
    to_time=closing_hours_str,
)["OrganizedDateTime"].values

In [8]:
def basic_stock_features(input_df, mnemonic, new_time_index, inplace=False):
    """Build the per-minute feature frame for a single stock.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Trades indexed by (Mnemonic, timestamp). The rows of ``mnemonic`` are
        selected with ``input_df.loc[mnemonic]``.
    mnemonic : str
        Stock to extract.
    new_time_index : array-like of datetime64
        Timestamps the result is re-indexed to; timestamps without a trade
        become new rows.
    inplace : bool, default False
        Accepted for backward compatibility only. The re-index always produces
        a new frame, so ``input_df`` is never modified either way.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``new_time_index`` with prices carried forward
        from the last traded minute, volume / trade counts set to 0 for minutes
        without trades, calendar features derived from the timestamp, and a
        constant ``Mnemonic`` column. Descriptive columns (ISIN, SecurityDesc,
        SecurityType, Currency, SecurityID, Date, Time, CalcTime) are dropped.
    """
    # Read from the frame that was passed in rather than a notebook global
    stock = input_df.loc[mnemonic].reindex(new_time_index)

    # Minutes without trades keep the last known prices ...
    price_cols = ['MinPrice', 'MaxPrice', 'EndPrice', 'StartPrice']
    stock[price_cols] = stock[price_cols].ffill()

    # ... and have no activity
    activity_cols = ['TradedVolume', 'NumberOfTrades']
    stock[activity_cols] = stock[activity_cols].fillna(0.0)

    timestamps = stock.index
    stock['HourOfDay'] = timestamps.hour
    stock['MinOfHour'] = timestamps.minute
    stock['MinOfDay'] = timestamps.hour*60 + timestamps.minute

    stock['DayOfWeek'] = timestamps.dayofweek
    stock['DayOfYear'] = timestamps.dayofyear
    stock['MonthOfYear'] = timestamps.month
    if hasattr(timestamps, 'isocalendar'):
        # DatetimeIndex.weekofyear no longer exists in current pandas
        stock['WeekOfYear'] = timestamps.isocalendar().week.to_numpy(dtype='int64')
    else:
        # older pandas without isocalendar()
        stock['WeekOfYear'] = timestamps.weekofyear

    stock['Mnemonic'] = mnemonic
    unwanted_features = ['ISIN', 'SecurityDesc', 'SecurityType', 'Currency', 'SecurityID', 'Date', 'Time', 'CalcTime']
    return stock.drop(unwanted_features, axis=1)

In [9]:
# `prepared` stacks the per-stock feature frames of all top k stocks:
# - one row per stock for every minute in new_datetime_index, i.e. every minute
#   from 08:00 until 20:00 on the days for which there were trades
#   (that means excluding weekends and holidays)
# - in minutes without trades the prices from the last available minute are carried forward
# - TradedVolume and NumberOfTrades are filled with zero for such minutes
# NOTE: no HasTrade column is created here; a row with NumberOfTrades == 0
# denotes a minute without trades.
stocks = [
    basic_stock_features(sorted_by_index, mnemonic, new_datetime_index, inplace=True)
    for mnemonic in top_k_stocks
]
prepared = pd.concat(stocks, axis=0)
prepared['Mnemonic'] = prepared['Mnemonic'].astype('category')

In [10]:
def create_xgb_target (df, group_col = 'Mnemonic'):
    """Return the regression target: the MaxPrice of the following row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``MaxPrice`` column, ordered by time within each stock.
    group_col : str or None, default 'Mnemonic'
        Column identifying the stock. When present in ``df`` the shift is done
        per group, so the last row of one stock no longer receives the first
        price of the next stock in the stacked frame. Pass ``None`` (or a
        frame without that column) to shift over the whole frame.

    Returns
    -------
    pandas.Series
        ``MaxPrice`` shifted one row back; the trailing gap the shift leaves
        (per group when grouping) is forward-filled.
    """
    if group_col is not None and group_col in df.columns:
        keys = df[group_col]
        next_max = df.groupby(group_col, sort=False, observed=True)['MaxPrice'].shift(-1)
        # forward-fill within each stock only, for the same reason as the shift
        return next_max.groupby(keys, sort=False, observed=True).ffill()

    return df['MaxPrice'].shift(-1).ffill()

In [11]:
def create_xgb_features (df, horizon, inplace = False, group_col = 'Mnemonic'):
    """Add lagged copies of the price / volume columns as model features.

    For every ``offset`` in ``1..horizon`` the columns ``h{offset}_MinPrice``,
    ``h{offset}_MaxPrice``, ``h{offset}_StartPrice``, ``h{offset}_EndPrice``,
    ``h{offset}_TradeVolume`` and ``h{offset}_NumberOfTrades`` are added; each
    holds the source column's value from ``offset`` rows earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing MinPrice, MaxPrice, StartPrice, EndPrice,
        TradedVolume and NumberOfTrades, ordered by time within each stock.
    horizon : int
        Number of lags to generate.
    inplace : bool, default False
        When True the columns are added to ``df`` itself, otherwise to a copy.
    group_col : str or None, default 'Mnemonic'
        Column identifying the stock. When present, lags are computed per
        group so the first rows of one stock are not filled with the last rows
        of the previous stock in the stacked frame. Pass ``None`` (or a frame
        without that column) to lag over the whole frame.

    Returns
    -------
    pandas.DataFrame
        ``df`` (if ``inplace``) or its copy, with the lag columns appended.
    """
    n_df = df if inplace else df.copy ()

    # (source column, suffix of the generated column). The volume suffix is
    # 'TradeVolume' rather than 'TradedVolume'; kept so column names stay unchanged.
    lagged_columns = [
        ('MinPrice', 'MinPrice'),
        ('MaxPrice', 'MaxPrice'),
        ('StartPrice', 'StartPrice'),
        ('EndPrice', 'EndPrice'),
        ('TradedVolume', 'TradeVolume'),
        ('NumberOfTrades', 'NumberOfTrades'),
    ]
    base = n_df[[source for source, _ in lagged_columns]]

    keys = None
    if group_col is not None and group_col in n_df.columns:
        keys = n_df[group_col]

    for offset in range(1, horizon+1):
        # The first `offset` rows have no history; back-fill them with the
        # earliest available value (per stock when grouping).
        if keys is None:
            lagged = base.shift (offset).bfill ()
        else:
            lagged = base.groupby (keys, sort=False, observed=True).shift (offset)
            lagged = lagged.groupby (keys, sort=False, observed=True).bfill ()

        for source, suffix in lagged_columns:
            # assign positionally; the stacked frame repeats index values
            n_df["h{}_{}".format (offset, suffix)] = lagged[source].values

    return n_df

In [12]:
%%time
# Add the lag features directly onto `prepared` (inplace=True, so xgb_data and
# prepared refer to the same frame), then attach the prediction target.
xgb_data = create_xgb_features (prepared, horizon=5, inplace=True)
xgb_data['NextMaxPrice'] = create_xgb_target (df=xgb_data)

CPU times: user 1.05 s, sys: 8.48 s, total: 9.53 s
Wall time: 9.61 s


In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. random_state pins the shuffle so that
# the split (and therefore the exported CSV files) is the same on every run.
train_data, validate_data = train_test_split (
    xgb_data, train_size=0.8, test_size=0.2, shuffle=True, random_state=42)

# Move the target to the first column
# (presumably because the training job reads the label from column 0 -- TODO confirm)
cols = ['NextMaxPrice'] + [col for col in train_data.columns if col != 'NextMaxPrice']

# One-hot encode the categorical columns. dtype is fixed to uint8 so the dummies
# are written as 0/1 to CSV on every pandas version (newer releases default to bool).
train_data = pd.get_dummies (train_data[cols], dtype='uint8')
validate_data = pd.get_dummies (validate_data[cols], dtype='uint8')



In [37]:
train_output_folder = 'data/hpo/train'
validate_output_folder = 'data/hpo/validate'
! mkdir -p {train_output_folder}
! mkdir -p {validate_output_folder}
train_features.to_csv(train_output_folder + '/train.csv', header=False)
validate_features.to_csv(validate_output_folder + '/validate.csv', header=False)

In [38]:
# List the training output folder to confirm train.csv was written and check its size
!ls -lh {train_output_folder}

total 1.7G
-rw-r--r-- 1 825712516 1896053708 1.7G Dec 17 00:57 train.csv


In [14]:
## In case of memory constraint...
import gc
# Drop the large intermediate frames. pop() instead of `del` keeps the cell
# re-runnable: names that are already gone are simply skipped.
# Note: xgb_data was built with inplace=True and is the same object as
# `prepared`, so that memory is only reclaimed once `prepared` is dropped too.
for _name in ('xgb_data', 'unprocessed_df', 'only_common_stock', 'cleaned_common_stock'):
    globals().pop(_name, None)
del _name
gc.collect()

21

## Using the AWS Console
1. Upload `train.csv` and `validate.csv` to an S3 bucket in your preferred region (one where SageMaker is available).
1. TODO: describe the remaining console steps.

In [1]:
import boto3

In [6]:
# Create the client from the session so the configured region is actually used
# (boto3.client() would ignore `session` and fall back to the default region)
session = boto3.Session (region_name='eu-west-2')
s3 = session.client ('s3')

In [8]:
# Resource-level handle on the target bucket, created from the region-specific
# session. Note that this rebinds `s3` from the low-level client to a resource.
s3 = session.resource ('s3')
bucket = s3.Bucket ('jasbarto-ml-bucket')

In [None]:
# Upload both exported datasets, using the local relative path as the object key.
# put_object() only accepts keyword arguments (Key=..., Body=...) and does not
# read a local file by itself; upload_file(Filename, Key) does.
for local_path in ('data/hpo/train/train.csv', 'data/hpo/validate/validate.csv'):
    bucket.upload_file (local_path, local_path)

In [5]:
import sagemaker

ModuleNotFoundError: No module named 'sagemaker'