# Feature engineering

## Obtain the data

In [3]:
import os
import pandas as pd

In [7]:
# Earliest possible date is 2017-06-17
from_date = '2017-08-01'
until_date = '2017-09-30'

local_data_folder = 'data/raw' # do not end in /
output_folder = 'data/processed' # do not end in /

download_script = './download_data.sh'

dates = list(pd.date_range(from_date, until_date, freq='D').strftime('%Y-%m-%d'))

! mkdir -p {local_data_folder}

# We found it was more reliable to generate a bash script and run it, rather than
# run the commands in a python for-loop

with open(download_script, 'w') as f:
    f.write("#!/bin/bash\n")
    f.write("\nset -euo pipefail\n")
    f.write("\n# This script was generated to download data for multiple days\n")
    for date in dates:
        success_file =  os.path.join(local_data_folder, date, 'success')

        f.write("""
if [ ! -f {success_file} ]; then

    echo "Getting PDS dataset for date {date}"        
    mkdir -p {local_data_folder}/{date}
    aws s3 sync s3://deutsche-boerse-xetra-pds/{date} {local_data_folder}/{date} --no-sign-request
    touch {success_file}            
else
    echo "PDS dataset for date {date} already exists"
fi\n""".format(success_file=success_file, date=date, local_data_folder=local_data_folder))

        
! chmod +x {download_script}     
! head -n 15 {download_script} 

#!/bin/bash

set -euo pipefail

# This script was generated to download data for multiple days

if [ ! -f data/raw/2017-08-01/success ]; then

    echo "Getting PDS dataset for date 2017-08-01"        
    mkdir -p data/raw/2017-08-01
    aws s3 sync s3://deutsche-boerse-xetra-pds/2017-08-01 data/raw/2017-08-01 --no-sign-request
    touch data/raw/2017-08-01/success            
else
    echo "PDS dataset for date 2017-08-01 already exists"
fi


In [8]:
# execute the download script to retrieve the data
!  {download_script}

Getting PDS dataset for date 2017-08-01
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-08-01_BINS_XETR00.csv to data/raw/2017-08-01/2017-08-01_BINS_XETR00.csv
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-08-01_BINS_XETR01.csv to data/raw/2017-08-01/2017-08-01_BINS_XETR01.csv
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-08-01_BINS_XETR02.csv to data/raw/2017-08-01/2017-08-01_BINS_XETR02.csv
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-08-01_BINS_XETR03.csv to data/raw/2017-08-01/2017-08-01_BINS_XETR03.csv
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-08-01_BINS_XETR06.csv to data/raw/2017-08-01/2017-08-01_BINS_XETR06.csv
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-08-01_BINS_XETR05.csv to data/raw/2017-08-01/2017-08-01_BINS_XETR05.csv
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-08-01_BINS_XETR04.csv to data/raw/2017-08-01/2017-08-01_BINS_XETR04.csv
download: s3://deutsche-boerse-xetra-pds/2017-08-01/2017-

## Cleanse the data

In [9]:
import pandas as pd
import numpy as np
import glob, os
from datetime import datetime
import statsmodels.api as sm

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
mpl.rcParams['figure.figsize'] = (15, 10) # use bigger graphs

In [11]:
def load_csv_dirs(data_dirs):
    files = []
    for data_dir in data_dirs:
        files.extend(glob.glob(os.path.join(data_dir, '*.csv')))
    return pd.concat(map(pd.read_csv, files), sort=False)

data_dir = local_data_folder + '/'
data_subdirs = map(lambda date: data_dir + date, dates)
unprocessed_df = load_csv_dirs(data_subdirs)
unprocessed_df.head(2)

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,AT0000A00XX9,P4N,POLYTEC HLDG AG INH. EO 1,Common stock,EUR,2504160,2017-08-01,14:00,15.18,15.18,15.18,15.18,90,1
1,CA0679011084,ABR,BARRICK GOLD CORP.,Common stock,EUR,2504196,2017-08-01,14:00,14.265,14.285,14.255,14.27,9832,27


In [12]:
unprocessed_df.info ()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2762277 entries, 0 to 5
Data columns (total 14 columns):
ISIN              object
Mnemonic          object
SecurityDesc      object
SecurityType      object
Currency          object
SecurityID        object
Date              object
Time              object
StartPrice        float64
MaxPrice          float64
MinPrice          float64
EndPrice          float64
TradedVolume      object
NumberOfTrades    object
dtypes: float64(4), object(10)
memory usage: 316.1+ MB


In [13]:
unprocessed_df.count()

ISIN              2762277
Mnemonic          2762277
SecurityDesc      2762277
SecurityType      2762277
Currency          2762277
SecurityID        2762277
Date              2762277
Time              2762277
StartPrice        2762277
MaxPrice          2762277
MinPrice          2762277
EndPrice          2762277
TradedVolume      2762277
NumberOfTrades    2762277
dtype: int64

In [14]:
# we want the dates to be comparable to datetime.strptime()
unprocessed_df["CalcTime"] = pd.to_datetime("1900-01-01 " + unprocessed_df["Time"])

unprocessed_df["CalcDateTime"] = pd.to_datetime(unprocessed_df["Date"] + " " + unprocessed_df["Time"])
unprocessed_df.head()

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades,CalcTime,CalcDateTime
0,AT0000A00XX9,P4N,POLYTEC HLDG AG INH. EO 1,Common stock,EUR,2504160,2017-08-01,14:00,15.18,15.18,15.18,15.18,90,1,1900-01-01 14:00:00,2017-08-01 14:00:00
1,CA0679011084,ABR,BARRICK GOLD CORP.,Common stock,EUR,2504196,2017-08-01,14:00,14.265,14.285,14.255,14.27,9832,27,1900-01-01 14:00:00,2017-08-01 14:00:00
2,CA32076V1031,FMV,FIRST MAJESTIC SILVER,Common stock,EUR,2504197,2017-08-01,14:00,7.031,7.037,7.031,7.037,484,5,1900-01-01 14:00:00,2017-08-01 14:00:00
3,CH0012005267,NOT,"NOVARTIS NAM. SF 0,50",Common stock,EUR,2504217,2017-08-01,14:00,72.38,72.38,72.38,72.38,206,2,1900-01-01 14:00:00,2017-08-01 14:00:00
4,LU0274211480,DBXD,DB X-TRACK.DAX ETF(DR)1C,ETF,EUR,2504269,2017-08-01,14:00,119.22,119.22,119.22,119.22,218,1,1900-01-01 14:00:00,2017-08-01 14:00:00


In [15]:
# Filter common stock
# Filter between trading hours 08:00 and 20:00
# Exclude auctions (those are with TradeVolume == 0)
only_common_stock = unprocessed_df[unprocessed_df.SecurityType == 'Common stock']
time_fmt = "%H:%M"
opening_hours_str = "08:00"
closing_hours_str = "20:00"
opening_hours = datetime.strptime(opening_hours_str, time_fmt)
closing_hours = datetime.strptime(closing_hours_str, time_fmt)

cleaned_common_stock = only_common_stock[(only_common_stock.TradedVolume > 0) & \
                  (only_common_stock.CalcTime >= opening_hours) & \
                  (only_common_stock.CalcTime <= closing_hours)]
cleaned_common_stock.head(2)

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades,CalcTime,CalcDateTime
0,AT0000A00XX9,P4N,POLYTEC HLDG AG INH. EO 1,Common stock,EUR,2504160,2017-08-01,14:00,15.18,15.18,15.18,15.18,90,1,1900-01-01 14:00:00,2017-08-01 14:00:00
1,CA0679011084,ABR,BARRICK GOLD CORP.,Common stock,EUR,2504196,2017-08-01,14:00,14.265,14.285,14.255,14.27,9832,27,1900-01-01 14:00:00,2017-08-01 14:00:00


In [16]:
bymnemonic = cleaned_common_stock[['Mnemonic', 'TradedVolume']].groupby(['Mnemonic']).sum()
number_of_stocks = 100
top = bymnemonic.sort_values(['TradedVolume'], ascending=[0]).head(number_of_stocks)
top.head(10)

Unnamed: 0_level_0,TradedVolume
Mnemonic,Unnamed: 1_level_1
DBK,350541777
EOAN,315840374
CBK,265862433
DTE,217701192
SNH,207238140
RWE,94663740
LHA,86873115
IFX,85577504
DAI,79710799
HDD,77903091


In [17]:
top_k_stocks = list(top.index.values)
cleaned_common_stock = cleaned_common_stock[cleaned_common_stock.Mnemonic.isin(top_k_stocks)]
cleaned_common_stock.head()

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades,CalcTime,CalcDateTime
1,CA0679011084,ABR,BARRICK GOLD CORP.,Common stock,EUR,2504196,2017-08-01,14:00,14.265,14.285,14.255,14.27,9832,27,1900-01-01 14:00:00,2017-08-01 14:00:00
6,DE000A0D6554,NDX1,NORDEX SE O.N.,Common stock,EUR,2504290,2017-08-01,14:00,11.43,11.43,11.43,11.43,459,3,1900-01-01 14:00:00,2017-08-01 14:00:00
8,DE000A0HN5C6,DWNI,DEUTSCHE WOHNEN AG INH,Common stock,EUR,2504314,2017-08-01,14:00,33.975,33.975,33.925,33.945,2119,22,1900-01-01 14:00:00,2017-08-01 14:00:00
9,DE000A0JL9W6,VBK,VERBIO VER.BIOENERGIE ON,Common stock,EUR,2504343,2017-08-01,14:00,9.532,9.532,9.514,9.514,557,2,1900-01-01 14:00:00,2017-08-01 14:00:00
12,DE000A0LD2U1,AOX,ALSTRIA OFFICE REIT-AG,Common stock,EUR,2504379,2017-08-01,14:00,12.36,12.37,12.355,12.36,7085,22,1900-01-01 14:00:00,2017-08-01 14:00:00


In [18]:
sorted_by_index = cleaned_common_stock.set_index(['Mnemonic', 'CalcDateTime']).sort_index()
sorted_by_index.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ISIN,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades,CalcTime
Mnemonic,CalcDateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1COV,2017-08-01 08:00:00,DE0006062144,COVESTRO AG O.N.,Common stock,EUR,2505008,2017-08-01,08:00,66.07,66.13,66.03,66.07,883,11,1900-01-01 08:00:00
1COV,2017-08-01 08:02:00,DE0006062144,COVESTRO AG O.N.,Common stock,EUR,2505008,2017-08-01,08:02,66.08,66.13,66.08,66.13,360,4,1900-01-01 08:02:00
1COV,2017-08-01 08:03:00,DE0006062144,COVESTRO AG O.N.,Common stock,EUR,2505008,2017-08-01,08:03,66.15,66.2,66.15,66.2,408,6,1900-01-01 08:03:00
1COV,2017-08-01 08:05:00,DE0006062144,COVESTRO AG O.N.,Common stock,EUR,2505008,2017-08-01,08:05,66.14,66.14,66.14,66.14,5145,11,1900-01-01 08:05:00
1COV,2017-08-01 08:06:00,DE0006062144,COVESTRO AG O.N.,Common stock,EUR,2505008,2017-08-01,08:06,66.13,66.17,66.13,66.17,975,6,1900-01-01 08:06:00


In [19]:
non_empty_days = sorted(list(cleaned_common_stock['Date'].unique()))
len(non_empty_days), non_empty_days[0:2], '...', non_empty_days[-3:-1]

(44, ['2017-08-01', '2017-08-02'], '...', ['2017-09-27', '2017-09-28'])

In [20]:
import datetime
def build_index(non_empty_days, from_time, to_time):
    date_ranges = []
    for date in non_empty_days:
        yyyy, mm, dd = date.split('-')
        from_hour, from_min = from_time.split(':')
        to_hour, to_min = to_time.split(':')    
        t1 = datetime.datetime(int(yyyy), int(mm), int(dd), int(from_hour),int(from_min),0)
        t2 = datetime.datetime(int(yyyy), int(mm), int(dd), int(to_hour),int(to_min),0) 
        date_ranges.append(pd.DataFrame({"OrganizedDateTime": pd.date_range(t1, t2, freq='1Min').values}))
    agg = pd.concat(date_ranges, axis=0) 
    agg.index = agg["OrganizedDateTime"]
    return agg
new_datetime_index = build_index(non_empty_days, opening_hours_str, closing_hours_str)["OrganizedDateTime"].values
new_datetime_index

array(['2017-08-01T08:00:00.000000000', '2017-08-01T08:01:00.000000000',
       '2017-08-01T08:02:00.000000000', ...,
       '2017-09-29T19:58:00.000000000', '2017-09-29T19:59:00.000000000',
       '2017-09-29T20:00:00.000000000'], dtype='datetime64[ns]')

In [21]:
def basic_stock_features(input_df, mnemonic, new_time_index):
    stock = sorted_by_index.loc[mnemonic].copy()

    stock['HasTrade'] = 1.0
    
    stock = stock.reindex(new_time_index)
    
    features = ['MinPrice', 'MaxPrice', 'EndPrice', 'StartPrice']
    for f in features:
        stock[f] = stock[f].fillna(method='ffill')   
    
    features = ['HasTrade', 'TradedVolume', 'NumberOfTrades']
    for f in features:
        stock[f] = stock[f].fillna(0.0)
    
    stock['Mnemonic'] = mnemonic
    selected_features = ['Mnemonic', 'MinPrice', 'MaxPrice', 'StartPrice', 'EndPrice', 'HasTrade', 'TradedVolume', 'NumberOfTrades']
    return stock[selected_features]

In [22]:
stocks = []
for stock in top_k_stocks:
    stock = basic_stock_features(sorted_by_index, stock, new_datetime_index)
    stocks.append(stock)
# prepared should contain the numeric features for all top k stocks,
# for all days in the interval, for which there were trades (that means excluding weekends and holidays)
# for all minutes from 08:00 until 20:00
# in minutes without trades the prices from the last available minute are carried forward
# trades are filled with zero for such minutes
# a new column called HasTrade is introduced to denote the presence of trades
prepared = pd.concat(stocks, axis=0)

**TODO** Convert timestamp to more meaningful derived features

**TODO** Create target variable

**TODO** Create both time series data and MLP data

**TODO** Integrate with plotly for histograms, correlation matrices, etc

In [23]:
# We save both in csv in pickle. Generally we'd read from the pickeled format because 
# it preserves the indices, but for cases where pkl cannot be read, we also output a csv format
! mkdir -p {output_folder}
prepared.to_csv(output_folder + '/cooked_v3.csv')

prepared.to_pickle(output_folder + '/cooked_v3.pkl')

In [24]:
!ls -lh {output_folder}

total 410M
-rw-r--r-- 1 825712516 1896053708 196M Dec  1 00:13 cooked_v3.csv
-rw-r--r-- 1 825712516 1896053708 200M Dec  1 00:13 cooked_v3.pkl
