# Feature engineering

## Obtain the data

In [None]:
import pandas as pd
import numpy as np
import glob, os
from datetime import datetime
import statsmodels.api as sm

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import boto3

In [None]:
def read_s3_csv (dates):
    """Download every CSV object under the given key prefixes and stack them.

    Parameters
    ----------
    dates : iterable of str
        Key prefixes to list in the ``deutsche-boerse-xetra-pds`` bucket. The
        notebook passes one ``YYYY-MM-DD`` string per calendar day.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSV objects found, in listing order.
        The row index of each individual file is kept as-is.

    Raises
    ------
    ValueError
        If no object matches any of the prefixes.
    """
    s3 = boto3.resource('s3')
    deutsche_boerse_bucket = 'deutsche-boerse-xetra-pds'

    bucket = s3.Bucket(deutsche_boerse_bucket)

    dataframes = []

    for date in dates:
        for csv_obj in bucket.objects.filter(Prefix=date):
            # The object body is handed to pandas directly, without a temp file.
            dataframes.append(pd.read_csv(csv_obj.get()['Body']))

    if not dataframes:
        # pd.concat([]) would also raise ValueError, but without saying why.
        raise ValueError(
            "No CSV objects found in bucket '{}' for prefixes: {}".format(
                deutsche_boerse_bucket, list(dates)))
    return pd.concat(dataframes)

In [None]:
# Earliest possible date is 2017-06-17
from_date = '2017-07-01'
until_date = '2017-07-31'

# One 'YYYY-MM-DD' key prefix per calendar day between the two dates (inclusive).
dates = [
    day.strftime('%Y-%m-%d')
    for day in pd.date_range(start=from_date, end=until_date, freq='D')
]

unprocessed_df = read_s3_csv(dates)

## Cleanse the data

In [None]:
# use bigger graphs
mpl.rcParams.update({'figure.figsize': (15, 10)})

In [None]:
# Peek at the first two raw rows to see which columns the feed provides.
unprocessed_df.head(2)

In [None]:
# (rows, columns) of the raw, unfiltered data.
unprocessed_df.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the raw frame.
unprocessed_df.info ()

In [None]:
# Number of non-null values in each column.
unprocessed_df.count()

In [None]:
# Number of raw rows per Mnemonic value.
unprocessed_df.Mnemonic.value_counts()

In [None]:
# we want the dates to be comparable to datetime.strptime()
# CalcTime pins every row to the dummy date 1900-01-01, which is the date
# datetime.strptime() fills in when the format string has no date part, so the
# column can be compared against parsed "%H:%M" values as a pure time of day.
# NOTE: both columns are added to unprocessed_df in place.
unprocessed_df["CalcTime"] = pd.to_datetime("1900-01-01 " + unprocessed_df["Time"])
# CalcDateTime is the full timestamp built from the Date and Time columns.
unprocessed_df["CalcDateTime"] = pd.to_datetime(unprocessed_df["Date"] + " " + unprocessed_df["Time"])
unprocessed_df.head()

In [None]:
# Keep only rows that
#   * describe common stock,
#   * fall inside the trading window 08:00-20:00 (both ends included),
#   * are not auctions (auction rows have TradedVolume == 0).
time_fmt = "%H:%M"
opening_hours_str = "08:00"
closing_hours_str = "20:00"
# Parsed without a date part, so they carry the same dummy date as CalcTime.
opening_hours = datetime.strptime(opening_hours_str, time_fmt)
closing_hours = datetime.strptime(closing_hours_str, time_fmt)

only_common_stock = unprocessed_df[unprocessed_df['SecurityType'] == 'Common stock']

cleaned_common_stock = only_common_stock[
    (only_common_stock['TradedVolume'] > 0)
    & (only_common_stock['CalcTime'] >= opening_hours)
    & (only_common_stock['CalcTime'] <= closing_hours)
]
cleaned_common_stock.head(2)

In [None]:
# Total traded volume per mnemonic, largest first; keep the busiest `number_of_stocks`.
number_of_stocks = 100
bymnemonic = cleaned_common_stock.groupby('Mnemonic')[['TradedVolume']].sum()
top = bymnemonic.sort_values(by='TradedVolume', ascending=False).head(number_of_stocks)
top.head(10)

In [None]:
# Restrict the cleaned data to the mnemonics selected by traded volume.
top_k_stocks = [mnemonic for mnemonic in top.index.values]
cleaned_common_stock = cleaned_common_stock[
    cleaned_common_stock['Mnemonic'].isin(top_k_stocks)
]
cleaned_common_stock.head()

In [None]:
# Index by (Mnemonic, CalcDateTime) and sort, so that the rows of one stock can
# be pulled out with .loc[mnemonic] in timestamp order.
sorted_by_index = cleaned_common_stock.set_index(keys=['Mnemonic', 'CalcDateTime'])
sorted_by_index = sorted_by_index.sort_index()
sorted_by_index.head()

In [None]:
# Days that still have rows after cleaning, in ascending order.
non_empty_days = sorted(cleaned_common_stock['Date'].unique())
# Show the count plus the first two and the last two days
# (the previous slice [-3:-1] left out the final day).
len(non_empty_days), non_empty_days[0:2], '...', non_empty_days[-2:]

In [None]:
# Upper bound on rows per stock: one row per minute of the trading window
# (08:00..20:00, both ends included, i.e. 12*60 + 1 minutes) on every day that
# has data. The day count is derived from the data instead of being hard-coded.
print ("Ideal data count for any stock: {}".format (len(non_empty_days)*(12*60 + 1)))
print ("Observation count per mnemonic:")
cleaned_common_stock.Mnemonic.value_counts()

In [None]:
def build_index(non_empty_days, from_time, to_time):
    """Build a minute-by-minute time grid covering the given days.

    Parameters
    ----------
    non_empty_days : iterable of str
        Days in ``YYYY-MM-DD`` form.
    from_time, to_time : str
        First and last minute of each day in ``HH:MM`` form; both are included.

    Returns
    -------
    pandas.DataFrame
        One row per minute with a single column ``OrganizedDateTime``; the
        same values are also used as the frame's index.

    Notes
    -----
    Uses the ``datetime`` class imported at the top of the notebook. This cell
    deliberately does not run ``import datetime``: that would rebind the name
    to the module and break earlier cells that call ``datetime.strptime`` when
    they are re-run.
    """
    stamp_fmt = "%Y-%m-%d %H:%M"
    date_ranges = []
    for date in non_empty_days:
        t1 = datetime.strptime(date + " " + from_time, stamp_fmt)
        t2 = datetime.strptime(date + " " + to_time, stamp_fmt)
        date_ranges.append(pd.DataFrame({"OrganizedDateTime": pd.date_range(t1, t2, freq='1min').values}))
    agg = pd.concat(date_ranges, axis=0)
    agg.index = agg["OrganizedDateTime"]
    return agg
# Flat array holding every minute of the trading window on every non-empty day.
new_datetime_index = build_index(
    non_empty_days,
    opening_hours_str,
    closing_hours_str,
)["OrganizedDateTime"].values
new_datetime_index

In [None]:
def basic_stock_features(input_df, mnemonic, new_time_index):
    """Build the per-minute feature table for a single stock.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame whose first index level is the mnemonic and whose remaining
        index is the timestamp (the notebook passes ``sorted_by_index``).
    mnemonic : str
        Stock to extract from ``input_df``.
    new_time_index : array-like of datetime64
        Target time grid; the stock's rows are reindexed onto it.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``new_time_index``. Prices are forward-filled, volume and
        trade count are 0 where no row existed, calendar features are derived
        from the index, and a constant ``Mnemonic`` column is added. The
        descriptive columns listed in ``unwanted_features`` are dropped.
        Grid points before the stock's first row keep NaN prices, because
        there is nothing to carry forward.
    """
    # Read from the frame that was passed in; this used to read the notebook
    # global `sorted_by_index` and silently ignored the argument.
    stock = input_df.loc[mnemonic].copy()

    stock = stock.reindex(new_time_index)

    # Minutes without a row inherit the prices of the last minute that had one.
    price_columns = ['MinPrice', 'MaxPrice', 'EndPrice', 'StartPrice']
    stock[price_columns] = stock[price_columns].ffill()

    # No row on the grid means nothing was traded in that minute.
    activity_columns = ['TradedVolume', 'NumberOfTrades']
    stock[activity_columns] = stock[activity_columns].fillna(0.0)

    timestamps = stock.index
    stock['HourOfDay'] = timestamps.hour
    stock['MinOfHour'] = timestamps.minute
    stock['MinOfDay'] = timestamps.hour*60 + timestamps.minute

    stock['DayOfWeek'] = timestamps.dayofweek
    stock['DayOfYear'] = timestamps.dayofyear
    stock['MonthOfYear'] = timestamps.month
    # DatetimeIndex.weekofyear no longer exists in current pandas; isocalendar()
    # yields the same ISO week number. Converted to a plain int array so the
    # assignment is positional.
    stock['WeekOfYear'] = timestamps.isocalendar().week.to_numpy(dtype='int64')

    stock['Mnemonic'] = mnemonic
    unwanted_features = ['ISIN', 'SecurityDesc', 'SecurityType', 'Currency', 'SecurityID', 'Date', 'Time', 'CalcTime']
    return stock.drop (unwanted_features, axis=1)

In [None]:
# Scratch copy of the rows stored under the 'DAI' mnemonic, for a quick spot check.
s = sorted_by_index.loc['DAI'].copy ()

In [None]:
# Spot check: day-of-year values derived from the timestamps in `s`.
s.index.dayofyear

In [None]:
# prepared should contain the numeric features for all top k stocks,
# for all days in the interval, for which there were trades (that means excluding weekends and holidays)
# for all minutes from 08:00 until 20:00
# in minutes without trades the prices from the last available minute are carried forward
# trades are filled with zero for such minutes
# NOTE(review): this comment used to announce a HasTrade column, but
# basic_stock_features does not create one. Minutes without trades can be
# recognised by NumberOfTrades == 0.
stocks = [
    # A separate loop name keeps the mnemonic string and the resulting frame
    # apart (the loop used to rebind `stock` from one to the other).
    basic_stock_features(sorted_by_index, mnemonic, new_datetime_index)
    for mnemonic in top_k_stocks
]
prepared = pd.concat(stocks, axis=0)

In [None]:
# Store the mnemonic as a categorical column, then list the row count per mnemonic.
prepared['Mnemonic'] = prepared['Mnemonic'].astype('category')
prepared['Mnemonic'].value_counts()

**TODO** Convert timestamp to more meaningful derived features

**TODO** Integrate with plotly for histograms, correlation matrices, etc

In [None]:
# Last rows stored under 'DAI' before reindexing onto the minute grid.
sorted_by_index.loc['DAI'].tail ()

In [None]:
# Last rows of the combined feature table.
prepared.tail()

In [None]:
# First rows of the combined feature table.
prepared.head()

In [None]:
# We save both as csv and as pickle. Generally we'd read from the pickled format because
# it preserves the indices, but for cases where the pkl cannot be read, we also output a csv format
output_folder = 'data/processed' # do not end in /
# Create the folder from Python rather than via `! mkdir -p`, so the cell does
# not depend on that shell command being available. exist_ok=True keeps re-runs
# from failing when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

prepared.to_csv(output_folder + '/cooked_v3.csv')

prepared.to_pickle(output_folder + '/cooked_v3.pkl')

In [None]:
# List the contents of the output folder with human-readable sizes.
!ls -lh {output_folder}