# Data preparation with inclusion of external data per day

As already described in the Data and Business Understanding phase, external data was collected for the ticker AAPL. This external data was added in two variants: once per quarter, and once with the quarterly values allocated to every day of the respective quarter. This notebook prepares the file that contains the per-day data, so that it can be used for machine learning methods.

# Content

 1. Import dependencies
 2. Helpers
 3. Load file with action recommendations
 4. Load ticker data enriched with external data
 5. Normalization of ticker data
 6. Bringing together ticker data and action recommendations

<hr>

# 1. Import dependencies

In [1]:
import numpy as np
import pandas as pd
import datetime

# 2. Helpers

In [2]:
def convertToUsefulDatatypes(df):
    """Coerce the columns of ``df`` to consistent dtypes.

    Mixed dtypes cause problems in later steps, so the column named
    "Date" is parsed with ``pd.to_datetime`` and every other column with
    ``pd.to_numeric``.

    The frame is modified in place; the same object is returned.
    """
    for name in df.columns:
        parser = pd.to_datetime if name == "Date" else pd.to_numeric
        df[name] = parser(df[name])
    return df

# 3. Load file with action recommendations

In [3]:
# Load the action recommendations; the displayed frame has a 'Date' column
# plus one label column per ticker.
# NOTE(review): decimal=',' coincides with sep=','; the label values shown
# are integers, so it presumably has no effect here — confirm.
AAPL_labels = pd.read_csv('data/labels_train.csv', sep=',', decimal=',')
# Bind the converted frame to the name every later cell reads (`AAPL_labels`).
AAPL_labels = convertToUsefulDatatypes(AAPL_labels)

In [4]:
AAPL_labels.shape

(2518, 836)

In [5]:
AAPL_labels.head()

Unnamed: 0,Date,A,AAN,AAP,AAPL,AAXN,ABC,ABCD,ABG,ABM,...,XEL,XOM,XPO,XRAY,XRX,XSPY,XYL,Y,YELP,ZBH
0,2008-01-02,0,1,0,-1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2008-01-03,0,1,0,-1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2008-01-04,0,1,0,0,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2008-01-07,0,1,0,0,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2008-01-08,0,1,0,0,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
AAPL_labels.tail()

Unnamed: 0,Date,A,AAN,AAP,AAPL,AAXN,ABC,ABCD,ABG,ABM,...,XEL,XOM,XPO,XRAY,XRX,XSPY,XYL,Y,YELP,ZBH
2513,2017-12-22,0,0,0,0,1,0,1,0,0,...,0,0,0,-1,0,1,0,0,0,0
2514,2017-12-26,0,0,0,0,1,0,1,0,0,...,0,0,0,-1,0,1,0,0,0,0
2515,2017-12-27,0,0,1,0,1,0,1,0,0,...,0,0,0,-1,0,0,0,0,0,0
2516,2017-12-28,0,1,1,0,1,0,1,0,0,...,0,0,0,-1,0,0,0,0,0,0
2517,2017-12-29,0,1,1,0,1,0,1,0,0,...,0,0,0,-1,0,-1,0,0,0,0


# 4. Load ticker data enriched with external data

In [7]:
# Read the per-day AAPL file (semicolon-separated, '.' as decimal mark) and
# coerce the column dtypes in the same step.
# NOTE(review): in the displayed rows, aaplopen/aaplclose differ by factors of
# ten between adjacent days (e.g. ~2.25e15 vs ~2.24e16) — looks like a
# number-format problem in the source file; verify before training on it.
AAPL_enriched = convertToUsefulDatatypes(
    pd.read_csv('prepared data/AAPL_extended_per_day.csv', sep=';', decimal='.')
)

In [8]:
AAPL_enriched.head()

Unnamed: 0,Date,aaplopen,aaplclose,aaplvolume,ebit,revenues,net profit
0,2009-07-22,2254140000000000.0,2.23914e+16,218526000.0,11550.0,43605.0,8124.0
1,2009-07-23,2.23757e+16,2.25457e+16,131740700.0,11550.0,43605.0,8124.0
2,2009-07-24,2.24214e+16,2.28557e+16,109590600.0,11550.0,43605.0,8124.0
3,2009-07-27,2.28814e+16,2.28714e+16,108327800.0,11550.0,43605.0,8124.0
4,2009-07-28,226971000000000.0,2285710000000000.0,90888700.0,11550.0,43605.0,8124.0


In [9]:
AAPL_enriched.tail()

Unnamed: 0,Date,aaplopen,aaplclose,aaplvolume,ebit,revenues,net profit
1386,2015-01-23,1.123e+16,1129800000000000.0,46464800.0,52503.0,182795.0,39510.0
1387,2015-01-26,1.1374e+16,1131000000000000.0,55615000.0,52503.0,182795.0,39510.0
1388,2015-01-27,1.1242e+16,1.0914e+16,95568700.0,52503.0,182795.0,39510.0
1389,2015-01-28,1.1763e+16,1.1531e+16,146477100.0,52503.0,182795.0,39510.0
1390,2015-01-29,1.1632e+16,1189000000000000.0,84436400.0,52503.0,182795.0,39510.0


In [10]:
AAPL_enriched.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1391 entries, 0 to 1390
Data columns (total 7 columns):
Date          1391 non-null datetime64[ns]
aaplopen      1391 non-null float64
aaplclose     1391 non-null float64
aaplvolume    1391 non-null float64
ebit          1391 non-null float64
revenues      1391 non-null float64
net profit    1391 non-null float64
dtypes: datetime64[ns](1), float64(6)
memory usage: 76.1 KB


# 5. Normalization of ticker data

### In order to avoid inconsistencies between columns of very different magnitude, the data is normalized in the next step: every column is scaled linearly to the range [-1, 1]. Date fields are excluded from normalization. Normalization improves the result of training and testing by approx. 10 %.

In [11]:
def normalize(x, col_min, col_max):
    """Map ``x`` linearly so that ``col_min`` -> -1 and ``col_max`` -> +1.

    Computes ``(2*x - col_max - col_min) / (col_max - col_min)``.
    No special handling is done for ``col_max == col_min``.
    """
    span = col_max - col_min
    centered = 2.0 * x - col_max - col_min
    return centered / span

In [12]:
def normalizeList(col, col_min=None, col_max=None):
    """Normalize every element of ``col`` with ``normalize``.

    Parameters
    ----------
    col : iterable of numbers
        Values to scale.
    col_min, col_max : number, optional
        Bounds mapped to -1 and +1. Each bound that is ``None`` is taken
        from ``col`` itself.

    Returns
    -------
    list
        The scaled values in input order; ``[]`` for empty input.
    """
    # Materialise once: min()/max() below would otherwise exhaust a one-shot
    # iterable before the scaling pass.
    values = list(col)
    if not values:
        # min()/max() raise ValueError on empty input; there is nothing to scale.
        return []

    # Identity test instead of `== None`, which dispatches to __eq__ and
    # misbehaves for array-like bounds.
    if col_min is None:
        col_min = min(values)
    if col_max is None:
        col_max = max(values)

    return [normalize(x, col_min, col_max) for x in values]

In [13]:
def normalizeDataframe(df, min_max_df=None):
    """Return a copy of ``df`` with every non-'Date' column scaled to [-1, 1].

    Each column is transformed with
    ``(2*x - col_max - col_min) / (col_max - col_min)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to normalize. The column named 'Date' is passed through
        unchanged. ``df`` itself is not modified.
    min_max_df : pandas.DataFrame, optional
        Bounds to use instead of the ones found in ``df``: one column per
        data column, first row = minimum, second row = maximum. If omitted,
        the bounds are computed from ``df``.

    Returns
    -------
    pandas.DataFrame
        The normalized copy. A column whose bounds are equal (constant
        column) is set to 0.0 rather than producing NaN from 0/0.
    """
    value_columns = [column for column in df.columns if column != 'Date']

    # `min_max_df == None` compares element-wise when a DataFrame is passed and
    # makes the `if` raise ValueError; an identity check is required.
    if min_max_df is None:
        min_max_df = pd.DataFrame(
            {column: [df[column].min(), df[column].max()] for column in value_columns},
            index=['min', 'max'],
        )

    # Work on a copy so the caller's frame keeps its raw values.
    normalized = df.copy()
    for column in value_columns:
        # Positional access: row 0 holds the minimum, row 1 the maximum,
        # independent of how the rows are labelled.
        col_min = min_max_df[column].iloc[0]
        col_max = min_max_df[column].iloc[1]
        span = col_max - col_min
        if span == 0:
            # Constant column: avoid dividing by zero.
            normalized[column] = 0.0
        else:
            normalized[column] = (2.0 * df[column] - col_max - col_min) / span

    return normalized

In [14]:
# Normalization of AAPL_enriched; show the first rows as a quick check
AAPL_enriched_norm = normalizeDataframe(AAPL_enriched)
AAPL_enriched_norm.head()

Unnamed: 0,Date,aaplopen,aaplclose,aaplvolume,ebit,revenues,net profit
0,2009-07-22,-0.929552,-0.299941,-0.104608,-1.0,-1.0,-1.0
1,2009-07-23,-0.300697,-0.295116,-0.485437,-1.0,-1.0,-1.0
2,2009-07-24,-0.299269,-0.285424,-0.582636,-1.0,-1.0,-1.0
3,2009-07-27,-0.284893,-0.284934,-0.588177,-1.0,-1.0,-1.0
4,2009-07-28,-0.992907,-0.928538,-0.664703,-1.0,-1.0,-1.0


# 6. Bringing together ticker data and action recommendations

In [15]:
# Join the normalized ticker data with the labels on their shared 'Date'
# (inner join: only days present in both frames remain). The date only serves
# as the join key here and is omitted from the output; of the label columns
# only 'AAPL' is kept.
merged = AAPL_enriched_norm.merge(AAPL_labels, how='inner', on='Date')
merged = merged.loc[
    :,
    ['aaplopen', 'aaplclose', 'aaplvolume', 'ebit', 'revenues', 'net profit', 'AAPL'],
]

In [16]:
merged.head()

Unnamed: 0,aaplopen,aaplclose,aaplvolume,ebit,revenues,net profit,AAPL
0,-0.929552,-0.299941,-0.104608,-1.0,-1.0,-1.0,1
1,-0.300697,-0.295116,-0.485437,-1.0,-1.0,-1.0,1
2,-0.299269,-0.285424,-0.582636,-1.0,-1.0,-1.0,1
3,-0.284893,-0.284934,-0.588177,-1.0,-1.0,-1.0,1
4,-0.992907,-0.928538,-0.664703,-1.0,-1.0,-1.0,1


In [17]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1232 entries, 0 to 1231
Data columns (total 7 columns):
aaplopen      1232 non-null float64
aaplclose     1232 non-null float64
aaplvolume    1232 non-null float64
ebit          1232 non-null float64
revenues      1232 non-null float64
net profit    1232 non-null float64
AAPL          1232 non-null int64
dtypes: float64(6), int64(1)
memory usage: 77.0 KB


In [18]:
# Result is stored for training and testing.
# to_csv returns None when it is given a path, so its result must not be
# assigned back to `merged` — that would discard the frame and make this cell
# fail when it is run a second time.
merged.to_csv('prepared data/Data_Preperation_one_ticker_with_inclusion_of_external_data_per_day.csv', index=False)