# Data preparation of internal data without time courses

This notebook prepares the data for training and testing. For this purpose the file with the recommendations for action is merged with the ticker data. In order to avoid long loading times, only one ticker is used in the first iteration. In the following the ticker AAPL is used.

# Content
1. Import dependencies
2. Helpers
3. Load file with action recommendations
4. Load ticker data using AAPL as an example
5. Normalization of ticker data
6. Bringing together ticker data and action recommendations

<hr>

# 1. Import dependencies

In [2]:
import numpy as np
import pandas as pd
import datetime

# 2. Helpers

In [3]:
# Columns may arrive with unsuitable dtypes after loading; this helper coerces them to usable ones.
def convertToUsefulDatatypes(df):
    """Coerce every column of ``df`` to a usable dtype.

    The ``"Date"`` column is parsed with ``pd.to_datetime``; every other
    column is passed through ``pd.to_numeric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in df.columns:
        # Pick the converter once per column instead of branching on the assignment.
        converter = pd.to_datetime if name == "Date" else pd.to_numeric
        df[name] = converter(df[name])
    return df

# 3. Load file with action recommendations

In [4]:
# Load the action recommendations (labels). The file is comma-separated, so the
# former `decimal=','` argument used the same character as the field separator
# and has been dropped.
train = pd.read_csv('data/labels_train.csv', sep=',')
# Parse the Date column as datetime and make sure all other columns are numeric.
train = convertToUsefulDatatypes(train)

In [5]:
# Sanity check: (number of rows, number of columns) of the label table.
train.shape

(2518, 836)

In [6]:
# Preview the first rows of the label table (a Date column followed by one column per ticker).
train.head()

Unnamed: 0,Date,A,AAN,AAP,AAPL,AAXN,ABC,ABCD,ABG,ABM,...,XEL,XOM,XPO,XRAY,XRX,XSPY,XYL,Y,YELP,ZBH
0,2008-01-02,0,1,0,-1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2008-01-03,0,1,0,-1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2008-01-04,0,1,0,0,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2008-01-07,0,1,0,0,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2008-01-08,0,1,0,0,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 4. Load ticker data using AAPL as an example

In [7]:
# Load the AAPL ticker data. The file is comma-separated and the prices use '.'
# as the decimal mark, so the former `decimal=','` argument (which clashed with
# the separator and prevented read_csv from recognising the floats) is dropped.
aapl = pd.read_csv('data/stocks/AAPL.csv', sep=',')
# Parse the Date column as datetime and make sure all other columns are numeric.
aapl = convertToUsefulDatatypes(aapl)

In [8]:
# Sanity check: (number of rows, number of columns) of the ticker table.
aapl.shape

(2736, 6)

In [9]:
# Preview the first rows of the ticker data (open, close, low, high and volume per date).
aapl.head()

Unnamed: 0,Date,aaplopen,aaplclose,aapllow,aaplhigh,aaplvolume
0,2008-01-02,28.467142,27.834286,27.507143,28.608572,269794700.0
1,2008-01-03,27.915714,27.847143,27.527143,28.198572,210516600.0
2,2008-01-04,27.35,25.721428,25.555714,27.571428,363958000.0
3,2008-01-07,25.892857,25.377142,24.318571,26.228571,518048300.0
4,2008-01-08,25.734285,24.464285,24.4,26.065714,380954000.0


# 5. Normalization of ticker data

In order to avoid inconsistencies between differently scaled columns, the ticker data is normalized: every numeric column is rescaled linearly to the range [-1, 1]. Date fields are excluded from normalization. Normalization improves the results of training and testing by approx. 10 %.

In [10]:
def normalize(x, col_min, col_max):
    """Linearly rescale ``x`` from ``[col_min, col_max]`` to ``[-1, 1]``.

    ``col_min`` maps to -1, ``col_max`` maps to 1 and the midpoint maps to 0.

    Parameters
    ----------
    x : number
        Value to rescale.
    col_min, col_max : number
        Lower and upper bound of the source range.

    Returns
    -------
    float
        The rescaled value. If ``col_min == col_max`` the range is degenerate
        and ``0.0`` (the centre of the target range) is returned instead of
        dividing by zero.
    """
    span = col_max - col_min
    if span == 0:
        # Constant column: every value sits at the (single-point) midpoint.
        return 0.0
    return (2.0 * x - col_max - col_min) / span

In [11]:
def normalizeList(col, col_min=None, col_max=None):
    """Rescale every value of ``col`` to ``[-1, 1]`` using ``normalize``.

    Parameters
    ----------
    col : iterable of numbers
        Values to rescale.
    col_min, col_max : number, optional
        Bounds of the source range. When omitted, the minimum / maximum of
        ``col`` is used.

    Returns
    -------
    list
        The rescaled values in the original order. An empty input yields an
        empty list.
    """
    values = list(col)
    if not values:
        # min()/max() would raise on an empty sequence; there is nothing to scale.
        return []

    # `is None` (identity) rather than `== None`, so array-like bounds are not
    # compared element-wise.
    if col_min is None:
        col_min = min(values)
    if col_max is None:
        col_max = max(values)

    return [normalize(x, col_min, col_max) for x in values]

In [12]:
def normalizeDataframe(df, min_max_df=None):
    """Rescale every column of ``df`` except ``'Date'`` to ``[-1, 1]``.

    Uses the same linear mapping as ``normalize``:
    ``(2 * x - max - min) / (max - min)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalize. Its columns are overwritten in place.
    min_max_df : pandas.DataFrame, optional
        Frame whose first row holds the per-column minimum and whose second
        row holds the per-column maximum (one column per column of ``df``
        other than ``'Date'``). When omitted, the bounds are computed from
        ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now normalized.

    Raises
    ------
    KeyError
        If ``min_max_df`` is given but lacks a column that ``df`` has.
    """
    value_columns = [name for name in df.columns if name != 'Date']

    # `is None`, not `== None`: comparing a DataFrame with `==` is element-wise
    # and its truth value is ambiguous, so passing a frame used to raise.
    if min_max_df is None:
        min_max_df = pd.DataFrame(
            {name: [df[name].min(), df[name].max()] for name in value_columns},
            index=['min', 'max'],
        )

    for name in value_columns:
        # Positional access: row 0 is the minimum, row 1 the maximum,
        # regardless of how the bounds frame is indexed.
        col_min = min_max_df[name].iloc[0]
        col_max = min_max_df[name].iloc[1]
        span = col_max - col_min
        if span == 0:
            # Constant column: map to the centre of [-1, 1] instead of dividing by zero.
            df[name] = 0.0
        else:
            # Vectorized over the whole column instead of a per-element Python loop.
            df[name] = (2.0 * df[name] - col_max - col_min) / span

    return df

In [13]:
# normalizeDataframe overwrites the columns of the frame it receives, so a copy
# is passed in: `aapl` keeps the raw prices and `aapl_norm` holds the scaled ones.
aapl_norm = normalizeDataframe(aapl.copy())
aapl_norm.head()

Unnamed: 0,Date,aaplopen,aaplclose,aapllow,aaplhigh,aaplvolume
0,2008-01-02,-0.843913,-0.849136,-0.850548,-0.847632,-0.378867
1,2008-01-03,-0.848939,-0.849019,-0.850365,-0.851329,-0.521402
2,2008-01-04,-0.854095,-0.868265,-0.868401,-0.856985,-0.152449
3,2008-01-07,-0.867376,-0.871382,-0.87972,-0.869097,0.218064
4,2008-01-08,-0.868821,-0.879647,-0.878975,-0.870565,-0.111582


# 6. Bringing together ticker data and action recommendations

In [14]:
# Labels and ticker features are joined on the date (merge defaults to an inner
# join, so only dates present in both tables are kept). Only the AAPL label is
# needed, so it is selected before merging instead of carrying every ticker
# column through the join.
merged = train[['Date', 'AAPL']].merge(aapl_norm, on='Date')
# The date only serves as the join key and is not a feature, so it is left out of the output.
merged = merged[['AAPL', 'aaplopen', 'aaplclose', 'aapllow', 'aaplhigh', 'aaplvolume']]

In [15]:
# Preview the merged table: the AAPL label followed by the normalized ticker features.
merged.head()

Unnamed: 0,AAPL,aaplopen,aaplclose,aapllow,aaplhigh,aaplvolume
0,-1,-0.843913,-0.849136,-0.850548,-0.847632,-0.378867
1,-1,-0.848939,-0.849019,-0.850365,-0.851329,-0.521402
2,0,-0.854095,-0.868265,-0.868401,-0.856985,-0.152449
3,0,-0.867376,-0.871382,-0.87972,-0.869097,0.218064
4,0,-0.868821,-0.879647,-0.878975,-0.870565,-0.111582


In [16]:
# Result is stored for training and testing.
# to_csv returns None when a path is given, so its result must not be assigned
# back to `merged` (that would replace the DataFrame with None and break a re-run).
merged.to_csv('prepared data/Data_Preperation_one_ticker_internal.csv', index=False)