# Data Preparation - Calculate movement rates for 20 tickers

This notebook prepares the data in the same way as the notebook "1.1.1 Data Preparation - Calculate movement rates - single ticker", but it processes 20 tickers instead of a single one.

The idea behind this is to generate more data from which the sklearn models can learn.

# Content

 1. Import dependencies
 2. Load data
 3. Define tickers
 4. Calculate movement rates and save dataframe

<hr>

# 1. Import dependencies

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series
from sklearn import preprocessing
from  pandas  import  DataFrame 
from  pandas  import  concat 

%matplotlib inline

# 2. Load data

#### Define a function that converts a series into a supervised-learning frame using DataFrame.shift

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a (multi-variate) sequence as lagged / lead columns.

    Every variable in ``data`` is shifted with ``DataFrame.shift`` to create
    ``n_in`` lag columns named ``var<k>(t-<i>)`` followed by ``n_out`` lead
    columns named ``var<k>(t)``, ``var<k>(t+1)``, ...

    Parameters
    ----------
    data : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        Observations ordered in time. One-dimensional input is treated as a
        single variable; two-dimensional input has one column per variable.
    n_in : int, default 1
        Number of lag steps (t-n_in ... t-1) to add as columns.
    n_out : int, default 1
        Number of lead steps (t ... t+n_out-1) to add as columns.
    dropnan : bool, default True
        If True, rows that contain any NaN (introduced by shifting or already
        present in ``data``) are removed.

    Returns
    -------
    pandas.DataFrame
        Frame with ``(n_in + n_out) * n_vars`` columns that keeps the index
        labels of the rows that survived.
    """
    df = DataFrame(data)
    # Take the variable count from the constructed frame instead of
    # ``data.shape[1]``: this also works for 1-D arrays / Series (which have
    # no second axis) and for a list of rows (more than one variable).
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

# 3. Define tickers

The following tickers were selected at random.

In [4]:
# Ticker symbols to process; each one is used to build the path
# "data/stocks/<ticker>.csv" and to pick the label column of the same name.
tickers = [
    'AAPL', 'IBM', 'ASNA', 'CAT', 'EXPE',
    'GME', 'IMO', 'MCO', 'PLD', 'GT',
    'XYL', 'WTR', 'WFT', 'UNH', 'GPS',
    'TMO', 'JNJ', 'AMZN', 'HOG', 'CBB',
]

# 4. Calculate movement rates and save dataframe

In [5]:
# Forecast horizons (in rows of the price file) for which a movement rate
# value(t+h) / value(t) - 1 is computed.
horizons = list(range(10, 100, 10))
volume_columns = ["Entwicklungsrate Volume t+%d" % h for h in horizons]
price_columns = ["Entwicklungsrate Preis t+%d" % h for h in horizons]
result_columns = volume_columns + price_columns + ['Y']


def _movement_rate_frame(df_ticker, source_column, rate_columns, horizons):
    """Return a frame with 'Date' plus one movement-rate column per horizon.

    The rates are computed from ``df_ticker[source_column]``. Rows for which
    the look-ahead window is incomplete are dropped by ``series_to_supervised``
    and therefore end up as NaN after index alignment.
    """
    # One (t), (t+1), ... (t+max horizon) column per step, i.e. 91 columns
    # for the horizons 10..90.
    reframed = series_to_supervised(df_ticker[[source_column]].values, 0, max(horizons) + 1)
    base = reframed['var1(t)']
    rates = df_ticker[['Date']].copy()
    for horizon, column in zip(horizons, rate_columns):
        # Same arithmetic form as before (1 / base * future) so that the
        # floating point results stay bit-identical.
        rates[column] = (1 / base * reframed['var1(t+%d)' % horizon]) - 1
    return rates


# The label file is the same for every ticker, so it is read only once.
df_train_labels_all = pd.read_csv('data/labels_train.csv', header=0, index_col=0)

# Collect the per-ticker frames and concatenate them once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
ticker_frames = []
total_rows = 0
for ticker in tickers:
    print(ticker)

    df_ticker = pd.read_csv("data/stocks/" + ticker + ".csv")
    # NOTE(review): this renames the columns purely by position - confirm that
    # the files really store Close before Low and High.
    df_ticker.columns = ['Date','Open','Close','Low','High','Volume']
    df_ticker['Mid_prices'] = (df_ticker.Low + df_ticker.High) / 2

    df_for_volumes = _movement_rate_frame(df_ticker, 'Volume', volume_columns, horizons)
    df_for_midprices = _movement_rate_frame(df_ticker, 'Mid_prices', price_columns, horizons)
    merged_df = pd.merge(df_for_volumes, df_for_midprices, on='Date')

    # Label column of the current ticker, renamed to 'Y'. A ticker without a
    # label column raises a KeyError here.
    df_train_label = df_train_labels_all[[ticker]].rename(columns={ticker: 'Y'})

    # The merge key 'Date' must be resolvable in the label frame as well
    # (presumably as the name of its index - verify against labels_train.csv).
    df_complete = pd.merge(merged_df, df_train_label[['Y']], on='Date')
    df_complete = df_complete.sort_values('Date')
    df_complete = df_complete.drop('Date', axis=1)
    print(df_complete.shape)

    ticker_frames.append(df_complete)
    total_rows += len(df_complete)
    # Running size of the combined training set.
    print((total_rows, len(result_columns)))

# Keep the original per-ticker index labels (no ignore_index), exactly as the
# former append-based implementation did.
if ticker_frames:
    new_xx_df = pd.concat(ticker_frames)
else:
    new_xx_df = pd.DataFrame(columns=result_columns)

# Save dataframe as csv file
new_xx_df.to_csv('Prepared data/Training_set-sklearn_20 Ticker.csv', encoding='utf-8')

print("Abgeschlossen") 


AAPL
(2518, 19)
(2518, 19)
IBM
(2518, 19)
(5036, 19)
ASNA
(2518, 19)
(7554, 19)
CAT
(2518, 19)
(10072, 19)
EXPE
(2518, 19)
(12590, 19)
GME
(2518, 19)
(15108, 19)
IMO
(2518, 19)
(17626, 19)
MCO
(2518, 19)
(20144, 19)
PLD
(2518, 19)
(22662, 19)
GT
(2518, 19)
(25180, 19)
XYL
(1564, 19)
(26744, 19)
WTR
(2518, 19)
(29262, 19)
WFT
(2518, 19)
(31780, 19)
UNH
(2518, 19)
(34298, 19)
GPS
(2518, 19)
(36816, 19)
TMO
(2518, 19)
(39334, 19)
JNJ
(2518, 19)
(41852, 19)
AMZN
(2518, 19)
(44370, 19)
HOG
(2518, 19)
(46888, 19)
CBB
(2518, 19)
(49406, 19)
Abgeschlossen
