# E **T** L

### Version 1.0

In [None]:
import pandas as pd # for dataframe operations
import os # for working with filepaths
import datetime # to convert columns in the dataframe
try:
    import _pickle as pickle # for serialization, _pickle == cPickle (faster than pickle)
except:
    import pickle # alternative
from sklearn.preprocessing import StandardScaler # for ml format

In [1]:
class DataConverter():
    """Turn raw price dataframes into scaled feature/target arrays.

    One fitted ``StandardScaler`` is kept per symbol in ``self.scalers`` so
    that a later single-row input can be scaled with the scaler that was
    fitted on that symbol's training data.
    """

    def __init__(self):
        # maps symbol -> scaler fitted in convert_ml_format
        self.scalers = {}

    def convert_df(self, df):
        """Normalise column names and split ``Date`` into numeric parts.

        The dataframe is modified in place and also returned.

        Args:
            df: dataframe with a ``Date`` column in ``YYYY-MM-DD`` format.

        Returns:
            The same dataframe with whitespace in column names replaced by
            underscores, ``Date`` removed and ``Year``, ``Month`` and ``Day``
            columns added.
        """
        # replace whitespaces in columns with underscores
        df.columns = [col.replace(' ', '_') for col in df.columns]

        # parse the whole column at once instead of calling strptime per row
        dates = pd.to_datetime(df['Date'], format='%Y-%m-%d')
        df['Year'] = dates.dt.year
        df['Month'] = dates.dt.month
        df['Day'] = dates.dt.day
        df.drop('Date', axis=1, inplace=True)
        return df

    def fill_targets(self, df):
        """Add ``Adj._Close_next``: the adjusted close of the following date.

        Rows are sorted chronologically, the index is reset so the earliest
        date has index 0, and the last row is dropped because its target is
        unknown. The dataframe is modified in place and also returned.

        Args:
            df: dataframe with ``Year``, ``Month``, ``Day`` and
                ``Adj._Close`` columns (as produced by ``convert_df``).

        Returns:
            The same dataframe with the ``Adj._Close_next`` column added.
        """
        df.sort_values(by=['Year', 'Month', 'Day'], inplace=True)  # dates in right order
        # Reset the index *before* shifting so the result does not depend on
        # the row order / index labels of the incoming dataframe.
        df.reset_index(drop=True, inplace=True)
        df['Adj._Close_next'] = df['Adj._Close'].shift(-1)
        # get rid of the last row, as we don't know the target for this one;
        # slicing keeps this a no-op on an empty dataframe
        df.drop(df.index[-1:], inplace=True)
        return df

    def convert_ml_format(self, df, symbol, target='Adj._Close_next'):
        """Split ``df`` into a standardised feature matrix and a target vector.

        A new ``StandardScaler`` is fitted on the features and stored under
        ``symbol`` in ``self.scalers`` (replacing any previous one).

        Args:
            df: dataframe containing the feature columns and ``target``.
            symbol: key under which the fitted scaler is stored.
            target: name of the target column.

        Returns:
            Tuple ``(X, y)`` of the scaled feature array and the target array.
        """
        X = df.drop(target, axis=1).values  # every column except the target
        y = df[target].values  # only target column
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        self.scalers[symbol] = scaler
        return X, y

    def convert_x(self, x, symbol):
        """Scale a single sample with the scaler stored for ``symbol``.

        Args:
            x: array holding one sample; it is reshaped to ``(1, -1)``.
            symbol: symbol whose scaler should be applied.

        Returns:
            The scaled sample as returned by the scaler's ``transform``.

        Raises:
            Exception: if no scaler is stored for ``symbol``.
        """
        try:
            scaler = self.scalers[symbol]
        except KeyError as err:
            raise Exception('Symbol {} not contained in Trainingset, therefore not possible to convert the input.'.format(symbol)) from err
        return scaler.transform(x.reshape(1, -1))

    def serialize(self, path='serialized_tool_objects/datapreparer.p'):
        """Pickle this converter's scalers to ``path``."""
        with open(path, 'wb') as file:
            pickle.dump(self.scalers, file)

    def initialize(self, path='serialized_tool_objects/datapreparer.p'):
        """Load previously pickled scalers from ``path`` into this converter."""
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that were written by serialize() from a trusted location.
        with open(path, 'rb') as file:
            self.scalers = pickle.load(file)

    def __repr__(self):
        return 'DataConverter()'

In [2]:
# Single converter instance used by the cells below; its per-symbol scalers
# are filled in by convert_ml_format.
converter = DataConverter()

In [3]:
data_dir = '../data/raw'
# Walk the raw-data directory and build the full path of every CSV file.
for file in os.listdir(data_dir):
    if not file.endswith(".csv"):
        continue  # ignore anything that is not a CSV export
    csv_file = os.path.join(data_dir, file)

    # TODO: train a model and save it in directory "models"
    # (will later on be used in an ensemble predictor)

In [4]:
# Full preparation pipeline for one symbol: load the raw CSV, normalise the
# columns and dates, attach next-day targets, then scale into ML arrays.
df = converter.fill_targets(
    converter.convert_df(pd.read_csv('../data/raw/MMM.csv'))
)
X, y = converter.convert_ml_format(df, 'MMM')

In [5]:
# Preview the first rows of the prepared dataframe (features plus the
# Adj._Close_next target column).
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,Ex-Dividend,Split_Ratio,Adj._Open,Adj._High,Adj._Low,Adj._Close,Adj._Volume,Year,Month,Day,Adj._Close_next
0,109.62,110.25,109.5,109.62,4500.0,0.0,1.0,2.927425,2.94425,2.924221,2.927425,72000.0,1970,1,2,2.94425
1,109.75,110.38,109.75,110.25,27900.0,0.0,1.0,2.930897,2.947721,2.930897,2.94425,446400.0,1970,1,5,2.974427
2,110.25,111.38,110.12,111.38,11000.0,0.0,1.0,2.94425,2.974427,2.940778,2.974427,176000.0,1970,1,6,2.990984
3,111.38,112.25,111.12,112.0,10300.0,0.0,1.0,2.974427,2.99766,2.967483,2.990984,164800.0,1970,1,7,3.031042
4,112.0,113.75,111.75,113.5,19000.0,0.0,1.0,2.990984,3.037718,2.984307,3.031042,304000.0,1970,1,8,3.024365
