In [10]:
import datetime
import os
from os import listdir

import pandas as pd

In [58]:
# Input: directory holding the raw files (note the trailing slash)
raw_path = '../../data/TSLA/RawData/'
raw_files = listdir(raw_path)

# Output: directory and file name of the processed CSV
data_path = '../../data/TSLA/ProcData'
data_name = 'proc_data.csv'
output_fp = f'{data_path}/{data_name}'

In [46]:
# obtain needed data for processing
# Keep only the rows from the hour before market open (08:30-09:30); volatility is analysed on that window
# Market Open: NYSE open Monday-Friday 9:30am to 4pm. Eastern time
def raw_to_data(fp, window_start=datetime.time(hour=8, minute=30),
                window_end=datetime.time(hour=9, minute=30)):
    """Load one raw CSV and keep only the rows inside a time-of-day window.

    Parameters
    ----------
    fp : str or file-like
        Anything accepted by ``pd.read_csv``. The file must contain a
        ``time`` column of timezone-naive timestamps.
    window_start, window_end : datetime.time, optional
        Inclusive bounds of the time-of-day window. The defaults select the
        hour before the 09:30 market open (08:30-09:30).

    Returns
    -------
    pandas.DataFrame
        The rows whose time of day lies in ``[window_start, window_end]``,
        with ``time`` localized to US/Eastern and a fresh 0..n-1 index.
    """
    data = pd.read_csv(fp)

    # tz_localize attaches the US/Eastern zone to the naive timestamps; it
    # does not shift the wall-clock values.
    data['time'] = pd.DatetimeIndex(data['time']).tz_localize('US/Eastern')

    # Vectorised time-of-day comparison instead of calling a Python lambda
    # once per row through Series.apply.
    time_of_day = data['time'].dt.time
    in_window = (time_of_day >= window_start) & (time_of_day <= window_end)

    return data.loc[in_window].reset_index(drop=True)

In [48]:
# process every raw file in raw_path and merge the results into a single DataFrame
def merge_data(raw_path):
    """Run ``raw_to_data`` on every entry of a directory and stack the results.

    Parameters
    ----------
    raw_path : str
        Directory containing the raw files. A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated row-wise, in sorted file-name order.
        Each file's own 0..n-1 index is kept, so index labels repeat across
        files. An empty DataFrame is returned when the directory is empty.
    """
    # listdir returns entries in arbitrary order; sorting makes the merged
    # row order reproducible from run to run.
    file_names = sorted(listdir(raw_path))

    # os.path.join works whether or not raw_path ends with a separator.
    frames = [raw_to_data(os.path.join(raw_path, name)) for name in file_names]

    # pd.concat raises on an empty list, so handle the empty directory here.
    if not frames:
        return pd.DataFrame()

    # Concatenate once rather than growing a frame inside the loop, which
    # re-copies all accumulated rows on every iteration.
    return pd.concat(frames)

In [49]:
# Build the combined frame from every file found under raw_path
data = merge_data(raw_path)

In [51]:
# Sanity check: (rows, columns) of the merged frame
data.shape

(23785, 6)

In [52]:
def feature_engineer(data, window=5):
    """Add the modelling features and return a new, time-indexed DataFrame.

    The input frame is left untouched, so the function can be called
    repeatedly on the same object.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``time`` column (it becomes the index) and a
        ``close`` column.
    window : int, optional
        Number of rows in the rolling window used for ``Volatility``.
        Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        ``data`` indexed by ``time`` with two extra columns,
        ``Day_Perc_Change`` and ``Volatility``; rows containing any NaN
        (including the leading rows of the rolling window) are dropped.

    Notes
    -----
    Rows are used in the order received; nothing here sorts by time, so both
    features are relative to the preceding *row*, whatever its timestamp.
    """
    # set_index without inplace returns a new frame, so the caller's
    # DataFrame is not mutated by this step or by the column additions below.
    features = data.set_index('time')

    # Percent change of `close` relative to the previous row.
    features['Day_Perc_Change'] = features['close'].pct_change(periods=1) * 100

    # NOTE(review): despite the column name this is a rolling *mean* of
    # `close` (a moving average), not a dispersion measure. Kept as-is so the
    # produced values do not change; confirm whether a rolling std was meant.
    features['Volatility'] = features['close'].rolling(window).mean()

    # Drop incomplete rows so the result can be fed straight into a model.
    return features.dropna(axis=0)

In [53]:
# Replace the merged frame with its time-indexed, feature-engineered version
data = feature_engineer(data)

In [54]:
# Preview the first rows of the engineered frame
data.head()

Unnamed: 0_level_0,open,high,low,close,volume,Day_Perc_Change,Volatility
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-15 09:26:00-05:00,852.98,853.25,852.98,853.25,3460,2.3e-05,853.10396
2021-01-15 09:25:00-05:00,852.75,853.0,852.75,852.85,4066,-0.04688,853.25996
2021-01-15 09:24:00-05:00,853.2,853.2,852.5,852.8,5050,-0.005863,852.98996
2021-01-15 09:23:00-05:00,853.43,853.45,853.25,853.25,3859,0.052767,853.07996
2021-01-15 09:22:00-05:00,853.5,853.5,853.5,853.5,1805,0.0293,853.13


In [55]:
# Count missing values per column
data.isnull().sum()

open               0
high               0
low                0
close              0
volume             0
Day_Perc_Change    0
Volatility         0
dtype: int64

In [56]:
# Sanity check: (rows, columns) after feature engineering
data.shape

(23781, 7)

In [59]:

# Write the frame to output_fp; to_csv includes the index by default
data.to_csv(output_fp)