In [15]:
import os
from os import listdir
import pandas as pd
import datetime

In [14]:
# ---- File locations ----

# Directory that holds the raw input files (written with a trailing slash).
raw_path = '../../data/TSLA/RawData/'
# Directory for processed outputs (written without a trailing slash; the
# notebook joins file names onto it with '/').
data_path = '../../data/TSLA/ProcData'

# ---- File names used under data_path ----
merged_data = 'merge_filter_data.csv'  # merged + filtered data
feat_data = 'feat_eng_data.csv'        # feature-engineered data
vol_data = 'data_vol.csv'              # time + volatility data

Merge and Preprocess raw data

In [15]:
# Load one raw file and keep only the rows needed for processing:
# volatility is analysed on the hour leading up to market open (8:30am-9:30am).
# Market hours: NYSE is open Monday-Friday, 9:30am to 4pm Eastern time.
def raw_to_data(fp,
                window_start=datetime.time(hour=8, minute=30),
                window_end=datetime.time(hour=9, minute=30),
                tz='US/Eastern'):
    """Read one raw CSV and keep the rows inside a time-of-day window.

    Parameters
    ----------
    fp : str or path-like
        Path of the CSV file to read. The file must contain a ``time`` column
        that pandas can parse as timezone-naive datetimes.
    window_start, window_end : datetime.time, optional
        Inclusive bounds of the time-of-day window to keep. The defaults
        (08:30 and 09:30) reproduce the original hard-coded filter, i.e. the
        hour up to and including the 9:30am market open.
    tz : str, optional
        Time zone attached to the ``time`` column (default ``'US/Eastern'``).

    Returns
    -------
    pandas.DataFrame
        The filtered rows, in file order, with a fresh 0..n-1 index and a
        timezone-aware ``time`` column.
    """
    data = pd.read_csv(fp)

    # tz_localize labels the naive timestamps with `tz`; it does not shift
    # the wall-clock values.
    data['time'] = pd.DatetimeIndex(data['time']).tz_localize(tz)

    # Vectorized time-of-day comparison instead of a per-row Python lambda.
    # `.dt.time` yields the local wall-clock time of each timestamp.
    time_of_day = data['time'].dt.time
    in_window = (time_of_day >= window_start) & (time_of_day <= window_end)

    # Return a new frame rather than resetting the index of a slice in place.
    return data.loc[in_window].reset_index(drop=True)

In [16]:
# merge processed files from raw path to data path
def merge_data(raw_path):
    """Run ``raw_to_data`` on every file in ``raw_path`` and stack the results.

    Parameters
    ----------
    raw_path : str
        Directory containing the raw files. A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames, taken in sorted
        file-name order. Each file's rows keep the index ``raw_to_data`` gave
        them, so index labels repeat across files. An empty DataFrame is
        returned when the directory holds no usable files.
    """
    frames = []

    # listdir() returns names in arbitrary order; sorting makes the merged
    # row order reproducible from run to run.
    for file_name in sorted(listdir(raw_path)):
        # os.path.join works whether or not raw_path ends with a separator.
        fp = os.path.join(raw_path, file_name)

        # Skip sub-directories and hidden entries (for example
        # .ipynb_checkpoints or .DS_Store), which are not data files.
        if file_name.startswith('.') or not os.path.isfile(fp):
            continue

        frames.append(raw_to_data(fp))

    # pd.concat raises on an empty list, so keep the original "empty frame"
    # result for a directory with nothing to merge.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames)

In [22]:
# Build the merged frame from every file under raw_path; each file is loaded
# and filtered by raw_to_data before being stacked.
data = merge_data(raw_path)
# Saving the merged frame to data_path is currently disabled:
# data.to_csv(data_path+'/'+merged_data, index=False)

In [23]:
# data = pd.read_csv(data_path+'/'+merged_data, parse_dates=['time'])

Feature Engineer preprocessed data for volatility analysis

In [24]:
def feature_engineer(data, time_wdw):
    """Add per-bar range and rolling-volatility columns.

    The input frame is left untouched; all work happens on new frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``time``, ``high`` and ``low`` columns.
    time_wdw : int or offset
        Window passed straight to ``Series.rolling``. An integer is a number
        of rows.

    Returns
    -------
    pandas.DataFrame
        ``data`` with two extra columns, ``diff`` (``high - low``) and
        ``Volatility`` (rolling mean of ``diff``). Rows containing any NaN
        are dropped, and ``time`` is restored as the first column with a
        fresh 0..n-1 index.

    Notes
    -----
    NOTE(review): the rolling window follows the row order of ``data`` as
    passed in. Nothing here sorts by time or restarts the window at day or
    file boundaries, so confirm that the caller's ordering is what is wanted.
    """
    # Index by time for time-series operations. set_index returns a new
    # frame, so the caller's DataFrame is not modified.
    indexed = data.set_index('time')

    # High-low range within each bar.
    bar_range = indexed['high'] - indexed['low']

    # Volatility analysis: rolling mean of the bar range.
    volatility = bar_range.rolling(time_wdw).mean()

    features = indexed.assign(**{'diff': bar_range, 'Volatility': volatility})

    # For modelling: drop incomplete rows (including the warm-up rows where
    # the rolling window is not yet full) and bring `time` back as a column.
    return features.dropna(axis=0).reset_index()

In [25]:
# Add the `diff` and `Volatility` columns; 5 is the rolling-window length
# passed as time_wdw. This rebinds `data` to the engineered frame.
data = feature_engineer(data, 5)

# Writing the engineered frame to feat_data is disabled here, although a
# later cell reads that file back from disk.
# data.to_csv(data_path+'/'+feat_data, index=False)

Date and Volatility Column for Further Inspection

In [26]:
# Reload the feature-engineered data from disk, parsing `time` as datetimes.
# NOTE(review): the cell that writes this file is commented out, so this
# reads whatever copy already exists on disk rather than the frame computed
# earlier in the notebook. Confirm that is intended.
feat_csv_path = '/'.join([data_path, feat_data])
data = pd.read_csv(feat_csv_path, parse_dates=['time'])
data.head()

Unnamed: 0,time,open,high,low,close,volume,diff,Volatility
0,2021-01-15 09:26:00-05:00,852.98,853.25,852.98,853.25,3460,0.27,1.20796
1,2021-01-15 09:25:00-05:00,852.75,853.0,852.75,852.85,4066,0.25,0.52396
2,2021-01-15 09:24:00-05:00,853.2,853.2,852.5,852.8,5050,0.7,0.36396
3,2021-01-15 09:23:00-05:00,853.43,853.45,853.25,853.25,3859,0.2,0.31396
4,2021-01-15 09:22:00-05:00,853.5,853.5,853.5,853.5,1805,0.0,0.284


In [29]:
# Derive the calendar date of every row from its timestamp. A per-element
# .date() call is used, so this works on each value individually.
row_dates = data['time'].apply(lambda stamp: stamp.date())

# Keep only the two columns needed for further inspection.
data = data.assign(date=row_dates).loc[:, ['date', 'Volatility']]
# data.to_csv(data_path+'/'+vol_data, index=False)