In [3]:
import os
from os import listdir
import pandas as pd
import datetime
import cv2

ModuleNotFoundError: No module named 'cv2'

In [4]:
from PIL import Image 

In [14]:
# file paths

# raw data path
raw_path = '../../data/TSLA/RawData/'
# processed data path
data_path = '../../data/TSLA/ProcData'
# merged filtered file name
merged_data = 'merge_filter_data.csv'
# feature engineered file name
feat_data = 'feat_eng_data.csv'
# data with time + volatility
vol_data = 'data_vol.csv'

Merge and Preprocess raw data

In [15]:
# obtain needed data for processing
# Filter out data to use volatility on data an hour prior to market open
# Market Open: NYSE open Monday-Friday 9:30am to 4pm. Eastern time
def raw_to_data(fp):
    
    # read in data
    data = pd.read_csv(fp)
    # convert time column time zone
    data.time = pd.DatetimeIndex(data.time).tz_localize('US/Eastern')
    # filter out data for use
    market_open_time = datetime.time(hour=9, minute=30)
    market_1hr_early = datetime.time(hour=8, minute=30)
    data = data.loc[data.time.apply(lambda date:(date.time()<=market_open_time)
                        and (date.time()>=market_1hr_early)
                        )]
    data.reset_index(drop=True, inplace=True)
    
    return data

In [16]:
# merge processed files from raw path to data path
def merge_data(raw_path):
    
    # output dataframe
    output = pd.DataFrame()
    # all files to be processed
    raw_files = listdir(raw_path)
    
    for curr_file in raw_files:
        fp = raw_path + curr_file
        data = raw_to_data(fp)
        output = pd.concat([output, data])
    
    return output

In [22]:
data = merge_data(raw_path)
# data.to_csv(data_path+'/'+merged_data, index=False)

In [23]:
# data = pd.read_csv(data_path+'/'+merged_data, parse_dates=['time'])

Feature Engineer preprocessed data for volatility analysis

In [24]:
def feature_engineer(data, time_wdw):
    
    # for time series indexing
    data.set_index('time', inplace=True)
    
    # High-Low within minute bar
    data['diff'] = data.high-data.low
    # Volitility Analysis
    vol = data['diff'].rolling(time_wdw).mean()
    data = data.assign(Volatility=vol)
    # for modelling purpose
    data.dropna(axis=0, inplace=True)
    data.reset_index(inplace = True)
    
    return data

In [25]:
data = feature_engineer(data, 5)

# data.to_csv(data_path+'/'+feat_data, index=False)

Volatility Dataframe --> Dataframe for Gramium Angular Field

In [26]:
data = pd.read_csv(data_path+'/'+feat_data, parse_dates=['time'])
data.head()

Unnamed: 0,time,open,high,low,close,volume,diff,Volatility
0,2021-01-15 09:26:00-05:00,852.98,853.25,852.98,853.25,3460,0.27,1.20796
1,2021-01-15 09:25:00-05:00,852.75,853.0,852.75,852.85,4066,0.25,0.52396
2,2021-01-15 09:24:00-05:00,853.2,853.2,852.5,852.8,5050,0.7,0.36396
3,2021-01-15 09:23:00-05:00,853.43,853.45,853.25,853.25,3859,0.2,0.31396
4,2021-01-15 09:22:00-05:00,853.5,853.5,853.5,853.5,1805,0.0,0.284


In [29]:
data = data.assign(date=data.time.apply(lambda x:x.date()))
data = data[['date','Volatility']]
# data.to_csv(data_path+'/'+vol_data, index=False)

In [30]:
data.date.value_counts().min()

7

In [11]:
data.groupby()

7

In [28]:
data.head()

Unnamed: 0,date,Volatility
0,2021-01-15,1.20796
1,2021-01-15,0.52396
2,2021-01-15,0.36396
3,2021-01-15,0.31396
4,2021-01-15,0.284


In [31]:
data = pd.read_csv(data_path+'/'+vol_data, parse_dates=['date'])

In [38]:
data.sort_values(by='date', ascending=True, inplace=True)
data.head()

Unnamed: 0,date,Volatility
12913,2019-01-30,0.0168
12919,2019-01-30,0.020076
12920,2019-01-30,0.020076
12921,2019-01-30,0.023276
12922,2019-01-30,0.023276


In [41]:
data.date.value_counts(sort=False).sort_index()#.min()

2019-01-30    46
2019-01-31    55
2019-02-01    27
2019-02-04    37
2019-02-05    19
2019-02-06    30
2019-02-07    30
2019-02-08    30
2019-02-11    47
2019-02-12    33
2019-02-13    25
2019-02-14    44
2019-02-15    22
2019-02-19    20
2019-02-20    58
2019-02-21    18
2019-02-22    47
2019-02-25    27
2019-02-26    48
2019-02-27    31
2019-02-28    57
2019-03-01    54
2019-03-04    51
2019-03-05    58
2019-03-06    46
2019-03-07    45
2019-03-08    49
2019-03-11    45
2019-03-12    44
2019-03-13    34
2019-03-14    45
2019-03-15    59
2019-03-18    40
2019-03-19    53
2019-03-20    28
2019-03-21    29
2019-03-22    39
2019-03-25    59
2019-03-26    35
2019-03-27    30
2019-03-28    46
2019-03-29    38
2019-04-01    44
2019-04-02    27
2019-04-03    40
2019-04-04    61
2019-04-05    55
2019-04-08    47
2019-04-09    43
2019-04-10    39
2019-04-11    59
2019-04-12    33
2019-04-15    32
2019-04-16    35
2019-04-17    25
2019-04-18    31
2019-04-22    50
2019-04-23    60
2019-04-24    

In [36]:
pd.set_option('display.max_rows', 500)

In [11]:
import matplotlib.pyplot as plt

In [12]:
temp_img = plt.imread('imgs/temp.png')
# temp_img = Image.open('imgs/temp.png')

In [14]:
temp_img.shape

(235, 235, 4)

In [None]:
def df_gaf(data):
    
    # number of features per day set equal to be the min
    num_feats = data.date.value_counts().min()
    
    

In [13]:
data.head()

Unnamed: 0,time,Volatility
0,2021-01-15 09:26:00-05:00,1.20796
1,2021-01-15 09:25:00-05:00,0.52396
2,2021-01-15 09:24:00-05:00,0.36396
3,2021-01-15 09:23:00-05:00,0.31396
4,2021-01-15 09:22:00-05:00,0.284
