Imports and user defined functions

In [1]:
import os
import numpy as np
import pandas as pd
import arch as ac

from os import error
from collections import OrderedDict as dct
from random import gauss
from random import seed
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_acf

In [2]:
# Summarizes object characteristics
def check_types_df(df):
    """Print the shape, ``attrs`` metadata and ``info()`` summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(
        "Shape: ", df.shape, "\n"
        "Attributes: ", df.attrs, "\n"
        )
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that appended a stray "None" line).
    df.info()
def do_call(which, args=None, kwargs=None):
    """Call ``which`` with optional positional and keyword arguments.

    Parameters
    ----------
    which : callable
        Function to invoke.
    args : iterable, optional
        Positional arguments; ``None`` means no positional arguments.
    kwargs : mapping, optional
        Keyword arguments; ``None`` means no keyword arguments.

    Returns
    -------
    object
        Whatever ``which`` returns.
    """
    # Normalise the "not supplied" case first: unpacking None raises TypeError,
    # which is what happened when neither args nor kwargs was given.
    positional = () if args is None else args
    keyword = {} if kwargs is None else kwargs
    return which(*positional, **keyword)

In [3]:
# Coerces the Date column to datetimes and the four price columns to floats
def fix_dtypes(df):
    """Convert ``Date`` to datetime and ``Open``/``High``/``Low``/``Close`` to numeric.

    Both conversions use ``errors="coerce"``; the numeric conversion also
    requests ``downcast="float"``. The columns are assigned back onto ``df``
    itself, and that same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns ``Date``, ``Open``, ``High``, ``Low`` and ``Close``.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in, with the converted columns.
    """
    df.Date = pd.to_datetime(df.Date, errors="coerce")
    for price_col in ("Open", "High", "Low", "Close"):
        as_number = pd.to_numeric(
            getattr(df, price_col), downcast="float", errors="coerce"
        )
        # same attribute-style column assignment as writing `df.Open = ...`
        setattr(df, price_col, as_number)
    return df
# Function is to test data for nulls introduced in the wrangling process
def check_nulls(lst, Keys):
    """Report which frames contain nulls and collect the affected rows.

    Parameters
    ----------
    lst : mapping
        Frames to inspect; it is indexed with the entries of ``Keys``.
    Keys : sequence
        Keys of ``lst`` to check, in reporting order.

    Returns
    -------
    lst_nans : OrderedDict
        For every key whose frame has at least one null: the rows of that
        frame that contain a null.
    test_results : OrderedDict
        For every key: the per-column boolean result of ``isnull().any()``.

    Raises
    ------
    OSError
        If ``lst`` and ``Keys`` differ in length. The file-level ``error``
        name (``os.error``, an alias of ``OSError``) is kept so the raised
        type does not change for existing callers.
    """
    from collections import OrderedDict

    lst_nans = OrderedDict()
    test_results = OrderedDict()

    if len(lst) != len(Keys):
        raise error("List & Keys object are not of the same length.")

    for j, key in enumerate(Keys):
        df = lst[key]

        # One boolean per column: does the column hold any null?
        col_has_null = df.isnull().any()
        test_results[key] = col_has_null

        # `i` is the position of the offending column within the frame
        for i, flagged in enumerate(col_has_null):
            if flagged:
                print(
                    "Nulls detected.", "\n",
                    "List index = ", j, "\n",
                    "Dataframe index = ", i, "\n",
                    )

        # Select the offending rows from the frame itself. The previous code
        # applied `.any(axis=1)` to the per-column summary Series, which
        # raises because a Series has no axis 1.
        if col_has_null.any():
            lst_nans[key] = df[df.isna().any(axis=1)]

    return lst_nans, test_results

DATA: Get data

In [4]:
# get directories of files
# NOTE: the default location is machine-specific; set the INTRADAY_DATA_DIR
# environment variable to read the workbooks from somewhere else.
d_root = os.environ.get(
    "INTRADAY_DATA_DIR",
    "C:/Users/Keegan/OneDrive/1 Studies/2021 - 2022/5003W/3 - Dissertation/5-Data/multi_series_data/intraday/clean/longest/",
)
# os.listdir() returns entries in arbitrary order; sort them so positional
# look-ups such as keys[0] / keys[1] refer to the same file on every run.
files = sorted(os.listdir(d_root))
dirs = [os.path.join(d_root, x) for x in files]

# one short key per file: strip whichever known file-name suffix is present
keys = [
    a.removesuffix("_intra_clean.xlsx").removesuffix("_index_clean.xlsx")
    for a in files
]

# A comprehension always preserves length, so a length check alone can never
# fail. Also require the keys to be distinct: duplicates would silently
# overwrite each other once the keys are used to index a dict.
if len(keys) != len(files) or len(set(keys)) != len(keys):
    raise error("Object did not retain length. Check processing for errors.")

In [5]:
# Load every workbook into an ordered mapping of key -> DataFrame.
# pd.read_excel opens and closes the file on its own; the bare open() call
# that used to precede it left one unclosed file handle per workbook.
data = dct()
for key, path in zip(keys, dirs):
    data[key] = pd.read_excel(path)

cols = list(data[keys[0]].columns)

# number of rows in each loaded frame, in `keys` order
length = [len(data[key]) for key in keys]
# NOTE(review): the factor 4 presumably counts the four price values
# (Open/High/Low/Close) per row - confirm.
print("total number of observations in all data: ", sum(length)*4)

total number of observations in all data:  3249852


In [6]:
# Flag every entry of `data` that is not a DataFrame. A plain loop is used
# because a comprehension run only for its print() side effect leaves a
# meaningless list as the cell output.
for key, obj in data.items():
    if not isinstance(obj, pd.DataFrame):
        print("Incorrect obj type detected. i= ", key)

[]

In [7]:
# Scratch frame: a deep copy of the second dataset, so later edits to `test`
# do not touch the frame stored in `data`
test = data[keys[1]].copy(deep=True)

In [8]:
# Print the shape, attrs and info() summary of the scratch frame
check_types_df(test)

Shape:  (46732, 5) 
Attributes:  {} 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46732 entries, 0 to 46731
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    46732 non-null  datetime64[ns]
 1   Open    46732 non-null  float64       
 2   High    46732 non-null  float64       
 3   Low     46732 non-null  float64       
 4   Close   46732 non-null  float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 1.8 MB
None


Ensure correct datatypes

In [9]:
# ensure dtypes are correct
data_typed = dct()

for key in keys:
    # fix_dtypes assigns into the frame it receives, so hand it a deep copy
    # and leave the raw frames in `data` untouched
    data_typed[key] = fix_dtypes(data[key].copy(deep=True))

# Take single sample
samp = data_typed[keys[0]].copy(deep=True)
samp.Name = keys[0]

# Check if data was preserved.
# True -> it wasn't: the column holds at least one null after coercion.
# The coerced frames (data_typed) are the ones inspected; looking at the raw
# `data` frames could never reveal nulls introduced by errors="coerce".
test_results = {key: data_typed[key].isnull().any() for key in keys}
pd.DataFrame(test_results).transpose()

Unnamed: 0,Date,Open,High,Low,Close
as51,False,False,False,False,False
cac,False,False,False,False,False
dax,False,False,False,False,False
ftsemib,False,False,False,False,False
hsi,False,False,False,False,False
ibex,False,False,False,False,False
ibov,False,False,False,False,False
jalsh,False,False,False,False,False
jci,False,False,False,False,False
kospi,False,False,False,False,False


In [10]:
# [check_types_df(data_typed[key]) for key in keys]
# NOTE: Last checked on the 27 Sept 2022, all dtypes are correct

In [11]:
# Positions of files that were flagged for a manual check (they index `dirs`
# in the commented-out print below).
# NOTE(review): the null-check table earlier in this notebook lists 10
# datasets, so positions >= 10 look stale for the current `dirs` - confirm.
lst_of_sus = [6,10,14,15,16,18,19]
# [print(dirs[i],'\n',i) for i in lst_of_sus]
# NOTE: files referred to by lst_of_sus are cleared of suspicions

In [12]:
# add column containing nulls
# Alternate 1.0 / 2.0 down the rows. Generating exactly one value per row
# also works for a frame with an odd number of rows, where appending the
# values in pairs produced a list one element short.
nul = []  # unused; kept only so the name stays defined
num = [float(1 + (row % 2)) for row in range(len(test))]
nulnum = nul + num  # unused; kept only so the name stays defined
# create column with np.nan
# The replacement is done on a fresh Series and the result is assigned:
# an inplace replace on `test["NulCol"]` acts on a temporary object and
# leaves the frame unchanged under pandas Copy-on-Write.
test["NulCol"] = pd.Series(num, index=test.index).replace(
    to_replace=float(2), value=np.nan
)
# show the rows that now contain a null
test[test.isna().any(axis=1)]

Unnamed: 0,Date,Open,High,Low,Close,NulCol
1,2022-03-11 11:00:00,6293.65,6293.65,6263.65,6269.63,
3,2022-03-11 10:00:00,6286.21,6295.35,6268.27,6281.48,
5,2022-03-11 09:00:00,6354.23,6360.03,6322.89,6322.89,
7,2022-03-11 08:00:00,6314.75,6342.02,6311.44,6341.06,
9,2022-03-11 07:00:00,6348.43,6373.69,6338.34,6357.36,
...,...,...,...,...,...,...
46723,2012-01-18 07:00:00,3283.25,3283.25,3263.88,3269.45,
46725,2012-01-18 06:00:00,3283.73,3286.09,3274.93,3278.59,
46727,2012-01-18 05:00:00,3263.53,3301.89,3260.63,3290.42,
46729,2012-01-18 04:00:00,3254.52,3259.99,3249.33,3250.47,


In [13]:
def log_returns(df_prices):
    """Compute row-to-row log-returns of the Open, High, Low and Close columns.

    The frame is indexed by its ``Date`` column and sorted in ascending index
    order; each price is then divided by the price in the previous row and
    the natural logarithm of the ratio is taken. The input is not modified.

    Parameters
    ----------
    df_prices : pandas.DataFrame
        Frame with the columns ``Date``, ``Open``, ``High``, ``Low``, ``Close``.

    Returns
    -------
    pandas.DataFrame
        Log-returns indexed by ``Date`` with the columns ``Open``, ``High``,
        ``Low`` and ``Close``. Rows in which every column is missing (such as
        the first row, which has no predecessor) are dropped.

    Raises
    ------
    KeyError
        If the ``Date`` column or one of the price columns is missing.
    """
    from numpy import log

    price_cols = ["Open", "High", "Low", "Close"]

    if "Date" not in df_prices.columns:
        # Fail loudly: printing the message and returning None only moved the
        # failure to the first use of the result.
        raise KeyError("Please ensure 'Date' column exists in Dataframe before proceeding.")

    # set_index returns a new frame, so the caller's data stays untouched
    prices = df_prices.set_index("Date").sort_index()[price_cols]
    returns = log(prices.div(prices.shift(1)))

    # Keep a row when at least one column has a value, matching the earlier
    # per-column dropna followed by an outer concat.
    return returns.dropna(how="all")
        

In [14]:
# log-returns of the sample frame, computed from a deep copy of `samp`
as51 = log_returns(samp.copy(deep=True))

In [None]:
def data_splitter(df, max_len=None, test=0.25, train=None):
    """Split a frame into consecutive windows, each cut into a train and a test part.

    NOTE(review): the original cell was an unfinished draft that did not
    parse (``df.loc[]``) and whose default ``train=(1-test)`` cannot refer to
    the parameter ``test``. The windowing below is an interpretation of that
    draft (two result lists plus a ``max_len`` argument) - confirm it matches
    the intended design.

    Rows are taken in their existing order (no sorting, no shuffling). Each
    window holds at most ``max_len`` consecutive rows; within a window the
    leading rows form the train part and the trailing rows the test part.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to split; assumed to be ordered already.
    max_len : int, optional
        Maximum number of rows per window. ``None`` uses one window covering
        all rows.
    test : float, default 0.25
        Fraction of each window assigned to the test part (rounded down).
    train : float, optional
        Fraction of each window assigned to the train part. ``None`` means
        "everything that is not test", i.e. ``1 - test``.

    Returns
    -------
    train_lst, test_lst : list, list
        One train slice and one test slice per window, in window order.

    Raises
    ------
    ValueError
        If the fractions are outside their valid range or ``max_len`` < 1.
    """
    if not 0 < test < 1:
        raise ValueError("test must be a fraction strictly between 0 and 1")
    if train is not None and (not 0 < train <= 1 or train + test > 1 + 1e-9):
        raise ValueError("train must be in (0, 1] and train + test must not exceed 1")

    n_rows = len(df)
    train_lst = list()
    test_lst = list()
    if n_rows == 0:
        return train_lst, test_lst

    if max_len is None:
        max_len = n_rows
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    tiny = 1e-9  # guards against products such as 28.999999999999996
    for start in range(0, n_rows, max_len):
        window = df.iloc[start:start + max_len]
        n_window = len(window)
        n_test = int(n_window * test + tiny)
        if train is None:
            n_train = n_window - n_test
        else:
            n_train = min(int(n_window * train + tiny), n_window - n_test)
        train_lst.append(window.iloc[:n_train])
        # slice from an explicit start so that n_test == 0 yields an empty part
        test_lst.append(window.iloc[n_window - n_test:])

    return train_lst, test_lst

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split

demo_set = as51.head(500)

# Values to split (Open log-returns) and their timestamps. The placeholder
# np.arange example that used to be assigned to x, y first was overwritten
# immediately and has been dropped, along with the repeated import.
x = as51.Open
y = as51.index

# shuffle=False keeps the existing row order, so the test part is the final
# 33 % of the series
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, shuffle=False)

In [25]:
# The same model can be manually assembled from the building blocks of an ARCH model
# (the models are only specified in this cell; nothing is fitted here)
from arch.univariate import ConstantMean, HARX, FIGARCH, Normal, GeneralizedError, SkewStudent

# Model 1: constant mean on the Open log-returns, FIGARCH volatility, skewed Student's t errors
am1 = ConstantMean(as51.Open)
am1.volatility = FIGARCH(1, 1, 2)
am1.distribution = SkewStudent()
# Model 2: HARX mean on the same series, same volatility and error distribution
am2 = HARX(as51.Open)
am2.volatility = FIGARCH(1, 1, 2)
am2.distribution = SkewStudent()


In [29]:
from statsmodels.stats.stattools import robust_skewness
from scipy.stats import skew
# Skewness of the training part of the Open log-returns, computed with two
# different routines so the results can be compared in the next cell
rskew = robust_skewness(x_train)
sskew = skew(x_train)

In [30]:
# Show the scipy skewness first, then the statsmodels robust_skewness result
print(
    sskew, '\n',
    rskew
)

-0.6984750803571317 
 (-0.6984755084426864, 0.028655192249464046, 0.007340919504942779, 0.004146251840600463)


In [None]:
def skew_gen(l_df):
    """Unfinished stub: only measures the length of ``l_df`` and returns None.

    TODO: implement; the intended computation is not visible in this notebook.
    """
    # number of items in the input; not used yet
    LEN = len(l_df)
    

In [120]:
# Preview the first rows of the log-return frame
as51.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-03 18:30:00,-0.001599,-0.004113,-0.00019,-0.000506
2011-01-03 19:00:00,-0.000969,-0.002021,-0.003651,-0.003609
2011-01-03 19:30:00,-0.003419,-0.000169,2.1e-05,0.001035
2011-01-03 20:00:00,0.00055,-0.002067,2.1e-05,-0.000317
2011-01-03 20:30:00,0.000232,0.002531,0.000761,0.002448


In [None]:
import seaborn as sns
# ratio of the Open price to the Close price of every row of the sample
te = (samp.Open/samp.Close)
# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay is
# the replacement for distplot's default histogram-plus-KDE figure.
# The trailing semicolon hides the Axes repr in the cell output.
sns.histplot(te, kde=True, stat="density");

In [97]:
# Second-to-last timestamp of the log-return index
as51.index[-2]

Timestamp('2022-03-10 23:30:00')

In [94]:
# Label-based look-up of the row(s) stored under that second-to-last timestamp
as51.loc[pd.Timestamp(as51.index[-2])]

Open    -0.001088
High    -0.001088
Low     -0.001088
Close   -0.001088
Name: 2022-03-10 23:30:00, dtype: float32

In [None]:

# The same model can be manually assembled from the building blocks of an ARCH model
from arch.univariate import ConstantMean, HARX, FIGARCH, Normal, GeneralizedError, SkewStudent

# arch mean models take a single one-dimensional series; passing the whole
# four-column `as51` frame is rejected when the model is constructed.
# NOTE(review): the Open log-returns are used to match the earlier model
# cell - switch the column here if another series was intended.
returns = as51.Open

# Model 1: constant mean, normal errors
am1 = ConstantMean(returns)
am1.volatility = FIGARCH(1, 1, 2)
am1.distribution = Normal()
# Model 2: HARX mean, skewed Student's t errors
am2 = HARX(returns)
am2.volatility = FIGARCH(1, 1, 2)
am2.distribution = SkewStudent()
# Model 3: HARX mean, generalized error distribution
am3 = HARX(returns)
am3.volatility = FIGARCH(1, 1, 2)
am3.distribution = GeneralizedError()

In [13]:
# NOTE(review): presumably opens the arch package documentation - confirm.
# Interactive helper left over from development (its execution count is out
# of sequence with the cells above); consider removing it from the final notebook.
ac.doc()