### DateTime Intervals, Aggregates ... 

In [2]:
# import Pandas version > 1.10.  locally: use base(python 3.9.12)
import pandas as pd 
pd.set_option('expand_frame_repr', False)
# pd.set_option('display.max_columns', 0) # Display any number of columns
# pd.set_option('display.max_rows', 0) # Display any number of rows
# set max columns to display
pd.set_option('display.max_columns', 50)

In [3]:
# importing data  "C:\\August2022\\ES 12-22 Globex_2022_09_21.csv"
initial_data = pd.read_csv("C:\\August2022\\ES 12-22 Globex_2022_09_21.csv", parse_dates=['DateTime'])

In [59]:
initial_data.head()

Unnamed: 0,DateTime,Vol,BidAsk,Bid,Ask,Price
0,2022-09-21 00:00:00:720,1,AtBid,3879.5,3879.75,3879.5
1,2022-09-21 00:00:00:720,1,AtBid,3879.5,3879.75,3879.5
2,2022-09-21 00:00:00:900,1,AtAsk,3879.25,3879.75,3879.75
3,2022-09-21 00:00:00:900,1,AtAsk,3879.25,3879.75,3879.75
4,2022-09-21 00:00:00:900,1,AtAsk,3879.25,3879.75,3879.75


In [None]:
initial_data.shape # (1742042, 6)

In [57]:
initial_data.columns

Index(['DateTime', 'Vol', ' BidAsk', ' Bid', ' Ask', ' Price '], dtype='object')

In [12]:
initial_data[' BidAsk'].unique()

array(['AtBid', 'AtAsk', 'BelowBid', 'AboveAsk', 'BetweenBidAsk'],
      dtype=object)

In [60]:
map_BA = {'AtBid': -1, 'BelowBid': -2, 'AtAsk':1, 'AboveAsk': 2, 'BetweenBidAsk': 0}
df = initial_data.replace({' BidAsk': map_BA}) 
df.sample(3)
cols = {' BidAsk': 'BidAsk', ' Bid': 'Bid', ' Ask':'Ask', ' Price ': 'Price'}  # strip the space from columns and rename
df.rename(columns=cols, inplace=True)
df.columns

Index(['DateTime', 'Vol', 'BidAsk', 'Bid', 'Ask', 'Price'], dtype='object')

In [61]:
df.dtypes

DateTime     object
Vol           int64
BidAsk        int64
Bid         float64
Ask         float64
Price       float64
dtype: object

## Aggregating data based on Intervals  Hour, Month, Offset...    
- Combining data based on different Time Intervals.

In [92]:
df1 = df.copy()   #DateTime.head().dtypes

In [93]:
df1.head()

Unnamed: 0,DateTime,Vol,BidAsk,Bid,Ask,Price
0,2022-09-21 00:00:00:720,1,-1,3879.5,3879.75,3879.5
1,2022-09-21 00:00:00:720,1,-1,3879.5,3879.75,3879.5
2,2022-09-21 00:00:00:900,1,1,3879.25,3879.75,3879.75
3,2022-09-21 00:00:00:900,1,1,3879.25,3879.75,3879.75
4,2022-09-21 00:00:00:900,1,1,3879.25,3879.75,3879.75


In [94]:
df1['DateTime'] = pd.to_datetime(df1['DateTime'].str.strip(), format="%Y-%m-%d %H:%M:%S:%f")

In [98]:
df1.dtypes
df1.columns

Index(['DateTime', 'Vol', 'BidAsk', 'Bid', 'Ask', 'Price'], dtype='object')

#### Creating Derivative Features

In [113]:
# Volume added in each hour   
hr_Vol = df1.resample('H', on='DateTime').Vol.sum()   #  1st derivative feature of Volume per hour

In [114]:
hr_Vol.head()    #  1st derivative feature of Volume per hour

DateTime
2022-09-21 00:00:00     3092
2022-09-21 01:00:00    14471
2022-09-21 02:00:00    42328
2022-09-21 03:00:00    31893
2022-09-21 04:00:00    18994
Freq: H, Name: Vol, dtype: int64

In [143]:
hr_Vol_BA_sum = df1.resample('H', on='DateTime').agg({'Vol':'sum', 'BidAsk':'sum'})

In [144]:
hr_Vol_BA_mean = df1.resample('H', on='DateTime').agg({'Vol':'sum', 'BidAsk':'mean'})

In [145]:
hr_Vol_BA_mean.head()          #  2st derivative feature of Volume per hour with BidAsk mean

Unnamed: 0_level_0,Vol,BidAsk
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-21 00:00:00,3092,-0.04531
2022-09-21 01:00:00,14471,-0.151301
2022-09-21 02:00:00,42328,-0.066566
2022-09-21 03:00:00,31893,-0.030371
2022-09-21 04:00:00,18994,0.002385


In [147]:
hr_Vol_BA_sum.shape  #   (17, 2)
hr_Vol_BA_sum.head()          #  3th derivative feature of Volume per hour with BidAsk sum

Unnamed: 0_level_0,Vol,BidAsk
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-21 00:00:00,3092,-114
2022-09-21 01:00:00,14471,-1715
2022-09-21 02:00:00,42328,-2372
2022-09-21 03:00:00,31893,-794
2022-09-21 04:00:00,18994,36


In [158]:
group_1 = df1.groupby([pd.Grouper(key='DateTime', freq='H'), 'BidAsk']).Vol.sum()   

In [None]:
group_1.shape  # (69,)
group_1.head(44)  #  grouped per Hr of BidAsk and Volume sum 

In [162]:
# sec_30 = df1.resample('30S', ).ffill()[0:5] 
sec_30 = df1.resample('30S', on='DateTime').agg({'Vol':'sum', 'BidAsk':'mean'})

In [176]:
sec_30.shape  #  (2040, 2)
sec_30.sample(5)

Unnamed: 0_level_0,Vol,BidAsk
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-21 14:35:00,10318,-0.031703
2022-09-21 01:12:30,6,-0.333333
2022-09-21 10:05:00,2353,0.034254
2022-09-21 03:43:30,100,-0.098901
2022-09-21 12:04:30,530,0.028947


In [181]:
sec_1 = df1.resample('S', on='DateTime').agg({'Vol':'sum', 'BidAsk':'mean'})

In [197]:
sec_1.shape  # (61200, 2)
sec_1.dropna(axis=0, inplace=True)
sec_1.shape   #  (41508, 2)
sec_1.head()

Unnamed: 0_level_0,Vol,BidAsk
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-21 00:00:00,6,0.333333
2022-09-21 00:00:03,2,-1.0
2022-09-21 00:00:07,1,1.0
2022-09-21 00:00:12,8,-0.5
2022-09-21 00:00:13,4,-1.0


In [219]:
merged_1 = df1.merge(sec_1, on='DateTime', how='left')

In [220]:
merged_1.shape   # (1742042, 8)

(1742042, 8)

In [216]:
merged_1.shape  #   (46839, 8)   right merge

(46839, 8)

In [218]:
df1.shape   # right merge

(1742042, 6)

In [221]:
merged_1.shape  #  (6747, 8)
merged_1.sample(5)

Unnamed: 0,DateTime,Vol_x,BidAsk_x,Bid,Ask,Price,Vol_y,BidAsk_y
604850,2022-09-21 13:03:21.820,1,-2,3890.75,3891.25,3890.75,,
1517738,2022-09-21 15:36:59.472,1,1,3841.0,3841.25,3841.25,,
1025926,2022-09-21 14:36:54.720,1,-1,3878.0,3878.25,3878.0,,
451104,2022-09-21 11:00:56.748,2,-1,3891.25,3891.5,3891.25,,
1375990,2022-09-21 15:15:59.444,1,-1,3891.5,3891.75,3891.5,,


In [None]:
sec_1.plot.line(y="Close", use_index=True)

In [7]:
# Changing start time for each hour, by default start time is at 0th minute
# data.resample('H', on='created_at', offset='15Min10s').price.sum().head(5) 
data.resample('H', on='created_at', offset='-15Min0s').price.sum().head(5)  #  offset can be negative  offset='-15Min10s'

created_at
2015-12-14 17:45:00    5449.90
2015-12-14 18:45:00       0.00
2015-12-14 19:45:00      74.76
2015-12-14 20:45:00       8.20
2015-12-14 21:45:00       0.00
Freq: H, Name: price, dtype: float64