### Note for reader
- This file and the report should go hand in hand.
- I tried to change a few things after making the report, and unfortunately new errors popped up.
- Most of the logic remains the same, slight updates.
- I will be uploading a fully working file to my Github repo over the weekend. 
- Here is the link: https://github.com/hmanickam13?tab=repositories

In [1]:
# Import necessary libraries
import glob
import os
import pandas as pd
from datetime import datetime

# pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

# This function reads a parquet file into a Pandas DataFrame
def readparquet(file):
  
    # Read a parquet file into a Pandas DataFrame
    df1 = pd.read_parquet(file) 

    # Get the file name without extension
    name_without_extension = os.path.splitext(os.path.basename(file))[0]

    # Print the name of the file that was read
    print("Name of file read: "+name_without_extension+"\n")

    return df1

# Reading the csv
def readcsv(file):
    # parse_dates = ['transacttime']
    df1 = pd.read_csv(file)
    # df1 = pd.read_csv(file, parse_dates=parse_dates)
    # df1 = pd.read_csv('/content/drive/MyDrive/CME/MD_ZN_20220201.csv')

    return df1

In [2]:
grouped_file_names = readcsv('grouped_file_names.csv')

# View
# grouped_file_names

In [3]:
# Function to view the list of dates that we have for each file
def manipulate_strings(file_names):
    """
    Manipulates strings in all columns of a dataframe.
    - Removes .parquet from all rows.
    - Keeps only the last 8 characters of the string in all rows.
    - Converts strings to datetime objects.

    Args:
        df (pandas.DataFrame): Input dataframe.

    Returns:
        pandas.DataFrame: Dataframe with manipulated string values.
    """
    # Copy dataframe to avoid modifying the original
    date_view = file_names.copy()

    # Iterate over columns and manipulate string values
    for col in date_view.columns:
        date_view[col] = date_view[col].apply(lambda x: str(x).replace('.parquet', '')[-8:])
        date_view[col] = pd.to_datetime(date_view[col], format='%Y%m%d', errors='coerce')

    # Sort each column individually in ascending order
    for col in date_view.columns:
        date_view[col] = date_view[col].sort_values(ascending=True).reset_index(drop=True)

    # Sort each column individually in ascending order
    for col in date_view.columns:
        file_names[col] = file_names[col].sort_values(ascending=True).reset_index(drop=True)

    return date_view, file_names

dates_for_each_type, file_names = manipulate_strings(grouped_file_names)

# View
# dates_for_each_type

# View
# file_names

# We remove the last row because we dont have the factors data for it
dates_for_each_type = dates_for_each_type.drop(index=dates_for_each_type.index[-1])
file_names = file_names.drop(index=file_names.index[-1])

# View
file_names

Unnamed: 0,ORDERBOOK,FACTORS,TIMESALES,VOLATILITY_TZ,VOLATILITY_HOUR
0,ORDERBOOK_20200129.parquet,FACTORS_20200129.parquet,TIMESALES_20200129.parquet,VOLATILITY_TZ_20200129.parquet,VOLATILITY_HOUR_20200129.parquet
1,ORDERBOOK_20200318.parquet,FACTORS_20200318.parquet,TIMESALES_20200318.parquet,VOLATILITY_TZ_20200318.parquet,VOLATILITY_HOUR_20200318.parquet
2,ORDERBOOK_20200429.parquet,FACTORS_20200429.parquet,TIMESALES_20200429.parquet,VOLATILITY_TZ_20200429.parquet,VOLATILITY_HOUR_20200429.parquet
3,ORDERBOOK_20200610.parquet,FACTORS_20200610.parquet,TIMESALES_20200610.parquet,VOLATILITY_TZ_20200610.parquet,VOLATILITY_HOUR_20200610.parquet
4,ORDERBOOK_20200729.parquet,FACTORS_20200729.parquet,TIMESALES_20200729.parquet,VOLATILITY_TZ_20200729.parquet,VOLATILITY_HOUR_20200729.parquet
5,ORDERBOOK_20200826.parquet,FACTORS_20200826.parquet,TIMESALES_20200826.parquet,VOLATILITY_TZ_20200826.parquet,VOLATILITY_HOUR_20200826.parquet
6,ORDERBOOK_20200916.parquet,FACTORS_20200916.parquet,TIMESALES_20200916.parquet,VOLATILITY_TZ_20200916.parquet,VOLATILITY_HOUR_20200916.parquet
7,ORDERBOOK_20201104.parquet,FACTORS_20201104.parquet,TIMESALES_20201104.parquet,VOLATILITY_TZ_20201104.parquet,VOLATILITY_HOUR_20201104.parquet
8,ORDERBOOK_20201216.parquet,FACTORS_20201216.parquet,TIMESALES_20201216.parquet,VOLATILITY_TZ_20201216.parquet,VOLATILITY_HOUR_20201216.parquet
9,ORDERBOOK_20210127.parquet,FACTORS_20210127.parquet,TIMESALES_20210127.parquet,VOLATILITY_TZ_20210127.parquet,VOLATILITY_HOUR_20210127.parquet


In [5]:
# Declare path variable
# path = 'Data/'
# Substrings present in the file name
# substrings = ['ORDERBOOK','FACTORS','TIMESALES','VOLATILITY_TZ','VOLATILITY_HOUR']

# separate_files_by_substrings_in_path function not in this file, check other file
# file_names = separate_files_by_substrings_in_path(path,substrings)
# file_names

In [274]:
# Sample code on how to read files
# vol_hour = readparquet(path+"/"+str(file_names['VOLATILITY_HOUR'][0]))
# vol_tz = readparquet(path+"/"+str(file_names['VOLATILITY_TZ'][0]))
# ob = readparquet(path+"/"+str(file_names['ORDERBOOK'][0]))
# factors = readparquet(path+"/"+str(file_names['FACTORS'][0]))
# timesales = readparquet(path+"/"+str(file_names['TIMESALES'][0]))

# Always delete files dataframes after use
# del vol_hour
# del vol_tz
# del ob
# del factors
# del timesales

In [4]:
# Preprocess function for orderbook data
def preprocess(df2):
   df20 = df2.copy()

   # These are the columns that I want to retain from the orderbook data
   col_names = ['transactTime', 'level_10_bid_quantity', 'level_10_bid_orders',
               'level_10_bid_price', 'level_9_bid_quantity', 'level_9_bid_orders',
               'level_9_bid_price', 'level_8_bid_quantity', 'level_8_bid_orders',
               'level_8_bid_price', 'level_7_bid_quantity', 'level_7_bid_orders',
               'level_7_bid_price', 'level_6_bid_quantity', 'level_6_bid_orders',
               'level_6_bid_price', 'level_5_bid_quantity', 'level_5_bid_orders',
               'level_5_bid_price', 'level_4_bid_quantity', 'level_4_bid_orders',
               'level_4_bid_price', 'level_3_bid_quantity', 'level_3_bid_orders',
               'level_3_bid_price', 'level_2_bid_quantity', 'level_2_bid_orders',
               'level_2_bid_price', 'level_1_bid_quantity', 'level_1_bid_orders',
               'level_1_bid_price', 'level_1_ask_price', 'level_1_ask_orders',
               'level_1_ask_quantity', 'level_2_ask_price', 'level_2_ask_orders',
               'level_2_ask_quantity', 'level_3_ask_price', 'level_3_ask_orders',
               'level_3_ask_quantity', 'level_4_ask_price', 'level_4_ask_orders',
               'level_4_ask_quantity', 'level_5_ask_price', 'level_5_ask_orders',
               'level_5_ask_quantity', 'level_6_ask_price', 'level_6_ask_orders',
               'level_6_ask_quantity', 'level_7_ask_price', 'level_7_ask_orders',
               'level_7_ask_quantity', 'level_8_ask_price', 'level_8_ask_orders',
               'level_8_ask_quantity', 'level_9_ask_price', 'level_9_ask_orders',
               'level_9_ask_quantity', 'level_10_ask_price', 'level_10_ask_orders',
               'level_10_ask_quantity']

   # Keep only above columns
   df20 = df20[col_names]

   # Rename the 'old_name' column to 'new_name'
   df20 = df20.rename(columns={'transactTime': 'transacttime'})
   
   # Extract specific characters in the string containing the time
   df20['transacttime'] = df20['transacttime'].apply(lambda x: x[:14])
   
   # Copying into another column
   df20['date'] = df20['transacttime']

   # Extracting date from new column created
   df20['date'] = pd.to_datetime(df20['date']).dt.date
   
   # Extracting time from new column created
   df20['transacttime'] = pd.to_datetime(df20['transacttime']).dt.time

   # Insert the date column at index 0
   df20.insert(0, 'date', df20.pop('date'))

   # t1 is a dataframe which has all the rows of data of df30 where level1bidprice != null
   df20 = df20[df20['level_1_bid_price'].notna()]

   # Groups the df to every second using the transacttime column
   t = df20.groupby('transacttime', as_index=False)

   # Selecting 1st row of group by result
   df20 = t.nth(0)

   # Delete the temp df
   del t
   
   # Convert column to datetime
   df20['date'] = pd.to_datetime(df20['date'])

   # define a function to combine the date and time columns into a datetime object
   def combine_date_time(row):
      my_date = row['date'].date()
      my_time = row['transacttime']
      my_datetime = datetime.combine(my_date, my_time)
      return my_datetime

   # apply the function to the 'date' and 'time' columns to create a new 'datetime' column
   df20['datetime'] = np.nan
   df20['datetime'] = df20[['date', 'transacttime']].apply(combine_date_time, axis=1)

   # Insert the date column at index 0
   df20.insert(0, 'datetime', df20.pop('datetime'))

   # Return df
   return df20

In [7]:
# preprocess function for factors data
# liang completed this Function and he uploaded the 
def preprocess_factors(factors_file):

    # Extract only required columns
    factors_file = factors_file['']

    # Extract and create date and time in our required format

# # Preprocess function for orderbook data
# def preprocess(df2):
#    df20 = df2.copy()

#    col_names = ['Trading Hours', 'Timestamp',
#                 'Net Buy', 'Mean Reversion Lag', 'Level 1 Order Flow Imbalance',
#                 'Level 2 Order Flow Imbalance', 'Level 3 Order Flow Imbalance',
#                 'Level 4 Order Flow Imbalance', 'Level 5 Order Flow Imbalance',
#                 'Level 6 Order Flow Imbalance', 'Level 7 Order Flow Imbalance',
#                 'Level 8 Order Flow Imbalance', 'Level 9 Order Flow Imbalance',
#                 'Level 10 Order Flow Imbalance'],

#    df20 = df20[col_names]

#    # Rename the 'old_name' column to 'new_name'
#    df20 = df20.rename(columns={'Timestamp': 'transacttime'})
#    df20['transacttime'] = df20['transacttime'].apply(lambda x: x[:14])

#    df20['date'] = df20['transacttime']

#    # Extracting date and time from datetime
#    df20['date'] = pd.to_datetime(df20['date']).dt.date
#    df20['transacttime'] = pd.to_datetime(df20['transacttime']).dt.time

#    # Insert the date column at index 0
#    df20.insert(0, 'date', df20.pop('date'))

#    # t1 is a dataframe which has all the rows of data of df30 where level1bidprice != null
#    df20 = df20[df20['level_1_bid_price'].notna()]

#    # Groups the df to every second using the transacttime column
#    t = df20.groupby('transacttime', as_index=False)

#    # Selecting 1st row of group by result
#    df20 = t.nth(0)

#    del t
   
#    df20['date'] = pd.to_datetime(df20['date'])

#    # define a function to combine the date and time columns into a datetime object
#    def combine_date_time(row):
#       my_date = row['date'].date()
#       my_time = row['transacttime']
#       my_datetime = datetime.combine(my_date, my_time)
#       return my_datetime


#    # apply the function to the 'date' and 'time' columns to create a new 'datetime' column
#    df20['datetime'] = np.nan
#    df20['datetime'] = df20[['date', 'transacttime']].apply(combine_date_time, axis=1)

#    # Insert the date column at index 0
#    df20.insert(0, 'datetime', df20.pop('datetime'))

#    return df20



   #  return factors_file

In [283]:
# Ignore cell

# obt['date'][0]

In [5]:
# Resamples data taking interval as 1S and forward fills the new rows
def forwardfill(df10): # takes df as input

  # Create a copy
  df20 = df10.copy()

  # Set the 'transacttime' column as the index
  df20 = df20.set_index('datetime')

  # Resample time series data and forward fill
  df20 = df20.resample('1S').ffill()

  # Reset the index
  df20 = df20.reset_index(inplace=False)

  # Extract date
  df20['date'] = df20['datetime'].dt.date

  # Extract time
  df20['transacttime'] = df20['datetime'].dt.time

  # Return df
  return df20

In [285]:
# Ignore cell

# obt

In [286]:
# Ignore cell

# # Declrate which date you want looking at the dates above
# # date = date_range_df['date'][0]
# # date
# # single_date_ob = obt[obt['date'] == date]
# # single_date_ob
# # obt1 = forwardfill(obt)
# # obt1

# unique_dates = pd.to_datetime(obt['date'].unique()).tolist()
# # unique_dates = unique_dates[1:]
# for date in unique_dates:
# # list of dates in that ob file
#     specific_date = date
#     single_date_ob = obt[obt['date'] == specific_date]
#     single_date_ob = forwardfill(single_date_ob)
    

# single_date_ob

In [287]:
# Ignore cell

# single_date_ob = forwardfill(single_date_ob)
# single_date_ob

In [288]:
# Ignore cell

# 16*60*60

In [289]:
# Ignore cell

# # Check if rows in the datetime column are in increments of 1 second
# mask = (single_date_ob['datetime'] - single_date_ob['datetime'].shift(1)).dt.seconds != 1

# # Get the rows where the datetime column is not in increments of 1 second
# result = obt[mask]
# result

In [290]:
# Ignore cell

# unique_time = single_date_ob['transacttime'].unique().tolist()

# len(unique_time)

In [291]:
# Ignore cell

# # find duplicates in the 'datetime' column
# duplicates = single_date_ob[single_date_ob.duplicated(['datetime'], keep=False)]

# # print the duplicates
# if not duplicates.empty:
#     print("Duplicates found in the 'datetime' column:")
#     print(duplicates)
# else:
#     print("No duplicates found in the 'datetime' column.")

In [292]:
# Ignore cell

# 17*60*60

In [6]:
# cycle through all the orderbook files, format and concat into single df
def master_concat():
  # Counter
  count = 0
  # For loop to iterate through all the orderbook files
  for i in range(len(file_names['ORDERBOOK'])):

    # Read file
    ob = readparquet(path+"/"+str(file_names['ORDERBOOK'][i]))

    # Preprocess the df
    ob = preprocess(ob)
    
    # Extract all the unique dates present in the df to a list
    unique_dates = pd.to_datetime(ob['date'].unique()).tolist()
    
    # For each date in the list of unique_dates in that ob file
    for date in unique_dates:

      # Assign the date to a variable
      specific_date = date
      
      # Extract the df with the single date from the list
      single_date_ob = ob[pd.to_datetime(ob['date']) == specific_date]
      
      # Print the date
      print("Date: "+str(specific_date)+"\n")
      
      # If each date has more than 10000 rows, ...
      if len(single_date_ob)>10000:
        
        # forward fill the df
        single_date_ob_fwdfill = forwardfill(single_date_ob)
        
        # If counter == 0 aka this is the first date, we create a df to store single_date_ob_fwdfill
        if count == 0:

          # Create a new df called obf
          obf = single_date_ob

          # Create a new df called ob_fwdfill
          ob_fwdfill = single_date_ob_fwdfill

          # Increment the counter
          count=count+1
        
        # If counter not 0, this is not the first date, we keep appending single_date_ob_fwdfill to ob_fwdfill
        else:

          # Concat single_date_ob to obf
          obf = pd.concat([obf, single_date_ob])

          # Concat single_date_ob_fwdfill to ob_fwdfill
          ob_fwdfill = pd.concat([ob_fwdfill, single_date_ob_fwdfill])

          # Increment counter
          count=count+1

        # Print statements to view progress of loop
        print("Calculations on "+str(specific_date)+" completed. Moving to the next date.\n")
      
      # If each date does not have more than 10000 rows, ...
      else:

        # Print statements to view progress of loop
        print("This date doesn't have enough data, so we omit it. Moving to the next date...\n")
    
    # Print statements to view progress of loop
    c = len(file_names['ORDERBOOK'])-(i+1)
    print(f"{c} more files left..\n")
  
  # Reset index which removes the extra column created during the whole process
  obf = obf.reset_index(drop=True)
  ob_fwdfill = ob_fwdfill.reset_index(drop=True)
  
  # Print statements to view progress of loop
  print("Done working on all files.\n")
  
  # Return df which is not forward filled, and df which is forward filled
  # These data frames contain all the dates combined in a single df
  return obf, ob_fwdfill

# Takes in df and list of column names to be retained and returns df with renamed column and return df with renamed column
def rename_column(df, col_names):
    """
    Renames columns in a dataframe.

    Args:
        df (pandas.DataFrame): Input dataframe.
        col_names (list): List of tuples containing existing and new column names.

    Returns:
        pandas.DataFrame: Dataframe with renamed columns.
    """
    df = df.rename(columns=dict(col_names))
    return df


In [294]:
# Running this cell would use many of the functions above and generate ob_changes and ob_fwdfill.
# ob_changes contains
# ob_fwdfill
# ob_changes, ob_fwdfill = master_concat()
# ob_fwdfill = master_concat_fwdfill()
# obt

# Write ob_changes and ob_fwdfill to CSV files
# ob_changes.to_csv('Data/ob_changes.csv', index=False)
# ob_fwdfill.to_csv('Data/ob_fwdfill.csv', index=False)

In [7]:
# Read the csv files from the directory since we already generated the files we need
ob_changes = readcsv('Data/ob_changes.csv')
ob_fwdfill = readcsv('Data/ob_fwdfill.csv')

# ob_factors has the factors calculated for every row added to ob_fwdfill
# We use this df throughout the notebook from this point onwards
ob_factors = readcsv('Data/final_merged_data.csv')

In [8]:
# List pairs of old column names and new columns names
col_names = [('Net Buy', 'netbuy'),('Mean Reversion Lag', 'mean_rev_lag'),
             ('Level 1 Order Flow Imbalance','level1ofi'),
                ('Level 2 Order Flow Imbalance','level2ofi'),
                ('Level 3 Order Flow Imbalance', 'level3ofi'),
                ('Level 4 Order Flow Imbalance', 'level4ofi'),
                ('Level 5 Order Flow Imbalance','level5ofi'),
                ('Level 6 Order Flow Imbalance','level6ofi'),
                ('Level 7 Order Flow Imbalance', 'level7ofi'),
                ('Level 8 Order Flow Imbalance', 'level8ofi'),
                ('Level 9 Order Flow Imbalance','level9ofi'),
                ('Level 10 Order Flow Imbalance','level10ofi')]

# We rename the columns in ob_factors using col_names
ob_factors = rename_column(ob_factors, col_names)

In [9]:
# View information on ob_factors
# ob_factors.info()
ob_factors.columns

Index(['Unnamed: 0', 'datetime', 'date', 'transacttime',
       'level_10_bid_quantity', 'level_10_bid_orders', 'level_10_bid_price',
       'level_9_bid_quantity', 'level_9_bid_orders', 'level_9_bid_price',
       'level_8_bid_quantity', 'level_8_bid_orders', 'level_8_bid_price',
       'level_7_bid_quantity', 'level_7_bid_orders', 'level_7_bid_price',
       'level_6_bid_quantity', 'level_6_bid_orders', 'level_6_bid_price',
       'level_5_bid_quantity', 'level_5_bid_orders', 'level_5_bid_price',
       'level_4_bid_quantity', 'level_4_bid_orders', 'level_4_bid_price',
       'level_3_bid_quantity', 'level_3_bid_orders', 'level_3_bid_price',
       'level_2_bid_quantity', 'level_2_bid_orders', 'level_2_bid_price',
       'level_1_bid_quantity', 'level_1_bid_orders', 'level_1_bid_price',
       'level_1_ask_price', 'level_1_ask_orders', 'level_1_ask_quantity',
       'level_2_ask_price', 'level_2_ask_orders', 'level_2_ask_quantity',
       'level_3_ask_price', 'level_3_ask_orders', 'l

In [10]:
# View the ob_factors df
ob_factors

Unnamed: 0.1,Unnamed: 0,datetime,date,transacttime,level_10_bid_quantity,level_10_bid_orders,level_10_bid_price,level_9_bid_quantity,level_9_bid_orders,level_9_bid_price,level_8_bid_quantity,level_8_bid_orders,level_8_bid_price,level_7_bid_quantity,level_7_bid_orders,level_7_bid_price,level_6_bid_quantity,level_6_bid_orders,level_6_bid_price,level_5_bid_quantity,level_5_bid_orders,level_5_bid_price,level_4_bid_quantity,level_4_bid_orders,level_4_bid_price,level_3_bid_quantity,level_3_bid_orders,level_3_bid_price,level_2_bid_quantity,level_2_bid_orders,level_2_bid_price,level_1_bid_quantity,level_1_bid_orders,level_1_bid_price,level_1_ask_price,level_1_ask_orders,level_1_ask_quantity,level_2_ask_price,level_2_ask_orders,level_2_ask_quantity,level_3_ask_price,level_3_ask_orders,level_3_ask_quantity,level_4_ask_price,level_4_ask_orders,level_4_ask_quantity,level_5_ask_price,level_5_ask_orders,level_5_ask_quantity,level_6_ask_price,level_6_ask_orders,level_6_ask_quantity,level_7_ask_price,level_7_ask_orders,level_7_ask_quantity,level_8_ask_price,level_8_ask_orders,level_8_ask_quantity,level_9_ask_price,level_9_ask_orders,level_9_ask_quantity,level_10_ask_price,level_10_ask_orders,level_10_ask_quantity,netbuy,mean_rev_lag,level1ofi,level2ofi,level3ofi,level4ofi,level5ofi,level6ofi,level7ofi,level8ofi,level9ofi,level10ofi,midprice,delp,prev_midprice
0,0,2020-01-28 17:00:00,2020-01-28,17:00:00,2103.0,63.0,130.359375,3535.0,66.0,130.375000,1806.0,67.0,130.390625,2933.0,64.0,130.406250,1708.0,59.0,130.421875,1297.0,49.0,130.437500,1150.0,34.0,130.453125,449.0,24.0,130.468750,377.0,16.0,130.484375,344.0,11.0,130.500000,130.515625,13.0,257.0,130.531250,26.0,499.0,130.546875,27.0,621.0,130.562500,68.0,1198.0,130.578125,80.0,1671.0,130.593750,80.0,1837.0,130.609375,73.0,1788.0,130.625000,73.0,1864.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,66.0,,271.0,87.0,238.0,77.0,8.0,-22.0,20.0,14.0,16.0,0.0,130.507812,,
1,1,2020-01-28 17:00:01,2020-01-28,17:00:01,2103.0,63.0,130.359375,3551.0,68.0,130.375000,1820.0,69.0,130.390625,2953.0,69.0,130.406250,1719.0,61.0,130.421875,1305.0,50.0,130.437500,1241.0,45.0,130.453125,910.0,34.0,130.468750,588.0,21.0,130.484375,516.0,17.0,130.500000,130.515625,8.0,158.0,130.531250,30.0,623.0,130.546875,43.0,844.0,130.562500,45.0,1212.0,130.578125,80.0,1671.0,130.593750,89.0,1860.0,130.609375,73.0,1788.0,130.625000,73.0,1864.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,174.0,0.000000e+00,148.0,90.0,222.0,349.0,333.0,108.0,2.0,-31.0,0.0,16.0,130.507812,0.000000,130.507812
2,2,2020-01-28 17:00:02,2020-01-28,17:00:02,2119.0,65.0,130.359375,3551.0,68.0,130.375000,1820.0,69.0,130.390625,2986.0,74.0,130.406250,1742.0,65.0,130.421875,1359.0,53.0,130.437500,1296.0,49.0,130.453125,985.0,39.0,130.468750,661.0,25.0,130.484375,529.0,21.0,130.500000,130.515625,6.0,21.0,130.531250,34.0,683.0,130.546875,45.0,876.0,130.562500,46.0,1127.0,130.578125,78.0,1424.0,130.593750,79.0,1785.0,130.609375,77.0,1819.0,130.625000,77.0,1895.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,64.0,0.000000e+00,130.0,1418.0,1956.0,2389.0,2893.0,3339.0,3587.0,4879.0,3628.0,5840.0,130.507812,0.000000,130.507812
3,3,2020-01-28 17:00:03,2020-01-28,17:00:03,3551.0,68.0,130.375000,1820.0,69.0,130.390625,2984.0,73.0,130.406250,1763.0,67.0,130.421875,1534.0,60.0,130.437500,1337.0,51.0,130.453125,1256.0,50.0,130.468750,1044.0,40.0,130.484375,583.0,30.0,130.500000,122.0,12.0,130.515625,130.531250,43.0,698.0,130.546875,44.0,846.0,130.562500,48.0,1115.0,130.578125,78.0,1381.0,130.593750,75.0,1650.0,130.609375,77.0,1819.0,130.625000,72.0,1892.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,130.671875,62.0,2236.0,-4.0,8.000000e-01,539.0,102.0,-38.0,-15.0,2.0,37.0,-1.0,3.0,3.0,0.0,130.523438,0.015625,130.507812
4,4,2020-01-28 17:00:04,2020-01-28,17:00:04,3569.0,70.0,130.375000,1839.0,72.0,130.390625,3003.0,76.0,130.406250,1782.0,70.0,130.421875,1584.0,67.0,130.437500,1356.0,54.0,130.453125,1261.0,51.0,130.468750,1026.0,39.0,130.484375,708.0,33.0,130.500000,662.0,14.0,130.515625,130.531250,44.0,695.0,130.546875,49.0,869.0,130.562500,51.0,1135.0,130.578125,81.0,1401.0,130.593750,78.0,1669.0,130.609375,74.0,1832.0,130.625000,74.0,1910.0,130.640625,79.0,1824.0,130.656250,87.0,2254.0,130.671875,64.0,2254.0,0.0,6.400000e-01,-29.0,10.0,-497.0,1.0,-25.0,6.0,0.0,0.0,0.0,0.0,130.523438,0.000000,130.523438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199795,1199795,2021-11-03 15:59:56,2021-11-03,15:59:56,2276.0,93.0,130.453125,2162.0,113.0,130.468750,2063.0,128.0,130.484375,2004.0,104.0,130.500000,1797.0,108.0,130.515625,1987.0,108.0,130.531250,1688.0,106.0,130.546875,1827.0,66.0,130.562500,1653.0,80.0,130.578125,323.0,21.0,130.593750,130.609375,32.0,843.0,130.625000,43.0,1194.0,130.640625,51.0,1425.0,130.656250,132.0,1480.0,130.671875,128.0,1720.0,130.687500,106.0,1750.0,130.703125,100.0,1905.0,130.718750,96.0,1906.0,130.734375,104.0,2032.0,130.750000,101.0,2216.0,-30.0,-5.968559e-13,-121.0,-50.0,-29.0,0.0,-78.0,1.0,14.0,-4.0,8.0,2.0,130.601562,0.000000,130.601562
1199796,1199796,2021-11-03 15:59:57,2021-11-03,15:59:57,2201.0,92.0,130.453125,2087.0,112.0,130.468750,1984.0,123.0,130.484375,1927.0,101.0,130.500000,1708.0,102.0,130.515625,1893.0,102.0,130.531250,1672.0,100.0,130.546875,1786.0,60.0,130.562500,1606.0,68.0,130.578125,251.0,16.0,130.593750,130.609375,29.0,894.0,130.625000,41.0,1202.0,130.640625,49.0,1413.0,130.656250,126.0,1464.0,130.671875,121.0,1704.0,130.687500,101.0,1660.0,130.703125,92.0,1814.0,130.718750,95.0,1831.0,130.734375,95.0,1949.0,130.750000,98.0,2139.0,0.0,-4.760636e-13,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.601562,0.000000,130.601562
1199797,1199797,2021-11-03 15:59:58,2021-11-03,15:59:58,2201.0,92.0,130.453125,2087.0,112.0,130.468750,1984.0,123.0,130.484375,1927.0,101.0,130.500000,1708.0,102.0,130.515625,1893.0,102.0,130.531250,1672.0,100.0,130.546875,1786.0,60.0,130.562500,1602.0,67.0,130.578125,250.0,15.0,130.593750,130.609375,29.0,894.0,130.625000,41.0,1202.0,130.640625,49.0,1413.0,130.656250,126.0,1464.0,130.671875,121.0,1704.0,130.687500,101.0,1660.0,130.703125,92.0,1814.0,130.718750,95.0,1831.0,130.734375,95.0,1949.0,130.750000,98.0,2139.0,100.0,-3.801404e-13,-21.0,-3.0,26.0,0.0,0.0,0.0,-5.0,0.0,0.0,-1.0,130.601562,0.000000,130.601562
1199798,1199798,2021-11-03 15:59:59,2021-11-03,15:59:59,2180.0,90.0,130.453125,2067.0,111.0,130.468750,1952.0,119.0,130.484375,1890.0,96.0,130.500000,1676.0,98.0,130.515625,1861.0,97.0,130.531250,1640.0,96.0,130.546875,1772.0,57.0,130.562500,1570.0,61.0,130.578125,105.0,11.0,130.593750,130.609375,30.0,770.0,130.625000,37.0,1170.0,130.640625,45.0,1373.0,130.656250,122.0,1432.0,130.671875,117.0,1672.0,130.687500,97.0,1628.0,130.703125,88.0,1782.0,130.718750,91.0,1799.0,130.734375,94.0,1929.0,130.750000,97.0,2119.0,0.0,-3.055334e-13,72.0,-76.0,-18.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,130.601562,0.000000,130.601562


In [299]:
# Ignore cell

# View ob_changes
# ob_changes

In [300]:
# Ignore cell

# View ob_fwdfill
# ob_fwdfill

# View 0th element in datetime column in ob_fwdfill
# ob_fwdfill['datetime'][0]

In [301]:
# Ignore cell

# unique_times = obt['transacttime'].unique()
# unique_times

# unique_dates = ob_fwdfill['date'].unique()
# unique_dates[0]

In [11]:
# Function which returns a df which contains information on the unique dates and counts of rows for each date
def get_date_range(df3):
  
  # Create a copy
  df = df3.copy()

  # Find all the unique dates in the date column
  unique_dates = df['date'].unique()

  # Initialize variables
  date_range = []
  date_range_dict = []

  # For unique date
  for date in unique_dates:

    # Extract the df which contains the unique date
    date_ob = df[df['date'] == date]
    
    # Reset index becuse we need it to start from 0
    date_ob = date_ob.reset_index(inplace=False)
    
    # Extract start time
    min_time = date_ob['transacttime'].min()
    
    # Extract end time
    max_time = date_ob['transacttime'].max()
    
    # Extract list of unique transacttimes
    unique_times = date_ob['transacttime'].unique()

    # Append a dict containing the date, start and end times for that date and the total # of rows in the dataframe
    date_range.append({'date': date, 'min_time': min_time, 'max_time': max_time, 'no_of_ob_updates': len(unique_times)})
    
    # get the unique transaction times for the current date as a list
    # unique_times2 = date_ob['transacttime'].unique().tolist()

    # create a dictionary for the current date and add it to the date_range list
    # date_range_dict.append({'date': date, 'min_time': min_time, 'max_time': max_time, 'no_of_ob_updates': len(unique_times)})

  # convert the date range to a df
  date_range_df = pd.DataFrame(date_range)

  # Return the date_range_df we created
  return date_range_df#, date_range_dict


In [303]:
# Ignore cell
# obt

In [304]:
# Ignore cell
# date_range_df['unique_times'][0]

In [19]:
# We don't run this cell but reaad through for better understanding of code

# We use the get_date_range function to view a summary of the large dataframes we have.
# Summarizing it helps us take decisions on how to split our data between training testing and validation.

# Call the function and generate the df
# date_range_ob_changes_df = get_date_range(ob_changes)
# date_range_ob_fwdfill_df = get_date_range(ob_fwdfill)
date_range_ob_factors_df = get_date_range(ob_factors)

# View the summary dataframe we created
# date_range_ob_changes
# date_range_ob_fwdfill
# date_range_ob_factors_df

# We write the generated summary dfs to the current directory as CSV files 
# Write to a csv file
# date_range_ob_changes_df.to_csv('Data/date_range_ob_changes_df.csv', index=False)
# date_range_ob_fwdfill_df.to_csv('Data/date_range_ob_fwdfill_df.csv', index=False)
# date_range_ob_factors_df.to_csv('Data/date_range_ob_factors_df.csv', index=False)

# This is how we access the unique time if we had not generated the summary as a dataframe.
# In the dicitoonary, the unique_times key had a list as its value which contained all the unique times of that date in the ob_factors df we are using
# We don't add that key and value to the summary df because each element in a df cannot be a list
# date_range_ob_changes['unique_times'][0][0]


In [15]:
# Read the summary CSV files we already generated
# date_range_ob_changes_df = readcsv('Data/date_range_ob_changes_df.csv')
# date_range_ob_fwdfill_df = readcsv('Data/date_range_ob_fwdfill_df.csv')
date_range_ob_factors_df = readcsv('Data/date_range_ob_factors_df.csv')
# Did i use the date_range_ob_fwdfill_df to access ob_factors???? 
# If i did do I need to change?

# View
# date_range_ob_changes_df
# date_range_ob_fwdfill_df
date_range_ob_factors_df

Unnamed: 0,date,min_time,max_time,no_of_ob_updates
0,2020-01-28,17:00:00,23:59:59,25200
1,2020-01-29,00:00:00,16:00:00,57601
2,2020-03-17,17:00:00,23:59:57,25198
3,2020-03-18,00:00:00,16:00:00,57601
4,2020-04-29,00:00:00,16:00:00,57601
5,2020-06-09,17:00:00,23:59:59,25200
6,2020-06-10,00:00:00,16:00:00,57601
7,2020-07-28,17:00:00,23:59:57,25198
8,2020-07-29,00:00:00,16:00:00,57601
9,2020-08-25,17:00:00,23:59:59,25200


In [16]:
# Function which returns a slice of the big df we use with the start and end date and times we provide
def window_slicer(df, start_datetime, end_datetime):

    # Create a copy
    t1 = df.copy()
    
    # filter the df
    t1 = t1.loc[(t1['datetime'] >= start_datetime) & (t1['datetime'] < end_datetime)]

    # Drop the extra column created in front
    t1 = t1.drop(t1.columns[0], axis=1)

    # Return the sliced dataframe
    return t1 

In [17]:
# Select time window of 2 hours. Example '2020-01-28 16:45:23'
start_time = '2020-01-28 17:00:00'
end_time = '2020-01-28 19:00:00'

# Call window slicer function
sliced_data = window_slicer(ob_factors, start_time, end_time)

# Reset index because a new column is created in the front
sliced_data = sliced_data.reset_index(drop=True)
# sliced_data = sliced_data.drop(sliced_data.columns[0], axis=1)

# View sliced df
sliced_data

# View 0th element of datetime column
# sliced_data['datetime'][0]

Unnamed: 0,datetime,date,transacttime,level_10_bid_quantity,level_10_bid_orders,level_10_bid_price,level_9_bid_quantity,level_9_bid_orders,level_9_bid_price,level_8_bid_quantity,level_8_bid_orders,level_8_bid_price,level_7_bid_quantity,level_7_bid_orders,level_7_bid_price,level_6_bid_quantity,level_6_bid_orders,level_6_bid_price,level_5_bid_quantity,level_5_bid_orders,level_5_bid_price,level_4_bid_quantity,level_4_bid_orders,level_4_bid_price,level_3_bid_quantity,level_3_bid_orders,level_3_bid_price,level_2_bid_quantity,level_2_bid_orders,level_2_bid_price,level_1_bid_quantity,level_1_bid_orders,level_1_bid_price,level_1_ask_price,level_1_ask_orders,level_1_ask_quantity,level_2_ask_price,level_2_ask_orders,level_2_ask_quantity,level_3_ask_price,level_3_ask_orders,level_3_ask_quantity,level_4_ask_price,level_4_ask_orders,level_4_ask_quantity,level_5_ask_price,level_5_ask_orders,level_5_ask_quantity,level_6_ask_price,level_6_ask_orders,level_6_ask_quantity,level_7_ask_price,level_7_ask_orders,level_7_ask_quantity,level_8_ask_price,level_8_ask_orders,level_8_ask_quantity,level_9_ask_price,level_9_ask_orders,level_9_ask_quantity,level_10_ask_price,level_10_ask_orders,level_10_ask_quantity,netbuy,mean_rev_lag,level1ofi,level2ofi,level3ofi,level4ofi,level5ofi,level6ofi,level7ofi,level8ofi,level9ofi,level10ofi,midprice,delp,prev_midprice
0,2020-01-28 17:00:00,2020-01-28,17:00:00,2103.0,63.0,130.359375,3535.0,66.0,130.375000,1806.0,67.0,130.390625,2933.0,64.0,130.406250,1708.0,59.0,130.421875,1297.0,49.0,130.437500,1150.0,34.0,130.453125,449.0,24.0,130.468750,377.0,16.0,130.484375,344.0,11.0,130.500000,130.515625,13.0,257.0,130.531250,26.0,499.0,130.546875,27.0,621.0,130.562500,68.0,1198.0,130.578125,80.0,1671.0,130.593750,80.0,1837.0,130.609375,73.0,1788.0,130.625000,73.0,1864.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,66.0,,271.0,87.0,238.0,77.0,8.0,-22.0,20.0,14.0,16.0,0.0,130.507812,,
1,2020-01-28 17:00:01,2020-01-28,17:00:01,2103.0,63.0,130.359375,3551.0,68.0,130.375000,1820.0,69.0,130.390625,2953.0,69.0,130.406250,1719.0,61.0,130.421875,1305.0,50.0,130.437500,1241.0,45.0,130.453125,910.0,34.0,130.468750,588.0,21.0,130.484375,516.0,17.0,130.500000,130.515625,8.0,158.0,130.531250,30.0,623.0,130.546875,43.0,844.0,130.562500,45.0,1212.0,130.578125,80.0,1671.0,130.593750,89.0,1860.0,130.609375,73.0,1788.0,130.625000,73.0,1864.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,174.0,0.000000,148.0,90.0,222.0,349.0,333.0,108.0,2.0,-31.0,0.0,16.0,130.507812,0.000000,130.507812
2,2020-01-28 17:00:02,2020-01-28,17:00:02,2119.0,65.0,130.359375,3551.0,68.0,130.375000,1820.0,69.0,130.390625,2986.0,74.0,130.406250,1742.0,65.0,130.421875,1359.0,53.0,130.437500,1296.0,49.0,130.453125,985.0,39.0,130.468750,661.0,25.0,130.484375,529.0,21.0,130.500000,130.515625,6.0,21.0,130.531250,34.0,683.0,130.546875,45.0,876.0,130.562500,46.0,1127.0,130.578125,78.0,1424.0,130.593750,79.0,1785.0,130.609375,77.0,1819.0,130.625000,77.0,1895.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,64.0,0.000000,130.0,1418.0,1956.0,2389.0,2893.0,3339.0,3587.0,4879.0,3628.0,5840.0,130.507812,0.000000,130.507812
3,2020-01-28 17:00:03,2020-01-28,17:00:03,3551.0,68.0,130.375000,1820.0,69.0,130.390625,2984.0,73.0,130.406250,1763.0,67.0,130.421875,1534.0,60.0,130.437500,1337.0,51.0,130.453125,1256.0,50.0,130.468750,1044.0,40.0,130.484375,583.0,30.0,130.500000,122.0,12.0,130.515625,130.531250,43.0,698.0,130.546875,44.0,846.0,130.562500,48.0,1115.0,130.578125,78.0,1381.0,130.593750,75.0,1650.0,130.609375,77.0,1819.0,130.625000,72.0,1892.0,130.640625,79.0,1808.0,130.656250,87.0,2238.0,130.671875,62.0,2236.0,-4.0,0.800000,539.0,102.0,-38.0,-15.0,2.0,37.0,-1.0,3.0,3.0,0.0,130.523438,0.015625,130.507812
4,2020-01-28 17:00:04,2020-01-28,17:00:04,3569.0,70.0,130.375000,1839.0,72.0,130.390625,3003.0,76.0,130.406250,1782.0,70.0,130.421875,1584.0,67.0,130.437500,1356.0,54.0,130.453125,1261.0,51.0,130.468750,1026.0,39.0,130.484375,708.0,33.0,130.500000,662.0,14.0,130.515625,130.531250,44.0,695.0,130.546875,49.0,869.0,130.562500,51.0,1135.0,130.578125,81.0,1401.0,130.593750,78.0,1669.0,130.609375,74.0,1832.0,130.625000,74.0,1910.0,130.640625,79.0,1824.0,130.656250,87.0,2254.0,130.671875,64.0,2254.0,0.0,0.640000,-29.0,10.0,-497.0,1.0,-25.0,6.0,0.0,0.0,0.0,0.0,130.523438,0.000000,130.523438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,2020-01-28 18:59:55,2020-01-28,18:59:55,3122.0,89.0,130.406250,1973.0,90.0,130.421875,1904.0,89.0,130.437500,1865.0,94.0,130.453125,1879.0,86.0,130.468750,1865.0,99.0,130.484375,2151.0,143.0,130.500000,2223.0,119.0,130.515625,1870.0,104.0,130.531250,1349.0,73.0,130.546875,130.562500,29.0,274.0,130.578125,100.0,1220.0,130.593750,124.0,2541.0,130.609375,134.0,1857.0,130.625000,124.0,2445.0,130.640625,103.0,2120.0,130.656250,115.0,2623.0,130.671875,98.0,2510.0,130.687500,114.0,2411.0,130.703125,91.0,1907.0,0.0,-0.167772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.554688,0.000000,130.554688
7196,2020-01-28 18:59:56,2020-01-28,18:59:56,3122.0,89.0,130.406250,1973.0,90.0,130.421875,1904.0,89.0,130.437500,1865.0,94.0,130.453125,1879.0,86.0,130.468750,1865.0,99.0,130.484375,2151.0,143.0,130.500000,2223.0,119.0,130.515625,1870.0,104.0,130.531250,1349.0,73.0,130.546875,130.562500,29.0,274.0,130.578125,100.0,1220.0,130.593750,124.0,2541.0,130.609375,134.0,1857.0,130.625000,124.0,2445.0,130.640625,103.0,2120.0,130.656250,115.0,2623.0,130.671875,98.0,2510.0,130.687500,114.0,2411.0,130.703125,91.0,1907.0,0.0,-0.134218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.554688,0.000000,130.554688
7197,2020-01-28 18:59:57,2020-01-28,18:59:57,3122.0,89.0,130.406250,1973.0,90.0,130.421875,1904.0,89.0,130.437500,1865.0,94.0,130.453125,1879.0,86.0,130.468750,1865.0,99.0,130.484375,2151.0,143.0,130.500000,2223.0,119.0,130.515625,1870.0,104.0,130.531250,1349.0,73.0,130.546875,130.562500,29.0,274.0,130.578125,100.0,1220.0,130.593750,124.0,2541.0,130.609375,134.0,1857.0,130.625000,124.0,2445.0,130.640625,103.0,2120.0,130.656250,115.0,2623.0,130.671875,98.0,2510.0,130.687500,114.0,2411.0,130.703125,91.0,1907.0,0.0,-0.107374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.554688,0.000000,130.554688
7198,2020-01-28 18:59:58,2020-01-28,18:59:58,3122.0,89.0,130.406250,1973.0,90.0,130.421875,1904.0,89.0,130.437500,1865.0,94.0,130.453125,1879.0,86.0,130.468750,1865.0,99.0,130.484375,2151.0,143.0,130.500000,2223.0,119.0,130.515625,1870.0,104.0,130.531250,1349.0,73.0,130.546875,130.562500,29.0,274.0,130.578125,100.0,1220.0,130.593750,124.0,2541.0,130.609375,134.0,1857.0,130.625000,124.0,2445.0,130.640625,103.0,2120.0,130.656250,115.0,2623.0,130.671875,98.0,2510.0,130.687500,114.0,2411.0,130.703125,91.0,1907.0,0.0,-0.085899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.554688,0.000000,130.554688


In [20]:
# Extract the snapshot from the orderbook and send as an ordered dict
def send_order_book_snapshot(filtered_df, row_number, levels):
    """
    This function creates an order book snapshot for a given row number in a DataFrame.
    The snapshot includes the top N levels of the order book on both the bid and ask sides.
    The snapshot is returned as a dictionary with the following keys:
    - 'bids': a list of bid levels, where each level is a dictionary with the keys 'price', 'quantity', and 'orders'
    - 'asks': a list of ask levels, where each level is a dictionary with the keys 'price', 'quantity', and 'orders'
    - 'transacttime': the transaction time for the snapshot

    :param order_book: the DataFrame containing the order book data
    :param row_number: the row number for which to create the snapshot
    :param levels: the number of levels to include in the snapshot
    :param transacttime: the transaction time for the snapshot
    :return: a dictionary containing the order book snapshot
    """
    # order_book = filtered_df.copy()
    # Initialize lists for the bids and asks
    bids = []
    asks = []
    factors = {}
    transacttime = filtered_df['transacttime'][row_number]
    
    # Iterate over the desired number of levels and extract the bid and ask data
    for i in range(1, levels+1):
        # Extract the bid data for the current level and append it to the bids list
        bid = {
            'price': filtered_df[f'level_{i}_bid_price'].iloc[row_number],
            'quantity': filtered_df[f'level_{i}_bid_quantity'].iloc[row_number],
            'orders': filtered_df[f'level_{i}_bid_orders'].iloc[row_number]
        }
        bids.append(bid)

        # Extract the ask data for the current level and append it to the asks list
        ask = {
            'price': filtered_df[f'level_{i}_ask_price'].iloc[row_number],
            'quantity': filtered_df[f'level_{i}_ask_quantity'].iloc[row_number],
            'orders': filtered_df[f'level_{i}_ask_orders'].iloc[row_number]
        }
        asks.append(ask)

    # Extract all the factors for that second
    factors = {
        'netbuy': filtered_df['netbuy'].iloc[row_number],
        'mean_rev_lag': filtered_df['mean_rev_lag'].iloc[row_number],
        # 'midprice': filtered_df['midprice'].iloc[row_number],
        'delp': filtered_df['delp'].iloc[row_number],
        'level1ofi': filtered_df['level1ofi'].iloc[row_number],
        'level2ofi': filtered_df['level2ofi'].iloc[row_number],
        'level3ofi': filtered_df['level3ofi'].iloc[row_number],
        'level4ofi': filtered_df['level4ofi'].iloc[row_number],
        'level5ofi': filtered_df['level5ofi'].iloc[row_number],
        'level6ofi': filtered_df['level6ofi'].iloc[row_number],
        'level7ofi': filtered_df['level7ofi'].iloc[row_number],
        'level8ofi': filtered_df['level8ofi'].iloc[row_number],
        'level9ofi': filtered_df['level9ofi'].iloc[row_number],
        'level10ofi': filtered_df['level10ofi'].iloc[row_number],
    }

# Create the order book snapshot dictionary with bids, asks, and transacttime
    order_book_snapshot = {'bids': bids, 'asks': asks, 'factors': factors, 'transacttime': transacttime}

    # Return the order book snapshot dictionary
    return order_book_snapshot

In [21]:
# View the 1st snapshot of the sliced data
order_book_snapshot = send_order_book_snapshot(sliced_data, 1, 10)
order_book_snapshot

{'bids': [{'price': 130.5, 'quantity': 516.0, 'orders': 17.0},
  {'price': 130.484375, 'quantity': 588.0, 'orders': 21.0},
  {'price': 130.46875, 'quantity': 910.0, 'orders': 34.0},
  {'price': 130.453125, 'quantity': 1241.0, 'orders': 45.0},
  {'price': 130.4375, 'quantity': 1305.0, 'orders': 50.0},
  {'price': 130.421875, 'quantity': 1719.0, 'orders': 61.0},
  {'price': 130.40625, 'quantity': 2953.0, 'orders': 69.0},
  {'price': 130.390625, 'quantity': 1820.0, 'orders': 69.0},
  {'price': 130.375, 'quantity': 3551.0, 'orders': 68.0},
  {'price': 130.359375, 'quantity': 2103.0, 'orders': 63.0}],
 'asks': [{'price': 130.515625, 'quantity': 158.0, 'orders': 8.0},
  {'price': 130.53125, 'quantity': 623.0, 'orders': 30.0},
  {'price': 130.546875, 'quantity': 844.0, 'orders': 43.0},
  {'price': 130.5625, 'quantity': 1212.0, 'orders': 45.0},
  {'price': 130.578125, 'quantity': 1671.0, 'orders': 80.0},
  {'price': 130.59375, 'quantity': 1860.0, 'orders': 89.0},
  {'price': 130.609375, 'quant

In [27]:
# Example on how to access the Level 1 Ask price in order_book_snapshot
# order_book_snapshot['asks'][0]['price']

In [28]:
# View the 2nd snapshot of the sliced data
order_book_snapshot2 = send_order_book_snapshot(sliced_data, 2, 10)
order_book_snapshot2

{'bids': [{'price': 130.5, 'quantity': 529.0, 'orders': 21.0},
  {'price': 130.484375, 'quantity': 661.0, 'orders': 25.0},
  {'price': 130.46875, 'quantity': 985.0, 'orders': 39.0},
  {'price': 130.453125, 'quantity': 1296.0, 'orders': 49.0},
  {'price': 130.4375, 'quantity': 1359.0, 'orders': 53.0},
  {'price': 130.421875, 'quantity': 1742.0, 'orders': 65.0},
  {'price': 130.40625, 'quantity': 2986.0, 'orders': 74.0},
  {'price': 130.390625, 'quantity': 1820.0, 'orders': 69.0},
  {'price': 130.375, 'quantity': 3551.0, 'orders': 68.0},
  {'price': 130.359375, 'quantity': 2119.0, 'orders': 65.0}],
 'asks': [{'price': 130.515625, 'quantity': 21.0, 'orders': 6.0},
  {'price': 130.53125, 'quantity': 683.0, 'orders': 34.0},
  {'price': 130.546875, 'quantity': 876.0, 'orders': 45.0},
  {'price': 130.5625, 'quantity': 1127.0, 'orders': 46.0},
  {'price': 130.578125, 'quantity': 1424.0, 'orders': 78.0},
  {'price': 130.59375, 'quantity': 1785.0, 'orders': 79.0},
  {'price': 130.609375, 'quanti

In [29]:
def order_book_snapshot_generator(filtered_df, levels):
    """
    This is a generator function that iterates through the rows of a filtered order book DataFrame
    and yields the order book snapshot for each row using the send_order_book_snapshot function.
    
    :param filtered_df: the filtered order book DataFrame
    :param levels: the number of levels to include in each snapshot
    """
    for i in range(len(filtered_df)):
        yield send_order_book_snapshot(filtered_df, i, levels)

In [30]:
# Find the change in the previous and current snapshot of the orderbook
def find_order_book_changes(order_book_snapshot1, order_book_snapshot2):
    
    start_time = order_book_snapshot1['transacttime']
    # start_time = datetime.strptime(start_time[:14], '%Y%m%d%H%M%S')

    end_time = order_book_snapshot2['transacttime']
    # end_time = datetime.strptime(end_time[:14], '%Y%m%d%H%M%S')
    
    # Create an empty dictionary to store the changes
    diff = {'bids': [], 'asks': [], 'start_time': start_time, 'end_time': end_time}


    # Loop through the bids in the first order book snapshot
    for i in range(len(order_book_snapshot1['bids'])):
        # Create an empty dictionary to store the differences for this bid
        bid_diff = {}
        # Calculate the difference in price for this bid level between the two snapshots
        bid_diff['price'] = order_book_snapshot2['bids'][i]['price'] - order_book_snapshot1['bids'][i]['price']
        # Calculate the difference in quantity for this bid level between the two snapshots
        bid_diff['quantity'] = order_book_snapshot2['bids'][i]['quantity'] - order_book_snapshot1['bids'][i]['quantity']
        # Calculate the difference in number of orders for this bid level between the two snapshots
        bid_diff['orders'] = order_book_snapshot2['bids'][i]['orders'] - order_book_snapshot1['bids'][i]['orders']
        # Add this bid level's differences to the list of bid differences
        diff['bids'].append(bid_diff)

    # Loop through the asks in the first order book snapshot
    for i in range(len(order_book_snapshot1['asks'])):
        # Create an empty dictionary to store the differences for this ask
        ask_diff = {}
        # Calculate the difference in price for this ask level between the two snapshots
        ask_diff['price'] = order_book_snapshot2['asks'][i]['price'] - order_book_snapshot1['asks'][i]['price']
        # Calculate the difference in quantity for this ask level between the two snapshots
        ask_diff['quantity'] = order_book_snapshot2['asks'][i]['quantity'] - order_book_snapshot1['asks'][i]['quantity']
        # Calculate the difference in number of orders for this ask level between the two snapshots
        ask_diff['orders'] = order_book_snapshot2['asks'][i]['orders'] - order_book_snapshot1['asks'][i]['orders']
        # Add this ask level's differences to the list of ask differences
        diff['asks'].append(ask_diff)

    # Return the dictionary of changes
    return diff



# Find the addition of 2 snapshots of the orderbook
def find_order_book_addition(order_book_snapshot1, order_book_snapshot2):
    
    start_time = order_book_snapshot1['transacttime']
    # start_time = datetime.strptime(start_time[:14], '%Y%m%d%H%M%S')

    end_time = order_book_snapshot2['transacttime']
    # end_time = datetime.strptime(end_time[:14], '%Y%m%d%H%M%S')
    
    # Create an empty dictionary to store the changes
    diff = {'bids': [], 'asks': [], 'start_time': start_time, 'end_time': end_time}


    # Loop through the bids in the first order book snapshot
    for i in range(len(order_book_snapshot1['bids'])):
        # Create an empty dictionary to store the differences for this bid
        bid_diff = {}
        # Calculate the difference in price for this bid level between the two snapshots
        bid_diff['price'] = 0#order_book_snapshot2['bids'][i]['price'] + order_book_snapshot1['bids'][i]['price']
        # Calculate the difference in quantity for this bid level between the two snapshots
        bid_diff['quantity'] = order_book_snapshot2['bids'][i]['quantity'] + order_book_snapshot1['bids'][i]['quantity']
        # Calculate the difference in number of orders for this bid level between the two snapshots
        bid_diff['orders'] = 0#order_book_snapshot2['bids'][i]['orders'] + order_book_snapshot1['bids'][i]['orders']
        # Add this bid level's differences to the list of bid differences
        diff['bids'].append(bid_diff)

    # Loop through the asks in the first order book snapshot
    for i in range(len(order_book_snapshot1['asks'])):
        # Create an empty dictionary to store the differences for this ask
        ask_diff = {}
        # Calculate the difference in price for this ask level between the two snapshots
        ask_diff['price'] = 0#order_book_snapshot2['asks'][i]['price'] + order_book_snapshot1['asks'][i]['price']
        # Calculate the difference in quantity for this ask level between the two snapshots
        ask_diff['quantity'] = order_book_snapshot2['asks'][i]['quantity'] + order_book_snapshot1['asks'][i]['quantity']
        # Calculate the difference in number of orders for this ask level between the two snapshots
        ask_diff['orders'] = 0#order_book_snapshot2['asks'][i]['orders'] + order_book_snapshot1['asks'][i]['orders']
        # Add this ask level's differences to the list of ask differences
        diff['asks'].append(ask_diff)

    # Return the dictionary of changes
    return diff


In [31]:
changes_ob = find_order_book_changes(order_book_snapshot, order_book_snapshot2)
changes_ob

{'bids': [{'price': 0.0, 'quantity': 13.0, 'orders': 4.0},
  {'price': 0.0, 'quantity': 73.0, 'orders': 4.0},
  {'price': 0.0, 'quantity': 75.0, 'orders': 5.0},
  {'price': 0.0, 'quantity': 55.0, 'orders': 4.0},
  {'price': 0.0, 'quantity': 54.0, 'orders': 3.0},
  {'price': 0.0, 'quantity': 23.0, 'orders': 4.0},
  {'price': 0.0, 'quantity': 33.0, 'orders': 5.0},
  {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
  {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
  {'price': 0.0, 'quantity': 16.0, 'orders': 2.0}],
 'asks': [{'price': 0.0, 'quantity': -137.0, 'orders': -2.0},
  {'price': 0.0, 'quantity': 60.0, 'orders': 4.0},
  {'price': 0.0, 'quantity': 32.0, 'orders': 2.0},
  {'price': 0.0, 'quantity': -85.0, 'orders': 1.0},
  {'price': 0.0, 'quantity': -247.0, 'orders': -2.0},
  {'price': 0.0, 'quantity': -75.0, 'orders': -10.0},
  {'price': 0.0, 'quantity': 31.0, 'orders': 4.0},
  {'price': 0.0, 'quantity': 31.0, 'orders': 4.0},
  {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
  {'pri

In [327]:
# Ignore cell
# def participation_ratio()

# def metrics(snapshot):

#     # find moving average for a rolling window
#     # Calculate the mid-price and append it to the list
#     mid_price = (snapshot['asks'][0]['price'] + snapshot['bids'][0]['price']) / 2
#     mid_prices.append(mid_price)

#     if c !<20:
#         # moving_avg20 = 
#         temp_ma20 = mid_prices[-20:]  # Extract last 20 values from the list
#         moving_avg20 = sum(temp_ma20) / 20  # Calculate the moving average
    
#     if c!<50:
#         temp_ma50 = mid_prices[-50:]  # Extract last 50 values from the list
#         moving_avg50 = sum(temp_ma50) / 50  # Calculate the moving average
    
#     if c!<100:
#         temp_ma100 = mid_prices[-100:]  # Extract last 100 values from the list
#         moving_avg100 = sum(temp_ma100) / 100  # Calculate the moving average

#     if c!<200:
#         temp_ma200 = mid_prices[-200:]  # Extract last 200 values from the list
#         moving_avg200 = sum(temp_ma200) / 200  # Calculate the moving average

#     # MACD?
#     # Other triggers

#     # temp = dic

#     # Participation ratio limits the number of orders an algorithm places in a minute.
#     # Cost to trade will be determined by the current snapshot
#     # Boundary conditions
#     # Extreme conditions (sudden drop in orders or qty indicating drastic price change)
#     # What else decides?
#     # 
#     # Return a dict saying Yes/No, Qty, Min Price allowed to go to to liquidate)
#     # dict = {'Trade': 0/1, 'quantity': 9717, 'min_price': 081.145}
#     # return dict
#     # return mid_prices
#     c = c+1
#     return None


In [32]:
# Initialize a variable to hold the previous snapshot
previous_snapshot = None

# Call the snapshot generator function
snapshot_gen = order_book_snapshot_generator(sliced_data, 10)

# Counter
c = 1

# Loop over snapshot_gen
for snapshot in snapshot_gen:
    # Process the order book snapshot for the current row

    # metrics = metrics(snapshot)
    # algorithm(snapshot, metrics)

    # If this is not the first snapshot, find the changes and print them
    if previous_snapshot is not None:
        
        # Print statements
        # print(f"snapshot factors: {previous_snapshot['factors']}")
        print(f"transacttime: {previous_snapshot['transacttime']}")
        print(f"Snapshot bids: {previous_snapshot['bids']}")

        # Find next second changes
        changes = find_order_book_changes(previous_snapshot, snapshot)

        # Print statements
        print(f"start_time: {changes['start_time']}\nend_time: {changes['end_time']}")
        print(f"Changes: {changes}\n")

    # Set the current snapshot as the previous snapshot for the next iteration
    previous_snapshot = snapshot

    # Increment counter
    c += 1

    # This if condition limits the outputs to 5 snapshots.
    # You should know why 5 and not 7. Read the report.
    # Uncommenting this line would print out all the snapshots generated for the sliced data
    if c == 7:
        break

transacttime: 17:00:00
Snapshot bids: [{'price': 130.5, 'quantity': 344.0, 'orders': 11.0}, {'price': 130.484375, 'quantity': 377.0, 'orders': 16.0}, {'price': 130.46875, 'quantity': 449.0, 'orders': 24.0}, {'price': 130.453125, 'quantity': 1150.0, 'orders': 34.0}, {'price': 130.4375, 'quantity': 1297.0, 'orders': 49.0}, {'price': 130.421875, 'quantity': 1708.0, 'orders': 59.0}, {'price': 130.40625, 'quantity': 2933.0, 'orders': 64.0}, {'price': 130.390625, 'quantity': 1806.0, 'orders': 67.0}, {'price': 130.375, 'quantity': 3535.0, 'orders': 66.0}, {'price': 130.359375, 'quantity': 2103.0, 'orders': 63.0}]
start_time: 17:00:00
end_time: 17:00:01
Changes: {'bids': [{'price': 0.0, 'quantity': 172.0, 'orders': 6.0}, {'price': 0.0, 'quantity': 211.0, 'orders': 5.0}, {'price': 0.0, 'quantity': 461.0, 'orders': 10.0}, {'price': 0.0, 'quantity': 91.0, 'orders': 11.0}, {'price': 0.0, 'quantity': 8.0, 'orders': 1.0}, {'price': 0.0, 'quantity': 11.0, 'orders': 2.0}, {'price': 0.0, 'quantity': 20

In [329]:
# This is how we checked if the m1_list was what we wanted.
# If len(m1_list) and len(unique_start_times) are equal, then we have what we want 
# len(m2_list)
# m2_list[0][0]
# t1
# m1_list = wrap_with_list(m1_list)
# m1_list[0][0]
# unique_start_times = set(d['start_time'] for d in m1_list)
# len(unique_start_times)

# changes['end_time']


In [330]:
import math
# import icecream
# from icecream import ic
def divide_quantity_by_orders(dict1):
    order_book = dict1.copy()

    del order_book['asks'] # we do this because we are just modelling replenishment on the bid side of the book

    for side in ['bids']:

        # You'll understand the purpose of this counter when you see where it is used
        # case_1_counter = 0
        case_1_indexes = []
        for level in order_book[side]:
            
            # Is there any change?
            if level['quantity'] != 0:
                # Is there any change in the orders qty?
                if level['orders'] != 0:
                    # -ve qty and -ve orders = +ve
                    if level['quantity'] < 0 and level['orders'] < 0:
                        quotient = round(level['quantity'] / (level['orders']))
                    
                    # +ve qty and +ve orders = +ve
                    elif level['quantity'] >0 and level['orders'] >0:
                        quotient = round(level['quantity'] / level['orders'])
                    
                    # +ve qty and -ve orders = -ve
                    elif level['quantity'] >0 and level['orders'] <0:
                        quotient = level['quantity'] / ((-1)*level['orders'])
                    
                    # -ve qty and +ve orders = -ve
                    elif level['quantity']<0 and level['orders']>0:
                        quotient = level['quantity'] / level['orders']
                            
                    if math.isfinite(quotient) and not math.isnan(quotient):
                        level['quantity'] = round(quotient)
                    else:
                        level['quantity'] = 0
                    level['orders'] = 1
                
                # If some change and 0 orders, some txn with equal buy and sell orders with different quantities
                # So just note down the levels to model this level
                elif level['orders']== 0:
                    # case_1_indexes appended the level index of the unique situation
                    case_1_indexes.append(level)

            # If no change in that level
            elif level['quantity'] == 0:
                continue
    
    if len(case_1_indexes)>0:
        for side in ['bids']:
            for level in case_1_indexes:
                # divide by avg of denominator for all other negative levels
                # this is edge case so just made 0 for now for simplicity
                level['quantity'] = 0
                level['orders'] = 0
    

    return order_book

def positive_values_in_dict(dict3):
    indexes = []
    count = 0
    # print(f"Checking if there are any positive values in this second...")
    # Counts how many negative values there are in the next changes
    for side in ['bids']:
        for level in dict3[side]:
            count += 1
            if level['quantity'] > 0:
                # print(f"Level: {level}")
                # print(f"Quantity: {level['quantity']}")
                # indexes has the indexes(level) of all the positive quantity changes
                indexes.append(count-1)
    
    return indexes, count

def negative_values_in_dict(dict3):
    indexes = []
    count = 0
    # print(f"Checking if there are any negative values in this second...")
    # Counts how many negative values there are in the next changes
    for side in ['bids']:
        for level in dict3[side]:
            count += 1
            if level['quantity'] < 0:
                # print(f"Level: {level}")
                # print(f"Quantity: {level['quantity']}")
                # indexes has the indexes(level) of all the positive quantity changes
                indexes.append(count-1)
    
    return indexes, count

def check_negative_quantity_add_keys(t6):
    
    # We create a copy because we do not want to modify the input
    order_book = t6.copy()

    # Just a counter, becomes a boolean output of this function
    created_or_not = 0

    indexes_neg2, count_neg2 = negative_values_in_dict(order_book)

    # print(f"negative indexes list = {indexes_neg2}")

    for side in ['bids']:
        for level in indexes_neg2:
            # This is the dictionary we will use to keep track of replenishment for each level in a single snapshot
            time_replenishment_percentage_dict = {'replenishment_done': 0,
                                                    'time': [t6['end_time']], #[17:00:00, 17:00:0, 17:35:00]
                                                    'replenishment_percentage': [0],
                                                    'replenishment_quantity': [0],
                                                    'appended': 0}
            
            # Add the dictionary as the value to the new key 'replenish'
            order_book[side][level]['replenish'] = time_replenishment_percentage_dict
            # level.setdefault('replenish', time_replenishment_percentage_dict)
            created_or_not += 1

    if created_or_not>0:
       # Create a new key for the whole dict
       # Will be marked 1 if all sides in dict are replenished
       order_book['replenishment_completed'] = 0
       created_or_not = True
    
    elif created_or_not==0:
       created_or_not = False
       # order_book will just a copy of the input. We won't use this anywhere. We will go to the next loop so we just pass for namesake
    
    return order_book, created_or_not

def check_negative_quantity_append_list(ob_snapshot, m2_list):
    # Not needed to declare as list, we could've done with a dict
    # When i started, i had a different idea in mind. I changed it so i no longer need a list
    # For time constraints and convenience, i will just keep it in this format and always access the dict using ,m1_list[0]
    m1_list = []
    c = 0
    
    # We get the negative quantity level indexes of ob_snapshot
    level_indexes_neg1, level_count_neg1 = negative_values_in_dict(ob_snapshot)
    
    if level_count_neg1>0:
        # Appending to empty list so this is the first item
        m1_list.append(ob_snapshot)
        # m2_list will be empty initially but then it can have items, so we keep appending to the bottom of the list
        m2_list.append(m1_list)
    
    del m1_list
    return m2_list

def m2_list_neg_level_index(m2_list):
    level_indexes_neg_list = []
    level_count_neg_list = []
    
    # We use print to debug
    # print(f"length of m2_list: {len(m2_list)}")

    # We extract all negative indexes of the individual dicts present in m1_list in m2_list
    for m1_list in m2_list:
        level_indexes_neg, level_count_neg = negative_values_in_dict(m1_list[-1]) 
        # we check negative quantity in last dict in m1_list because all the negative quantity will remain at the same level
        
        # We don't check if count_neg > 0 because items in m1_list will have negative quantities at some level in the first place

        # Everytime we get a list of indexes, we append that to a master list
        # Why we don't do this step for indexes_pos is because there is only 1 of it and there could be more than 1 level_indexes_neg because of m2_list
        level_indexes_neg_list.append(level_indexes_neg)
        # We append the count to list
        level_count_neg_list.append(level_count_neg)
    
    return level_indexes_neg_list, level_count_neg_list

def check_common_index(list1, list2):
    # This function is used inside m2_list_common_level_index() function below
    common_index = []
    count = 0
    for item in list1:
        if item in list2:
            common_index.append(item)
            count += 1
    return common_index, count

def m2_list_common_level_index(level_count_neg_list, level_indexes_neg_list, indexes_pos):               
    # These lists are unique everytime we create it in this nested conditon
    # These are wrt to the current state of m2_list
    # m2_list will change before this elif count_pos > 0 gets over
    common_level_indexes_list = []
    common_level_count_list = []

    # We check if there are any common indexes bw index_pos and indexes_neg_list[i]
    for i in range(len(level_count_neg_list)):
        # if negative indexes share any values with positive indexes, get that into a common_index
        common_level_indexes, common_level_indexes_count = check_common_index(level_indexes_neg_list[i],indexes_pos)
        # Everytime we get a list of indexes, we append that to a master list
        common_level_indexes_list.append(common_level_indexes)
        # We append the count to list
        common_level_count_list.append(common_level_indexes_count)

    return common_level_indexes_list, common_level_count_list
         
def check_for_fully_replenished_list(m2_list, m3_list, level_indexes_neg_list, completed_replenishment_dict):
    
    # We check if we can update m3_list and remove the list from m2_list
    
    m1_list_index_in_m2_list_to_remove = []
    # Pass empty list as input to empty master list
    # Done just to establish formatting
    # master_list_for_level_replenishments = []
    # Pick the indexes list
    # i is also the index of m1_list in m2_list
    # You should be able to intuitively understand why it is the same
    for i in range(len(level_indexes_neg_list)):
        # Extract level_indexes_neg
        level_indexes_neg = level_indexes_neg_list[i]
        for side in ['bids']:
            # Counter to keep track of total replenished levels
            level_replenishment_completed = 0
            # Iterate over all the negative levels in m2_list[i]
            for level in level_indexes_neg:
                
                # Extract replenish_dict to just check conditions,
                replenish_dict = m2_list[i][0][side][level]['replenish']
                # Check if replenishment is completed at that level
                if replenish_dict['replenishment_done']==1:
                    # Keep a cumulattive count of completion for whole side
                    level_replenishment_completed += 1
                    # level_replenishment_dict = replenish_dict

                    if replenish_dict['appended'] == 0:
                        completed_replenishment_dict[side][level].append(replenish_dict)
                        m2_list[i][0][side][level]['replenish']['appended'] = 1

                

            # If all replenishment completed at all possible levels for that m1_list[0]
            if level_replenishment_completed == len(level_indexes_neg):
                # Mark m1_list as completed
                m2_list[i][0]['replenishment_completed'] = 1
            # If replenishment not completed, check other side
            else:
                continue

    # Removing completed m1_list[0] from m2_list and transferring it to m3_list whenever there is a completed m1_list[0]
    for i in range(len(m2_list)):
        # print(f"m2_list length: {len(m2_list)}\n")
        if m2_list[i][0]['replenishment_completed'] == 1:
             
            # print(f"We make an update to m2_list and m3_list!!!!\n")
            
            # Extract that m1_list and add it to m3_list.
            m3_list.append(m2_list[i])
            
            # print("----------------------------------------------------------------------------------------------------------------------------")
            # print("----------------------------------------------------------------------------------------------------------------------------")
            # print(f"-----We APPEND {i}th list from m2_list to m3_list. Length of m2_list: {len(m2_list)}, Length of m3_list: {len(m3_list)}----")
            # print("----------------------------------------------------------------------------------------------------------------------------")
            # print("----------------------------------------------------------------------------------------------------------------------------\n")
           

            # Note down the index of m1_list to be removed from m2_list
            m1_list_index_in_m2_list_to_remove.append(i)

            # Make sure to delete that m1_list from m2_list
            # m2_list.remove(m2_list[i])

    # Note down the index of m1_list to be removed from m2_list
    indexes_to_remove = [i for i, val in enumerate(m2_list) if i in m1_list_index_in_m2_list_to_remove]

    # Remove the elements using their indexes
    for i in sorted(indexes_to_remove, reverse=True):
    #     print("--------------------------------------------------------------------------------------------------------------------------------")
    #     print("--------------------------------------------------------------------------------------------------------------------------------")
    #     print(f"------------We REMOVE {i}th list from m2_list. Length of m3_list: {len(m2_list)}, Length of m3_list: {len(m3_list)}------------")
    #     print("--------------------------------------------------------------------------------------------------------------------------------")
    #     print("--------------------------------------------------------------------------------------------------------------------------------")
        del m2_list[i]

    # We return updated lists
    return m2_list, m3_list, completed_replenishment_dict

# This function is incomplete
# Finish this function
# Check if it is appropriately called and all required inputs are proerly passed as inputs for al functions, and decide source function for the input.
# Just like the way we have done for all other variables, even m2_list
def extract_incomplete_replenishments(m2_list, incomplete_replenishment_dict):
    # This function is called only once at the end
    # We get the indexes we need
    level_indexes_neg_list, level_count_neg_list = m2_list_neg_level_index(m2_list)
    for i in range(len(level_indexes_neg_list)):
        # Extract level_indexes_neg
        level_indexes_neg = level_indexes_neg_list[i]
        for side in ['bids']:
            # Counter to keep track of total replenished levels
            # level_replenishment_completed = 0
            # Iterate over all the negative levels in m2_list[i]
            for level in level_indexes_neg:
                
                # Extract replenish_dict to just check conditions,
                replenish_dict = m2_list[i][0][side][level]['replenish']
                
                # Check if replenishment is not completed at that level
                # All levels at the end will be only levels which are not fully replenished
                if replenish_dict['replenishment_done']==0:
                    # We check if this has already been added to the dict
                    if replenish_dict['appended'] == 0:
                        incomplete_replenishment_dict[side][level].append(replenish_dict)
                        m2_list[i][0][side][level]['replenish']['appended'] = 1

    return m2_list, incomplete_replenishment_dict

def model_replenishment(common_level_indexes_list, m2_list, m3_list, changes, level_indexes_neg_list, completed_replenishment_dict):
    
    # Keep an original copy because if there is excess quantity after one m1_list and there are common_indexes, 
    # we will maintain this copy by reducing quantity everytime we replenish
    changes_copy = changes.copy()
    # This copy is to maintain the dictionary that will be added to the snapshot later
    # We don't need this in this function, we will use in a repleneshing_trades function
    # replenishment_dict_to_add = changes.copy()
    
    # Pick the indexes list
    # i is also the index of m1_list in m2_list
    # You should be able to intuitively understand why it is the same
    for i in range(len(common_level_indexes_list)):
        # Counter to keep track if there was excess quantity that can be used if there is a next loop. Else, we dont worry about it
        counter_for_excess_qty = 0
        # I dont need the below if condition because in the loop, channges_copy is getting modified everytime some replenishment happens
        # if counter_for_excess_qty > 0:
        #     changes_copy
        # Extract common_level_index
        common_level_index = common_level_indexes_list[i]
        for side in ['bids']:
            
            # print(f"\nWe are working on the {side} side.")
            # Iterate over all the common levels in m2_list[i]
            for index, level in enumerate(common_level_index):

                # print(f"\nWe are working on level: {index+1}...")
                # We enter a new level
                # print("\nWe enter a new level...")
                
                replenish_dict = m2_list[i][0][side][level]['replenish']
                
                if replenish_dict['replenishment_done'] == 1:
                    # print("Level already replenished, going to next level...")
                    # Skip that level since it has already been replenished
                    continue
            
                # print(replenish_dict)
                initial_quantity_of_negative_changes_sign_flipped = m2_list[i][0][side][level]['quantity']*(-1)
                # First will be 0. We access using last item in list. Understand intuition
                # Since we are accessing last item in the list, it will always be the latest replenishment percentage for that level
                percentage_before_replenishment = round(replenish_dict['replenishment_percentage'][-1], 3)
                # We multiply by -1 because quantity is negative in m2_list for these levels
                quantity_before_replenishment = round((percentage_before_replenishment*initial_quantity_of_negative_changes_sign_flipped))
                # this is the balance/excess of that level after replenishment so can be negative, 0 or positive.
                quantity_if_positive_changes_fully_added_as_replenishment = quantity_before_replenishment + changes_copy[side][level]['quantity']
                
                # We again multiply be -1 because quantity is negative
                percentage_after_replenishment = round((quantity_if_positive_changes_fully_added_as_replenishment / (1*initial_quantity_of_negative_changes_sign_flipped)), 3)
                
                # # We use print to debug
                # print(f"initial_quantity_of_negative_changes_sign_flipped: {initial_quantity_of_negative_changes_sign_flipped}")

                # # We use print to debug
                # print(f"Percentage_before_replenishment: {percentage_before_replenishment}")

                # # We use print to debug
                # print(f"quantity_before_replenishment: {quantity_before_replenishment}")

                # # We use print to debug
                # print(f"positive_changes: {changes_copy[side][level]['quantity']}")

                # # We use print to debug
                # print(f"quantity_if_positive_changes_fully_added_as_replenishment: {quantity_if_positive_changes_fully_added_as_replenishment}\n")

                # print("We now check if we have less/perfect/excess replenishment...")
                if percentage_after_replenishment < 1: 
                # It means we need more qty to replenish that level, so we just create a new key with the replenishment qty and % details    
                # At that level, the positive change has fully replenished the first m1_list in m2_list
                    
                    # We use print to debug
                    # print(f"Replenishment INSUFFICIENT,Percentage_after_replenishment: {percentage_after_replenishment}")

                    # Replenished quantity is the difference
                    replenished_quantity = quantity_if_positive_changes_fully_added_as_replenishment - quantity_before_replenishment
                    
                    # We store the quantity that was replenished at that second
                    replenish_dict['replenishment_quantity'].append(replenished_quantity)

                    # We reduce the replenished qty from the changes_copy
                    # We do this because we want to keep track of how much is getting replenished from the current positive changes
                    changes_copy[side][level]['quantity'] -= replenished_quantity
                    # The balance quantity that can be used to replenish older trades ranges from 0 to initial_quantity at that level in changes. 
                    # replenishment_quantity exhausted before negative change exhausted at that level
                    # Understand intuition

                    # Append time in the replenish_dict inside m1_list in m2_list
                    replenish_dict['time'].append(changes['end_time']) 

                    # Append the percentage replenished
                    replenish_dict['replenishment_percentage'].append(percentage_after_replenishment)

                    # Make all changes on m2_list
                    m2_list[i][0][side][level]['replenish'] = replenish_dict
                    
                    # print(f"After changes to replenish_dict: {replenish_dict}")
                    # we go to the next level
                    continue

                elif percentage_after_replenishment == 1: 
                # it means quantity replenished by new change perfectly replenishes the negative quantity
                # key word: perfectly

                    # print(f"Replenishment PERFECT, Percentage_after_replenishment: {percentage_after_replenishment}")

                    # Replenished quantity is the difference
                    replenished_quantity = quantity_if_positive_changes_fully_added_as_replenishment - quantity_before_replenishment

                    # We store the quantity that was replenished at that second
                    replenish_dict['replenishment_quantity'].append(replenished_quantity)

                    # We reduce the replenished qty from the changes_copy
                    # We do this because we want to keep track of how much is getting replenished from the current positive changes
                    changes_copy[side][level]['quantity'] -= replenished_quantity
                    # The balance quantity that can be used to replenish older trades is 0. 
                    # replenishment_quantity exhausted perfectly when negative change exhausted at that level
                    # Understand intuition

                    # Mark replenishment as done for that level
                    replenish_dict['replenishment_done'] = 1
                    
                    # Append time in the replenish_dict inside m1_list in m2_list
                    replenish_dict['time'].append(changes['end_time']) 

                    # Append the percentage replenished
                    replenish_dict['replenishment_percentage'].append(percentage_after_replenishment)

                    # Make all changes on m2_list
                    m2_list[i][0][side][level]['replenish'] = replenish_dict

                    # We use print to debug
                    # print(f"After changes to replenish_dict: {replenish_dict}")

                    # we go to the next level
                    continue
                
                elif percentage_after_replenishment > 1: 
                # it means we have excess quantity after filling that level in the previous snapshot

                    # print(f"Replenishment EXCESS, Percentage_after_replenishment: {percentage_after_replenishment}")
                    # We keep whenever there is an overfill during replenishment
                    counter_for_excess_qty += 1

                    # Replenished quantity is the difference
                    # replenished_quantity = quantity_if_positive_changes_fully_added_as_replenishment - quantity_before_replenishment
                    # if we calculate using same way, excess replenishment_quantity will also be added to replenished_quantity
                    
                    # This is the positive quantity that is required to fill that level to 100% replenishment
                    # initial_quantity_of_negative_changes is negative so we just add
                    excess_quantity = changes_copy[side][level]['quantity'] + initial_quantity_of_negative_changes_sign_flipped
                    # Understand intuition

                    # excess quantity_to_fully_replenish
                    quantity_to_fully_replenish = changes_copy[side][level]['quantity'] - excess_quantity

                    # We use print to debug
                    # print(f"Excess quantity after replenishment: {excess_quantity}")
                    
                    # This is the quantity which will be replenished at that level
                    replenished_quantity = quantity_to_fully_replenish
                    # Understand intuition. That level is fully exhausted and there is excess quantity

                    # We store the quantity that was replenished at that second
                    replenish_dict['replenishment_quantity'].append(replenished_quantity)

                    # We reduce the replenished qty from the changes_copy
                    # We do this because we want to keep track of how much is getting replenished from the current positive changes
                    changes_copy[side][level]['quantity'] -= replenished_quantity
                    # The balance quantity that can be used to replenish older trades is always greater than 0
                    # negative change exhausted before replenishment_quantity exhausted at that level
                    # Understand intuition

                    # Percentage calculated earlier will be larger than 1. You should intuitively know why that's the case
                    # So we set it to 1. signifying 100% replenishment
                    percentage_after_replenishment = 1.

                    # Mark replenishment as done for that level
                    replenish_dict['replenishment_done'] = 1

                    # Append time in the replenish_dict inside m1_list in m2_list
                    replenish_dict['time'].append(changes['end_time']) 

                    # Append the percentage replenished
                    replenish_dict['replenishment_percentage'].append(percentage_after_replenishment)

                    # Make all changes on m2_list
                    m2_list[i][0][side][level]['replenish'] = replenish_dict

                    # This is the quantity that will be put on the the last quantity_replenished
                    quantity_after_replenishment = -1* initial_quantity_of_negative_changes_sign_flipped

                    # We use print to debug
                    # print(f"quantity_after_replenishment: {quantity_after_replenishment}")
                    # print(f"After changes to replenish_dict: {replenish_dict}")
                    # go to next level
                    continue
            
            # print(f"All levels over for {side}")
        

        # If there is no excess quantity at any level after repleneshing the first m1_list
        if counter_for_excess_qty==0:
            # print(f"We dont have any excess quantity left, so we don't iterate to next m1_list, we start checking if we can update m2_list and m3_list...")
            # Don't iterate to next common_level_index
            break
        
    # We have come out of the for loop, we have finished replenishment for all m1_lists[0] in m2_list
        
    # we check if any m1_list[0] in m2_list was fully replenished(all levels)
    # m3_list contains the fully replenished lists. it will be cut from m2_list and pasted to m3_list.
    # this way, m2_list only contains the lists than need to be replenished
    m2_list, m3_list, completed_replenishment_dict = check_for_fully_replenished_list(m2_list, m3_list, level_indexes_neg_list, completed_replenishment_dict)

    # incomplete_replenishment_dict = check_for_incomplete_list(m2_list, incomplete_replenishment_dict)

    # We don't need this because we are not replenshing anything. We are just making list of all the replenishments that happen. That is in m3_list
    # replenishment_dict_to_be_added_to_snapshot = replenish_dict_calculator(ob_changes, temp_changes)
    
    return m2_list, m3_list, completed_replenishment_dict


In [331]:
date_range_ob_fwdfill_df

Unnamed: 0,date,min_time,max_time,no_of_ob_updates
0,2020-01-28,16:45:00,23:59:59,26100
1,2020-01-29,00:00:00,16:00:00,57601
2,2020-03-17,16:45:00,23:59:57,26098
3,2020-03-18,00:00:00,16:00:00,57601
4,2020-04-29,00:00:00,16:00:00,57601
5,2020-06-09,16:45:00,23:59:59,26100
6,2020-06-10,00:00:00,16:00:00,57601
7,2020-07-28,16:45:00,23:59:57,26098
8,2020-07-29,00:00:00,16:00:00,57601
9,2020-08-25,16:45:00,23:59:59,26100


In [332]:
((37*11) + (37*2.5)*(26-11))/60

29.908333333333335

In [333]:
# completed_replenishment_dict

In [334]:
# This function calculates how much replenishment is for every level for a single date
def model_historical_replenishment_rates_single_date(orderbook_data):
    # Initialize a variable to hold the previous snapshot
    previous_snapshot = None

    snapshot_gen = order_book_snapshot_generator(orderbook_data, 10)
    c = 0
    m2_list =[]
    m3_list = []
    # Initialize dict for bid and ask. Values cintain an empty list of 10 lists representing 10 levels
    # This is the format we append and maintain just the replenished dicts for very level
    completed_replenishment_dict = {'bids': [[],[],[],[],[],[],[],[],[],[]],'asks': [[],[],[],[],[],[],[],[],[],[]]}
    # incomplete_replenishment_dict = {'bids': [[],[],[],[],[],[],[],[],[],[]],'asks': [[],[],[],[],[],[],[],[],[],[]]}
    # This is how u access it
    # completed_replenishment_dict[side][i][j]['replenishment_done', 'time', 'replenishment_percentage']

    for snapshot in snapshot_gen:
        # snapshot = snapshot['bids']
        # Process the order book snapshot for the current row
        # print(snapshot)
        # metrics = metrics(snapshot)
        # algorithm(snapshot, metrics)

        # Without 2 snapshots, we can't find changes, so we skip calculating changes for first snapshot
        if previous_snapshot is not None:
            
            # We find the changes
            changes = find_order_book_changes(previous_snapshot, snapshot)
            
            # We use print to debug
            # print(f"{changes}")
            # We use print to debug
            # print(f"\n----------------------------------NEW SECOND Transacttime: {str(changes['end_time'])}----------------------------------")

            # We normalize the quantity to get the changes in terms of single(1) orders
            # not normalizing increases computation time exponentially. Why is that?
            t1 = divide_quantity_by_orders(changes)
            
            # We use print to debug
            # print(f"{t1} \n")

            # We add our required 'replenish' key to the levels which contain a negative quantity
            # Refer the function for details about the value for this key
            t2, created_or_not = check_negative_quantity_add_keys(changes)
            # this will be done regardless of any positive quantity in the current changes

            # If we added some new key because there was a negative change, we add that to m2_list
            if created_or_not == True:
                # We use print to debug
                # print(f"We have created new key called replenish in the changes dict for the past 1 second...")
                # print(f"Below is the dict with the new key we will append to m2_list as m1_list[0] using function check_negative_quantity_append_list(t2, m2_list)")
                # print(f"{t2}")
                
                # We get the indexes of the positive quantities in the quantities from the next second changes we will use to replenish,
                # and the negative indexes of the 

                # m1_lists[0] will remain on m2_list till they are replenished and once they are replenished, they are moved to m3_list
                # for the first time, m2_list will be empty, but in subsequent loops, it will have a value. 
                # If all trades are replenished, it can be empty again.
                # m1_list will be created inside this function it will be appended to m2_list
                m2_list = check_negative_quantity_append_list(t2, m2_list)
                # For first time, m2_list just contains 1 m1_list as m2_list[0]. Then m2_list could keep varying in size.
                
                # We use print to debug
                # print(f"{m2_list} \n")

            # we check if we have any positive quantity and if there is we get an index
            indexes_pos, count_pos = positive_values_in_dict(changes)

            # If there is no positive changes in all the levels, go to next snapshot
            if count_pos == 0:
                # Skip replenishment
                # print("There is no positive quantity in the changes that happened in the past 1 second. So we go to the next second...")
                # return? this whole block is not a function ye
                # We go to the next snapshot becasue nothing to replenish?
                continue

            # If there is some positive quantity, replenish
            elif count_pos > 0:
                # if m2_list is not empty,
                if m2_list:
                    # print(f"We can now check if we can replenish negative quantity changes that happened in the past...")
                    # Get the levels of negative quantity in m1_list[0] for all m1_list in m2_list                
                    level_indexes_neg_list, level_count_neg_list = m2_list_neg_level_index(m2_list)

                    # Get the common levels(negative and positive quantity) in m1_list[0] for all m1_list in m2_list              
                    common_level_indexes_list, common_level_count_list = m2_list_common_level_index(level_count_neg_list, level_indexes_neg_list, indexes_pos)
                    
                    # If there are some common indexes,
                    if sum(common_level_count_list) >0:

                        # This checks for whole m2_list all common levels.
                        # Is this step even required? if everything is replenished, it will not be on m2 in the first place
                        # replenishment_fully_done = is_replenishment_fully_done(m2_list, common_level_indexes_list)
                        # if replenishment_fully_done == False:

                        # print("\nThere are some common levels where we can maybe replenish. So we CALL replenish function...")
                        # Call replenish function
                        m2_list, m3_list, completed_replenishment_dict = model_replenishment(common_level_indexes_list, m2_list, m3_list, changes, level_indexes_neg_list, completed_replenishment_dict)
                        # printf(f"Length of m3_list: ")
                # print(changes)

                else:
                    continue

        # Set the current snapshot as the previous snapshot for the next iteration
        previous_snapshot = snapshot

        # We can use this counter to limit during debugging/testing
        # c += 1
        # if c == 500:
        #     break

    return m2_list, m3_list, completed_replenishment_dict

def concat_datetime_strings(date_string, time_string):

    datetime_string = str(date_string + " " + time_string)

    return datetime_string

def model_historical_replenishment_rates_all_dates(ob_fwdfill, date_range_ob_fwdfill_df):

    # print("Entered function model_historical_replenishment_rates_all_dates")
    m2_list_all_dates = []
    m3_list_all_dates = []
    completed_replenishment_master_list = []
    incomplete_replenishment_dict = {'bids': [[],[],[],[],[],[],[],[],[],[]],'asks': [[],[],[],[],[],[],[],[],[],[]]}
    incomplete_replenishment_master_list = []
    # Date 
    for row_number in date_range_ob_fwdfill_df.index:
        # if row_number == 2:
        #     print("Aborting... next date will not be processed.")
        #     break

        # Select time window of 2 hours. Example '2020-01-28 16:45:23'
        # start_time = '2020-01-28 17:00:00'
        # end_time = '2020-01-28 19:00:00'

        start_time = concat_datetime_strings(date_range_ob_fwdfill_df['date'][row_number], date_range_ob_fwdfill_df['min_time'][row_number])
        end_time = concat_datetime_strings(date_range_ob_fwdfill_df['date'][row_number], date_range_ob_fwdfill_df['max_time'][row_number])

        print(f"Working on {start_time} to {end_time}...\n")

        sliced_data = window_slicer(ob_fwdfill, start_time, end_time)
        sliced_data = sliced_data.reset_index(drop=True)
        # sliced_data
        
        # We get different sliced data for every date
        m2_list, m3_list, completed_replenishment_dict = model_historical_replenishment_rates_single_date(sliced_data)
       
        m2_list, incomplete_replenishment_dict = extract_incomplete_replenishments(m2_list, incomplete_replenishment_dict)
        incomplete_replenishment_master_list.append(incomplete_replenishment_dict)
        m2_list_all_dates.append(m2_list)
        m3_list_all_dates.append(m3_list)
        completed_replenishment_master_list.append(completed_replenishment_dict)
        print("Completed. Moving to next date...\n")

    return m2_list_all_dates, m3_list_all_dates, completed_replenishment_master_list, incomplete_replenishment_master_list



In [None]:
m2_list_all_dates, m3_list_all_dates, completed_replenishment_master_list, incomplete_replenishment_master_list = model_historical_replenishment_rates_all_dates(ob_factors, date_range_ob_fwdfill_df[:21])
# m2_list_all_dates, m3_list_all_dates, completed_replenishment_master_list, incomplete_replenishment_master_list = model_historical_replenishment_rates_all_dates(other_df, date_range_ob_fwdfill_df)

Working on 2020-01-28 16:45:00 to 2020-01-28 23:59:59...

Completed. Moving to next date...

Working on 2020-01-29 00:00:00 to 2020-01-29 16:00:00...

Completed. Moving to next date...

Working on 2020-03-17 16:45:00 to 2020-03-17 23:59:57...

Completed. Moving to next date...

Working on 2020-03-18 00:00:00 to 2020-03-18 16:00:00...

Completed. Moving to next date...

Working on 2020-04-29 00:00:00 to 2020-04-29 16:00:00...

Completed. Moving to next date...

Working on 2020-06-09 16:45:00 to 2020-06-09 23:59:59...

Completed. Moving to next date...

Working on 2020-06-10 00:00:00 to 2020-06-10 16:00:00...

Completed. Moving to next date...

Working on 2020-07-28 16:45:00 to 2020-07-28 23:59:57...

Completed. Moving to next date...

Working on 2020-07-29 00:00:00 to 2020-07-29 16:00:00...

Completed. Moving to next date...

Working on 2020-08-25 16:45:00 to 2020-08-25 23:59:59...

Completed. Moving to next date...

Working on 2020-08-26 00:00:00 to 2020-08-26 16:00:00...

Completed. M

In [335]:
# len(completed_replenishment_master_list['bids'][0])

In [336]:
# incomplete_replenishment_master_list

In [337]:
import numpy as np

def replenishment_rates_level(completed_replenishment_master_list, incomplete_replenishment_master_list):
    
    avg_replenishment_quantity_per_second_all_levels_all_dates_list = []
    completed_counter_level_list = []

    # len(completed_replenishment_dict[side][i][j]['replenishment_done', 'time', 'replenishment_percentage'])
    for b in range(len(completed_replenishment_master_list)):
        print("\nXXXXXXXXXXXXXXXXX\nXXXXXXXXXXXXXXXXX\nNew date")
        completed_counter_level = []
        avg_replenishment_quantity_per_second_all_levels_list = []
        for level in range(0,10): # 10 means 10 levels
            
            # time_taken_for_replenishment_list = []
            all_replenishment_quantity_per_second_at_level_list = []
            completed_counter = 0
            print(f"\nNew Level: {level}")
            # Rewrite for using enumerate
            for index_number in range(len(completed_replenishment_master_list[b]['bids'][level])):
                
                print(f"Count: {completed_counter}")
                completed_counter += 1                
                print(f"{completed_replenishment_master_list[b]['bids'][level][index_number]}")#['replenishment_percentage']#, 'time', 'replenishment_percentage']
                trade_time = completed_replenishment_master_list[b]['bids'][level][index_number]['time'][0]
                replenishment_completed_time = completed_replenishment_master_list[b]['bids'][level][index_number]['time'][-1]
                replenishment_completed_quantity = -1*completed_replenishment_master_list[b]['bids'][level][index_number]['replenishment_quantity'][-1]
                # print(f"replenishment_completed_quantity: {replenishment_completed_quantity}")

                # Convert to datetime to subract and find difference
                trade_time = datetime.strptime(trade_time, '%H:%M:%S')
                replenishment_completed_time = datetime.strptime(replenishment_completed_time, '%H:%M:%S')
                # Find difference in seconds
                time_taken_for_replenishment = (replenishment_completed_time - trade_time).total_seconds()
        
                # How do we find a replenishment rate?
                # Method 1
                # Per second how much is got replenished at that level
    
                replenishment_quantity_per_second_at_level = replenishment_completed_quantity / time_taken_for_replenishment 

                # Method 2
                # seconds_taken_for_replenishment_per_quantity_at_level = time_taken_for_replenishment /  replenishment_completed_quantity
                # this can be used when the snapshots are not forward filled. When the data is not coming in seconds interval.
                # Instead of the replenishment happening as a function of time, it will happen as a function of quantity
                # This method requires some changes in the replenish_past_algo_trades below
                # doing 2 ways can help cross check if replenishment is happening correctly because the
                
                # Self explanatory name
                all_replenishment_quantity_per_second_at_level_list.append(replenishment_quantity_per_second_at_level)

            # We find mean of that level
            avg_replenishment_quantity_per_second_for_level = np.mean(all_replenishment_quantity_per_second_at_level_list)
            completed_counter_level.append(completed_counter)
            # print(f"Avg quantity replenished every minute on Level {level+1} is {avg_quant_repl_min_level}")
            # print(f"Avg minutes to replenish a single lot on Level {level+1} is {avg_min_repl_single_lot_level}\n")
        
            # Meen of that level is appended to another list
            avg_replenishment_quantity_per_second_all_levels_list.append(avg_replenishment_quantity_per_second_for_level)

        avg_replenishment_quantity_per_second_all_levels_all_dates_list.append(avg_replenishment_quantity_per_second_all_levels_list)
        completed_counter_level_list.append(completed_counter_level)

    incomplete_counter_level_list = []
    zero_replenishment_level_list = []
    
    for b in range(len(incomplete_replenishment_master_list)):
        print("\nXXXXXXXXXXXXXXXXX\nXXXXXXXXXXXXXXXXX\nNew date")
        if b == 1:
            break

        incomplete_counter_level = []
        avg_replenishment_quantity_per_second_all_levels_list = []
        zero_replenishment_counter_level = []
        
        for level in range(0,10): # 10 means 10 levels
            
            # time_taken_for_replenishment_list = []
            replenishment_percentage_list = []
            time_taken_for_incomplete_replenishment_list = []
            zero_replenishment_counter_list = []

            incomplete_counter = 0
            zero_replenishment_counter = 0
            print(f"\nNew Level: {level}")
            # Rewrite for using enumerate
            for index_number in range(len(incomplete_replenishment_master_list[b]['bids'][level])):
                
                print(f"Count: {incomplete_counter}")
                incomplete_counter += 1
                print(f"{incomplete_replenishment_master_list[0]['bids'][level][index_number]}")#['replenishment_percentage']#, 'time', 'replenishment_percentage']
                trade_time = incomplete_replenishment_master_list[b]['bids'][level][index_number]['time'][0]
                incomplete_replenishment_time = incomplete_replenishment_master_list[b]['bids'][level][index_number]['time'][-1]
                print(f"incomplete_replenishment_time: {incomplete_replenishment_time}")
                replenishment_percentage = -1*incomplete_replenishment_master_list[b]['bids'][level][index_number]['replenishment_percentage'][-1]
                # print(f"replenishment_percentage: {replenishment_percentage}")

                if replenishment_percentage == 0:
                    zero_replenishment_counter += 1
                else:
                    replenishment_percentage *= -1
                    replenishment_percentage_list.append(replenishment_percentage)
                    print(f"replenishment_percentage: {replenishment_percentage}")
                    # Convert to datetime to subract and find difference
                    trade_time = datetime.strptime(trade_time, '%H:%M:%S')
                
                    incomplete_replenishment_time = datetime.strptime(incomplete_replenishment_time, '%H:%M:%S')
                    # Find difference in seconds
                    time_taken_for_incomplete_replenishment = (incomplete_replenishment_time - trade_time).total_seconds()
                    time_taken_for_incomplete_replenishment_list.append(time_taken_for_incomplete_replenishment)
                
            incomplete_counter_level.append(incomplete_counter)
            zero_replenishment_counter_level.append(zero_replenishment_counter)
            
            if sum(replenishment_percentage_list) > 0:
                # Create a dictionary with columns as keys and lists as values
                data_dict = {'replenishment_percentage': replenishment_percentage_list,
                            'time_taken_for_incomplete_replenishment': time_taken_for_incomplete_replenishment_list}
                            # Create a pandas DataFrame from the dictionary
                df = pd.DataFrame(data_dict)
            else:
                df = 0

            # If majority of incomplete replenishments are not zero,
            # we can create a surfact using time taken to achieve last replenishment, percentage replenished, and the respective counts for every unique pair of time and percentage replenished          
            # We find count which will be our third axis in the plot
            # for unique_replenishment in df['replenishment_percentage'].unique():
            #     dft = df[df['replenishment_percentage']==unique_replenishment]
            #     count = len(dft)
            #     count for unique replenishment


            # print(f"Avg quantity replenished every minute on Level {level+1} is {avg_quant_repl_min_level}")
            # print(f"Avg minutes to replenish a single lot on Level {level+1} is {avg_min_repl_single_lot_level}\n")
        
            # Meen of that level is appended to another list
            # avg_replenishment_quantity_per_second_all_levels_list.append(avg_replenishment_quantity_per_second_for_level)

        # avg_replenishment_quantity_per_second_all_levels_all_dates_list.append(avg_replenishment_quantity_per_second_all_levels_list)
        incomplete_counter_level_list.append(incomplete_counter_level)
        zero_replenishment_level_list.append(zero_replenishment_counter_level)


    # Use zero_replenishment_level_list and find out ratio of zeroes.
    # We need to output also the number of times the order replenishment needs


    # Find out how many times u need to skip a replenishment for an order
    return avg_replenishment_quantity_per_second_all_levels_all_dates_list, completed_counter_level_list, incomplete_counter_level_list, zero_replenishment_level_list, df

# len(m3_list_all_dates[0])
# m3_list_all_dates[0][0]
# m3_list[2]

In [338]:
avg_replenishment_quantity_per_second_all_levels_all_dates_list, completed_counter_level_list, incomplete_counter_level_list, zero_replenishment_level_list, df = replenishment_rates_level(completed_replenishment_master_list, incomplete_replenishment_master_list)
avg_replenishment_quantity_per_second_all_levels_all_dates_list
# Average quantity replenished per order per second for that level

NameError: name 'completed_replenishment_master_list' is not defined

In [339]:

# Works till this line


In [340]:
# completed_counter_level_list

NameError: name 'completed_counter_level_list' is not defined

In [341]:
def get_ratio_of_lists(list1, list2):
    ratio_list = []
    for i in range(10):
        print(f"{str(list1[i])} / {str(list2[i])}")
        ratio = list1[i] / list2[i]
        ratio_list.append(ratio)
    return ratio_list

def calculate_averages(avg_replenishment_quantity_per_second_all_levels_all_dates_list, completed_counter_level_list, incomplete_counter_level_list, zero_replenishment_level_list):
    
    
    avg_replenishment_quantity_per_second_all_levels_all_dates = [sum(vals)/len(vals) for vals in zip(*avg_replenishment_quantity_per_second_all_levels_all_dates_list)]
    ratio_zero_to_incomplete = get_ratio_of_lists(zero_replenishment_level_list[0], incomplete_counter_level_list[0])
    ratio_zero_to_completed = get_ratio_of_lists(zero_replenishment_level_list[0], completed_counter_level_list[0])
    ratio_complete_to_incomplete = get_ratio_of_lists(completed_counter_level_list[0], incomplete_counter_level_list[0])
    return avg_replenishment_quantity_per_second_all_levels_all_dates, ratio_zero_to_incomplete, ratio_zero_to_completed, ratio_complete_to_incomplete

avg_replenishment_quantity_per_second_all_levels_all_dates, ratio_zero_to_incomplete, ratio_zero_to_completed, ratio_complete_to_incomplete = calculate_averages(avg_replenishment_quantity_per_second_all_levels_all_dates_list, completed_counter_level_list, incomplete_counter_level_list, zero_replenishment_level_list)

NameError: name 'avg_replenishment_quantity_per_second_all_levels_all_dates_list' is not defined

In [335]:
# avg_replenishment_quantity_per_second_all_levels_all_dates

In [485]:
print(f"ratio_zero_to_incomplete :{ratio_zero_to_incomplete}\nratio_zero_to_completed: {ratio_zero_to_completed}\nratio_complete_to_incomplete : {ratio_complete_to_incomplete}")

ratio_zero_to_incomplete :[0.9993545611015491, 0.999128160418483, 0.9990448901623686, 0.9992307692307693, 0.9990448901623686, 0.9989873417721519, 0.9989748846745259, 0.9983379501385041, 0.9983853606027987, 0.9982003599280144]
ratio_zero_to_completed: [71.46153846153847, 63.666666666666664, 74.71428571428571, 57.733333333333334, 74.71428571428571, 61.65625, 62.87096774193548, 56.3125, 92.75, 50.42424242424242]
ratio_complete_to_incomplete : [0.013984509466437178, 0.015693112467306015, 0.013371537726838587, 0.01730769230769231, 0.013371537726838587, 0.01620253164556962, 0.015889287544848796, 0.01772853185595568, 0.010764262648008612, 0.01979604079184163]


In [486]:
avg_replenishment_quantity_per_second_all_levels_all_dates

[0.38548875866534427,
 0.2968284478761956,
 0.19869853940874252,
 0.2935878038702436,
 0.09307537261535587,
 0.1541179378052512,
 0.13174772956517022,
 0.054074532365372664,
 nan,
 nan]

In [92]:
replenishment_rates = [0.38548875866534427, 0.2968284478761956, 0.19869853940874252, 0.2935878038702436, 0.09307537261535587, 0.1541179378052512, 0.13174772956517022, 0.054074532365372664, 0.15, 0.2]
# result_list2

In [93]:
# Sum the completed and incompleted for those respective dates
# Find total
# Divide zero ounter by total for that level
# Find a percentage of zero for that level for that date
# Later we can take average of that and find on average how many 0s

In [94]:
# Let's use the above rates as replenishment rates for the 10 levels on the bid side
# We will compute better averages, and we will add for bid side too
# replenishment_rates = averages.copy()

In [95]:
# View the dates to choose to build the model
temp = date_range_ob_factors_df

In [96]:
# z = pd.to_datetime(y) - pd.Timedelta(days=1)
# z

In [97]:
# ob_factors

In [98]:
# vol_hour['hour'][7]

In [99]:
# total_row_count = len(vol_hour.loc[vol_hour['hour'] == 0].index)
# total_row_count

In [100]:
# ob_factors.columns

In [342]:
# vol_h = readparquet(path+"/"+str(file_names['VOLATILITY_HOUR'][0]))
# vol_h

In [487]:
def get_unique_trading_hours(xy):
    # Create a new DataFrame to store the unique Trading Hours for each Date
    unique_hours_df = pd.DataFrame(columns=['Date', 'Unique Trading Hours'])
    
    # Get a list of unique dates in the DataFrame
    unique_dates = xy['date'].unique()
    
    # Loop through each unique date
    for date in unique_dates:
        
        # Get the rows for the current date
        date_rows = xy[xy['date'] == date]
        
        # Get a list of unique Trading Hours for the current date
        unique_hours = date_rows['Trading Hours'].unique()
        
        # Add the unique Trading Hours to the new DataFrame
        temp_df = pd.DataFrame({'Date': date, 'Unique Trading Hours': [', '.join(unique_hours)]})
        unique_hours_df = pd.concat([unique_hours_df, temp_df], ignore_index=True)
    
    # Remove any duplicate rows in the new DataFrame
    unique_hours_df.drop_duplicates(inplace=True)
    
    # Return the new DataFrame
    return unique_hours_df

def add_time_from_hour(df):
    # Create a new column with the hour converted to a string
    df['hour_str'] = df['hour'].astype(str).str.zfill(2) + '0000'
    
    # Convert the new column to a datetime object
    df['start_time'] = pd.to_datetime(df['hour_str'], format='%H%M%S').dt.time
    
    # Create another column with the time value one hour more than 'time'
    df['end_time'] = (pd.to_datetime(df['hour_str'], format='%H%M%S') + pd.Timedelta(hours=1)).dt.time
    
    # Drop the hour_str column
    df = df.drop(columns=['hour_str'])
    
    # df = df.sort_values(by='volatility', ascending=False)

    # Return the modified dataframe
    return df

def vol_hour_loop(hours): # hours mean the number of volatile 1 hour windows u want to select from the whole day(2 dates)
                          # If 1, we take 1 hour from each file. 2 means 2 hours from each file.
                          # The hours if greater than 1, need not be in consecetive order
  for i in range(len(file_names['VOLATILITY_HOUR'])):
    vol_hour = readparquet(path+"/"+str(file_names['VOLATILITY_HOUR'][i]))
    # print(vol_hour)
    # print("Performing operations on the file...\n")
    vol_hour = add_time_from_hour(vol_hour)

    col_names = ['trade_date','volatility','start_time','end_time','hour']
    vol_hour = retain_columns(vol_hour,col_names)

    # convert the string column to a float column
    vol_hour['hour'] = vol_hour['hour'].astype(float)

    # Find the total count of 0 in the hour column
    total_row_count = vol_hour.loc[vol_hour['hour'] == 0].index
    # If 0 exists, it means there is data for 2 days.
    if total_row_count > 0:
      # find the row number of the value 0 in the column 'hour'
      row_number = vol_hour.loc[vol_hour['hour'] == 0].index[0]
      # We know CME's day starts differently. 
      # row_number helps us seperate the days in the df
      for j in range(row_number):
        # vol_hour['date']
        # Subtract one day from 'date' column
        vol_hour.loc[j, 'trade_date'] = pd.to_datetime(vol_hour['trade_date'][j]) - pd.Timedelta(days=1)


    # seperate that unto a df, change the date to 1 day prior to the date value in the cell
    # We do this because CME's day starts and end at unique times.
    # reorder them in descending order of volatility, append it to a df
    # seperate the df below the 0 including the 0 row. Then reorder in descending order of volatility
    # Append thesr rows to the df.
    # Now we have 2 dates and their volatilities in descending order. 
    # we can just filter the df and get the start and end time of highest vol period


        vol_hour = vol_hour.sort_values(by='volatility', ascending=False)
    # print(f"Completed operations.\n{vol_hour}")
    
    vol_hour = vol_hour.reset_index(inplace=False)
    vol_hour = vol_hour.drop('index', axis=1)
    
    # Extracting first X = len(date_index_level), columns from vol_hour. Only these dates will be used to build the model
    vol_hour = vol_hour[:hours]
    # print(f"We only use first {date_index_level} dates to build the model.\nThen we use {date_index_level+1} to use as validation for training and fine tuning the model.\nThen we use last {len(date_range_ob_factors_df)-date_index_level} for testing.")
    if i==0:
      vol = vol_hour
      # print(f"Creating df...\n")
    else:
      # print(f"Concatenating...\n")
      vol = pd.concat([vol, vol_hour])
  
  vol_hour = vol_hour.reset_index()
  print("Factors files have been processed and required information as been sent..\n")
  return vol

def check_within_date_range(date_range_df, hours):
  """
  Checks whether the start_time and end_time in the vol_hour dataframe is within the date range of the date_range_df.

  Args:
  - vol_hour: a pandas DataFrame containing columns 'trade_date', 'volatility', 'start_time', and 'end_time'
  - date_range_df: a pandas DataFrame containing columns 'date', 'min_time', and 'max_time'

  Returns:
  - A pandas DataFrame containing the same columns as the input DataFrame with an additional column 'within_date_range'
    indicating whether the start_time and end_time is within the date range of the date_range_df
  """
  t = vol_hour_loop(hours)
  vol_hour = t.copy()
  vol_hour['start_time'] = pd.to_datetime(t['start_time'], format='%H:%M:%S').dt.time
  vol_hour['end_time'] = pd.to_datetime(t['end_time'], format='%H:%M:%S').dt.time

  date_range_df['date'] = pd.to_datetime(date_range_df['date'], format='%Y-%m-%d')
  date_range_df['min_time'] = pd.to_datetime(date_range_df['min_time'], format='%H:%M:%S').dt.time
  date_range_df['max_time'] = pd.to_datetime(date_range_df['max_time'], format='%H:%M:%S').dt.time

  # Create a list to store the results
  results = []

  vol_hour['date_end_time'] = np.nan
  vol_hour['date_start_time'] = np.nan
  # Loop through each row in vol_hour dataframe
  for index, row in vol_hour.iterrows():

    # Get the trade_date, start_time and end_time from the row
    trade_date = row['trade_date']
    # trade_date = trade_date.date()
    start_time = row['start_time']
    end_time = row['end_time']
    # print(f"trade_date: {trade_date}")
    # print(f"start_time: {start_time}")
    # print(f"end_time: {end_time}")
    vol_hour.loc[index, 'date_end_time'] = str(pd.to_datetime(str(trade_date) + " " + str(end_time)))
    vol_hour.loc[index, 'date_start_time'] = str(pd.to_datetime(str(trade_date) + " " + str(start_time)))
    # row.loc['date_end_time'] = str(row['trade_date']) + " " + str(row['end_time'])
    # row.loc['date_start_time'] = str(row['trade_date']) + " " + str(row['start_time'])

    # print(f"row.loc['date_end_time'] : {row.loc['date_end_time'] }")
    # print(f"row.loc['date_start_time']: {row.loc['date_start_time']}\n")

    # Get the date range dataframe for the current trade_date
    date_range = date_range_df[date_range_df['date'] == trade_date]

    # Check if start_time and end_time are between min_time and max_time for the current trade_date
    if len(date_range) > 0 and (start_time >= date_range['min_time'].iloc[0]) and (end_time <= date_range['max_time'].iloc[0]):
      result = 'Yes'
    else:
      print(f"Error: Index out of bounds for trade_date={trade_date}, start_time={start_time}, and end_time={end_time}")
      print(f"Min time: {date_range['min_time'].iloc[0]}, Max time: {date_range['max_time'].iloc[0]}")
      result = 'No'

    # # Append the result to the list of results
    results.append(result)

  # Add the list of results as a new column to the vol_hour dataframe
  vol_hour['within_date_range'] = results

  return vol_hour

def filter_xy_by_datetime_range(xy, vol_hour):
    # Create an empty DataFrame to store the filtered results
    filtered_xy = pd.DataFrame(columns=xy.columns)
    
    # Loop through each row in the vol_hour DataFrame
    for _, row in vol_hour.iterrows():
         
        # Get the trade_date, start_time, and end_time from the current row
        trade_date = row['trade_date']
        start_time = row['start_time']
        end_time = row['end_time']
        
        # Filter the rows in the xy DataFrame for the current trade_date
        xy_date_rows = xy[xy['date'] == trade_date]
        temp5 = xy_date_rows['delp'].shift(1)
        xy_date_rows.loc[:, 'delp'] = temp5

        # Filter the rows based on the start and end times for the current date
        filtered_rows = xy_date_rows[(xy_date_rows['transacttime'] >= start_time) & (xy_date_rows['transacttime'] <= end_time)]
        
        # Append the filtered rows to the results DataFrame
        filtered_xy = pd.concat([filtered_xy, filtered_rows])
    
    # Reset the index of the filtered DataFrame
    filtered_xy = filtered_xy.reset_index(drop=True)
    
    # Return the filtered DataFrame
    return filtered_xy

def retain_columns(df, col_names):
    """
    Retains only specified columns in the input dataframe.

    Args:
        df (pandas.DataFrame): Input dataframe.
        col_names (list): List of names of columns to be retained.

    Returns:
        pandas.DataFrame: Dataframe with only specified columns.
    """
    # Check if all columns in col_names exist in df
    for col in col_names:
        if col not in df.columns:
            raise ValueError(f"Column {col} not found in dataframe.")

    # Select only the specified columns
    df = df.loc[:, col_names]

    return df

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
def model_building(ob_factors, model_choice, date_range_ob_factors_df, train_test_validation_date_indexes, hours, date_index_level):
    ob_factors_copy = ob_factors.copy()
    print(f"ob_factors: {ob_factors.info()}\n")
    # vol_hour = vol_hour_loop(hours)
    # hours = 1

    vol_hour = check_within_date_range(date_range_ob_factors_df, hours)
    
    vol_hour['start_time'] = pd.to_datetime(vol_hour['start_time'], format='%H:%M:%S').dt.time
    vol_hour['end_time'] = pd.to_datetime(vol_hour['end_time'], format='%H:%M:%S').dt.time
    vol_hour.reset_index(inplace=True)
    vol_hour = vol_hour.drop('index', axis=1)
    ob_factors_copy['date'] = pd.to_datetime(ob_factors_copy['date'], format='%Y-%m-%d')
    ob_factors_copy['transacttime'] = pd.to_datetime(ob_factors_copy['transacttime'], format='%H:%M:%S').dt.time
    
    vol_hour['train'] = 0
    vol_hour['validate'] = 0
    vol_hour['test'] = 0

    for a in range(len(vol_hour)):
      if a < date_index_level:
        vol_hour.loc[a,'train'] = 1
      elif a == date_index_level:
          vol_hour.loc[a,'validate'] = 1
      elif a > date_index_level:
        vol_hour.loc[a,'test'] = 1

    # We take only training data to train model
    vol_hour_train = vol_hour[vol_hour['train']==1]
    # vol_hour_validate = vol_hour[date_index_level:date_index_level+1]
    # vol_hour_test = vol_hour[date_index_level+1:]

    # print(f"vol_hour: {vol_hour}")
    # print(f"vol_hour_train: {vol_hour_train}")
    xy = filter_xy_by_datetime_range(ob_factors_copy, vol_hour_train)
    print(f"Length of xy: {len(xy)}\n")
    print(f"xy.columns: {xy.columns}\n")
    # xy = xy.iloc[:, 1:]

    # seconds_in_1_hour = 60*60
    # xy_train = xy[:date_index_level*seconds_in_1_hour]
    # xy_val = xy[date_index_level*seconds_in_1_hour:date_index_level*(1+seconds_in_1_hour)]
    # xy_test = xy[date_index_level*(1+seconds_in_1_hour):]
    # Using the indexes, we split the data into the dates that will be used for training, testing and validation
    # 4 Means it splits into testing training and validation
    # unique_trading_hours_df = get_unique_trading_hours(xy)

    # date_indexes = get_index_in_ob_factors(train_test_validation_date_indexes)

    # # for sliced_data in sliced_data_dfs:
    # # df_modified = sliced.loc[max_index:max_index+duration, ['delp', 'Mean Reversion Lag', 'Level 1 Order Flow Imbalance', 'Level 2 Order Flow Imbalance', 'Level 3 Order Flow Imbalance', 'Level 4 Order Flow Imbalance', 'Level 5 Order Flow Imbalance', 'Level 6 Order Flow Imbalance', 'Level 7 Order Flow Imbalance', 'Level 8 Order Flow Imbalance', 'Level 9 Order Flow Imbalance', 'Level 10 Order Flow Imbalance']]
    x_columns = ['netbuy', 'mean_rev_lag', 'level1ofi', 'level2ofi', 'level3ofi',
                'level4ofi', 'level5ofi', 'level6ofi', 'level7ofi', 'level8ofi',
                'level9ofi', 'level10ofi']
    x_train = xy[x_columns]
    y_train = xy[['delp']]
    # X = xy[x_columns]
    # y = xy[['delp']]
    
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    # print(f"XXXXXXXXXXX\nx_train: {X_train.columns}\ny_train: {y_train.columns}\nXXXXXXXXXXX\n")
    # 2nd copy of x_train y_train
    # other_columns = [col for col in xy.columns if col not in x_columns]
    other_df = xy.copy()
    # Rename first column
    # other_df = other_df.rename(index={0: "Inner Index"})
    # other_df['Inner Index'] = other_df['Inner Index'] - other_df['Inner Index']
    # x_train_snapshot = xy[x_columns2]
    # x_train, y_train = data_

    if model_choice == 'linear_regression':
        # Linear Regression
        model = LinearRegression()
        model.fit(x_train, y_train)
        # lr_r2 = round(model.score(X_train, y_train), 5)
        # print(f"XXXXXXXXXXX\nLinear Regression training R-squared: {lr_r2}")
        # y_pred_test = model.predict(X_test)
        # r2 = r2_score(y_test, y_pred_test)
        # print(f"Linear Regression testing and predicting R-squared: {r2}\nXXXXXXXXXXX\n")
    elif model_choice == 'ridge_regression':
        # Ridge Regression
        model = Ridge(alpha=1)
        model.fit(x_train, y_train)
        ridge_r2 = round(model.score(x_train, y_train), 5)
        print("Ridge Regression R-squared: ", ridge_r2)
    elif model_choice == 'svr':
        # SVR
        model = SVR(kernel='linear')
        # model.fit(x_train, y_train)
        # # svr_r2 = round(model.score(x_train, y_train), 5)
        # # print("SVM Regression R-squared: ", svr_r2)
    elif model_choice == 'knn':
        # KNN Regressor
        model = KNeighborsRegressor(n_neighbors=5)
        model.fit(x_train, y_train)
        knn_r2 = round(model.score(x_train, y_train), 5)
        print("KNN Regression R-squared: ", knn_r2)
    elif model_choice == 'rf':
        # Random Forest Regressor
        model = RandomForestRegressor(n_estimators=75, random_state=42)
        model.fit(x_train, y_train.values.ravel())
        rf_r2 = round(model.score(x_train, y_train), 5)
        print("Random Forest Regression R-squared: ", rf_r2)


    # Predict on the test data
    
    #r2_test = r2_score(y_test, y_pred_test)
    #adjusted_r2_test = 1 - (1-model.score(X_test, y_test))*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
    #r2_temp.append(r2)

    # Calculate the mean squared error
    #mse_test = mean_squared_error(y_test, y_pred_test)
    #mse_temp.append(mse)

    # y_pred_test = model.predict(test)
    
    # return y_pred_test
    return model, vol_hour, x_train, y_train, other_df


In [488]:
# vol_hour = readparquet(path+"/"+str(file_names['VOLATILITY_HOUR'][0]))
# vol_hour

In [489]:
# # xy_val
# 'datetime', 'date', 'transacttime',
#        'level_10_bid_quantity', 'level_10_bid_orders', 'level_10_bid_price',
#        'level_9_bid_quantity', 'level_9_bid_orders', 'level_9_bid_price',
#        'level_8_bid_quantity', 'level_8_bid_orders', 'level_8_bid_price',
#        'level_7_bid_quantity', 'level_7_bid_orders', 'level_7_bid_price',
#        'level_6_bid_quantity', 'level_6_bid_orders', 'level_6_bid_price',
#        'level_5_bid_quantity', 'level_5_bid_orders', 'level_5_bid_price',
#        'level_4_bid_quantity', 'level_4_bid_orders', 'level_4_bid_price',
#        'level_3_bid_quantity', 'level_3_bid_orders', 'level_3_bid_price',
#        'level_2_bid_quantity', 'level_2_bid_orders', 'level_2_bid_price',
#        'level_1_bid_quantity', 'level_1_bid_orders', 'level_1_bid_price',
#        'level_1_ask_price', 'level_1_ask_orders', 'level_1_ask_quantity',
#        'level_2_ask_price', 'level_2_ask_orders', 'level_2_ask_quantity',
#        'level_3_ask_price', 'level_3_ask_orders', 'level_3_ask_quantity',
#        'level_4_ask_price', 'level_4_ask_orders', 'level_4_ask_quantity',
#        'level_5_ask_price', 'level_5_ask_orders', 'level_5_ask_quantity',
#        'level_6_ask_price', 'level_6_ask_orders', 'level_6_ask_quantity',
#        'level_7_ask_price', 'level_7_ask_orders', 'level_7_ask_quantity',
#        'level_8_ask_price', 'level_8_ask_orders', 'level_8_ask_quantity',
#        'level_9_ask_price', 'level_9_ask_orders', 'level_9_ask_quantity',
#        'level_10_ask_price', 'level_10_ask_orders', 'level_10_ask_quantity'

In [474]:
# impo
model_choice = ['linear_regression','rf']
train_test_validation_date_indexes = 0
date_index_level = 12
hours = 1
# date_range_ob_factors_df_copy = date_range_ob_factors_df.copy()
# date_range_ob_factors_df_copy = date_range_ob_factors_df_copy[:-8]
# By modifying the date range_ob_factors_df, we can 
model, vol_hour, x_train, y_train, other_df = model_building(ob_factors, model_choice[1], date_range_ob_factors_df, train_test_validation_date_indexes, hours, date_index_level)
vol_hour

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1199800 entries, 0 to 1199799
Data columns (total 79 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1199800 non-null  int64  
 1   datetime               1199800 non-null  object 
 2   date                   1199800 non-null  object 
 3   transacttime           1199800 non-null  object 
 4   level_10_bid_quantity  1199800 non-null  float64
 5   level_10_bid_orders    1199800 non-null  float64
 6   level_10_bid_price     1199800 non-null  float64
 7   level_9_bid_quantity   1199800 non-null  float64
 8   level_9_bid_orders     1199800 non-null  float64
 9   level_9_bid_price      1199800 non-null  float64
 10  level_8_bid_quantity   1199800 non-null  float64
 11  level_8_bid_orders     1199800 non-null  float64
 12  level_8_bid_price      1199800 non-null  float64
 13  level_7_bid_quantity   1199800 non-null  float64
 14  level_7_bid_orders

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

Length of xy: 43212

xy.columns: Index(['Unnamed: 0', 'datetime', 'date', 'transacttime',
       'level_10_bid_quantity', 'level_10_bid_orders', 'level_10_bid_price',
       'level_9_bid_quantity', 'level_9_bid_orders', 'level_9_bid_price',
       'level_8_bid_quantity', 'level_8_bid_orders', 'level_8_bid_price',
       'level_7_bid_quantity', 'level_7_bid_orders', 'level_7_bid_price',
       'level_6_bid_quantity', 'level_6_bid_orders', 'level_6_bid_price',
       'level_5_bid_quantity', 'level_5_bid_orders', 'level_5_bid_price',
       'level_4_bid_quantity', 'level_4_bid_orders', 'level_4_bid_price',
       'level_3_bid_quantity', 'level_3_bid_orders', 'level_3_bid_price',
       'level_2_bid_quantity', 'level_2_bid_orders', 'level_2_bid_price',
       'level_1_bid_quantity', 'level_1_bid_orders', 'level_1_bid_price',
       'level_1_ask_price', 'level_1_ask_orders', 'level_1_ask_quantity',
       'level_2_ask_price', 'level_2_ask_orders', 'level_2_ask_quantity',
       'level_3_ask

Unnamed: 0,trade_date,volatility,start_time,end_time,hour,date_end_time,date_start_time,within_date_range,train,validate,test
0,2020-01-29,0.000112,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
1,2020-03-18,0.000365,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
2,2020-04-29,0.000129,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
3,2020-06-10,0.000167,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
4,2020-07-29,8.8e-05,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
5,2020-08-26,7.1e-05,12:00:00,13:00:00,12.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
6,2020-09-16,0.00013,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
7,2020-11-03,0.000232,18:00:00,19:00:00,18.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
8,2020-12-16,0.000122,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
9,2021-01-27,8.6e-05,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0


In [475]:
model

RandomForestRegressor(n_estimators=75, random_state=42)

In [476]:
other_df

Unnamed: 0.1,Unnamed: 0,datetime,date,transacttime,level_10_bid_quantity,level_10_bid_orders,level_10_bid_price,level_9_bid_quantity,level_9_bid_orders,level_9_bid_price,level_8_bid_quantity,level_8_bid_orders,level_8_bid_price,level_7_bid_quantity,level_7_bid_orders,level_7_bid_price,level_6_bid_quantity,level_6_bid_orders,level_6_bid_price,level_5_bid_quantity,level_5_bid_orders,level_5_bid_price,level_4_bid_quantity,level_4_bid_orders,level_4_bid_price,level_3_bid_quantity,level_3_bid_orders,level_3_bid_price,level_2_bid_quantity,level_2_bid_orders,level_2_bid_price,level_1_bid_quantity,level_1_bid_orders,level_1_bid_price,level_1_ask_price,level_1_ask_orders,level_1_ask_quantity,level_2_ask_price,level_2_ask_orders,level_2_ask_quantity,level_3_ask_price,level_3_ask_orders,level_3_ask_quantity,level_4_ask_price,level_4_ask_orders,level_4_ask_quantity,level_5_ask_price,level_5_ask_orders,level_5_ask_quantity,level_6_ask_price,level_6_ask_orders,level_6_ask_quantity,level_7_ask_price,level_7_ask_orders,level_7_ask_quantity,level_8_ask_price,level_8_ask_orders,level_8_ask_quantity,level_9_ask_price,level_9_ask_orders,level_9_ask_quantity,level_10_ask_price,level_10_ask_orders,level_10_ask_quantity,netbuy,mean_rev_lag,level1ofi,level2ofi,level3ofi,level4ofi,level5ofi,level6ofi,level7ofi,level8ofi,level9ofi,level10ofi,midprice,delp,prev_midprice
0,72000,2020-01-29 13:00:00,2020-01-29,13:00:00,1060.0,6.0,130.703125,535.0,9.0,130.718750,757.0,6.0,130.734375,503.0,10.0,130.750000,1069.0,11.0,130.765625,232.0,7.0,130.781250,22.0,8.0,130.796875,30.0,15.0,130.812500,392.0,20.0,130.828125,45.0,1.0,130.843750,130.859375,23.0,310.0,130.875000,20.0,228.0,130.890625,22.0,331.0,130.906250,9.0,403.0,130.921875,21.0,286.0,130.937500,15.0,640.0,130.953125,8.0,212.0,130.968750,13.0,558.0,130.984375,27.0,179.0,131.000000,49.0,939.0,-19.0,0.441578,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.851562,0.000000,130.851562
1,72001,2020-01-29 13:00:01,2020-01-29,13:00:01,1060.0,6.0,130.703125,535.0,9.0,130.718750,757.0,6.0,130.734375,503.0,10.0,130.750000,1069.0,11.0,130.765625,232.0,7.0,130.781250,22.0,8.0,130.796875,30.0,15.0,130.812500,392.0,20.0,130.828125,78.0,4.0,130.843750,130.859375,23.0,249.0,130.875000,20.0,228.0,130.890625,22.0,331.0,130.906250,9.0,403.0,130.921875,21.0,286.0,130.937500,15.0,640.0,130.953125,8.0,212.0,130.968750,13.0,558.0,130.984375,27.0,179.0,131.000000,49.0,939.0,-63.0,0.353262,-8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.851562,0.000000,130.851562
2,72002,2020-01-29 13:00:02,2020-01-29,13:00:02,1060.0,6.0,130.703125,535.0,9.0,130.718750,757.0,6.0,130.734375,503.0,10.0,130.750000,1069.0,11.0,130.765625,232.0,7.0,130.781250,22.0,8.0,130.796875,30.0,15.0,130.812500,393.0,21.0,130.828125,72.0,5.0,130.843750,130.859375,22.0,233.0,130.875000,21.0,244.0,130.890625,22.0,331.0,130.906250,9.0,403.0,130.921875,21.0,286.0,130.937500,15.0,640.0,130.953125,8.0,212.0,130.968750,13.0,558.0,130.984375,27.0,179.0,131.000000,49.0,939.0,0.0,0.282610,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.851562,0.000000,130.851562
3,72003,2020-01-29 13:00:03,2020-01-29,13:00:03,1063.0,7.0,130.703125,538.0,10.0,130.718750,760.0,7.0,130.734375,506.0,11.0,130.750000,1080.0,12.0,130.765625,232.0,7.0,130.781250,22.0,8.0,130.796875,30.0,15.0,130.812500,393.0,21.0,130.828125,72.0,5.0,130.843750,130.859375,27.0,252.0,130.875000,20.0,228.0,130.890625,22.0,331.0,130.906250,9.0,403.0,130.921875,21.0,286.0,130.937500,15.0,640.0,130.953125,9.0,215.0,130.968750,14.0,561.0,130.984375,28.0,182.0,131.000000,50.0,942.0,21.0,0.226088,1.0,21.0,0.0,3.0,27.0,19.0,16.0,16.0,16.0,13.0,130.851562,0.000000,130.851562
4,72004,2020-01-29 13:00:04,2020-01-29,13:00:04,1091.0,9.0,130.703125,577.0,13.0,130.718750,779.0,9.0,130.734375,525.0,13.0,130.750000,1088.0,13.0,130.765625,251.0,9.0,130.781250,33.0,9.0,130.796875,150.0,16.0,130.812500,414.0,27.0,130.828125,48.0,4.0,130.843750,130.859375,25.0,243.0,130.875000,20.0,228.0,130.890625,22.0,331.0,130.906250,10.0,411.0,130.921875,22.0,294.0,130.937500,17.0,659.0,130.953125,11.0,234.0,130.968750,16.0,580.0,130.984375,31.0,221.0,131.000000,50.0,957.0,0.0,0.180870,20.0,-3.0,120.0,3.0,0.0,0.0,0.0,-178.0,0.0,-8.0,130.851562,0.000000,130.851562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43207,910803,2021-04-28 13:59:56,2021-04-28,13:59:56,2510.0,179.0,131.953125,2569.0,181.0,131.968750,2585.0,167.0,131.984375,4520.0,178.0,132.000000,2953.0,219.0,132.015625,3497.0,212.0,132.031250,3757.0,197.0,132.046875,3784.0,153.0,132.062500,3097.0,240.0,132.078125,1894.0,120.0,132.093750,132.109375,128.0,2006.0,132.125000,170.0,2175.0,132.140625,235.0,3577.0,132.156250,204.0,2775.0,132.171875,160.0,2514.0,132.187500,143.0,2918.0,132.203125,129.0,2666.0,132.218750,124.0,4102.0,132.234375,122.0,3652.0,132.250000,143.0,5780.0,262.0,0.430399,-341.0,838.0,632.0,41.0,-16.0,-3.0,3.0,0.0,3.0,7.0,132.101562,0.000000,132.101562
43208,910804,2021-04-28 13:59:57,2021-04-28,13:59:57,2509.0,178.0,131.953125,2572.0,182.0,131.968750,2585.0,167.0,131.984375,4520.0,178.0,132.000000,2800.0,203.0,132.015625,3497.0,212.0,132.031250,3755.0,198.0,132.046875,4109.0,152.0,132.062500,3047.0,235.0,132.078125,1911.0,111.0,132.093750,132.109375,44.0,1711.0,132.125000,170.0,2510.0,132.140625,238.0,3572.0,132.156250,204.0,2723.0,132.171875,163.0,2530.0,132.187500,143.0,2918.0,132.203125,129.0,2666.0,132.218750,124.0,4102.0,132.234375,122.0,3652.0,132.250000,142.0,5773.0,104.0,0.344320,90.0,-9.0,-383.0,-5.0,7.0,-12.0,0.0,0.0,0.0,0.0,132.101562,0.000000,132.101562
43209,910805,2021-04-28 13:59:58,2021-04-28,13:59:58,2509.0,178.0,131.953125,2572.0,182.0,131.968750,2585.0,167.0,131.984375,4520.0,178.0,132.000000,2788.0,202.0,132.015625,3497.0,212.0,132.031250,3755.0,198.0,132.046875,3735.0,153.0,132.062500,3062.0,239.0,132.078125,2155.0,122.0,132.093750,132.109375,55.0,1861.0,132.125000,173.0,2534.0,132.140625,239.0,3581.0,132.156250,203.0,2728.0,132.171875,162.0,2523.0,132.187500,143.0,2918.0,132.203125,129.0,2666.0,132.218750,124.0,4102.0,132.234375,122.0,3652.0,132.250000,142.0,5773.0,-1008.0,0.275456,-6786.0,-7985.0,-6437.0,-6933.0,-6182.0,-5302.0,-7418.0,-5251.0,-6674.0,-6137.0,132.101562,0.000000,132.101562
43210,910806,2021-04-28 13:59:59,2021-04-28,13:59:59,2758.0,165.0,131.937500,2512.0,177.0,131.953125,2572.0,182.0,131.968750,2588.0,168.0,131.984375,4511.0,177.0,132.000000,2803.0,205.0,132.015625,3505.0,214.0,132.031250,4398.0,197.0,132.046875,2749.0,145.0,132.062500,1591.0,149.0,132.078125,132.093750,76.0,2343.0,132.109375,139.0,3605.0,132.125000,175.0,3332.0,132.140625,246.0,3186.0,132.156250,202.0,2708.0,132.171875,158.0,2505.0,132.187500,140.0,2901.0,132.203125,129.0,2666.0,132.218750,125.0,4108.0,132.234375,120.0,3640.0,199.0,-0.579635,1972.0,328.0,-398.0,-19.0,-32.0,-1.0,0.0,0.0,3.0,-27.0,132.085938,0.000000,132.101562


In [348]:
12*60*60

43200

In [477]:
# y_train

In [478]:
x_train

Unnamed: 0,netbuy,mean_rev_lag,level1ofi,level2ofi,level3ofi,level4ofi,level5ofi,level6ofi,level7ofi,level8ofi,level9ofi,level10ofi
0,-19.0,0.441578,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-63.0,0.353262,-8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.282610,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21.0,0.226088,1.0,21.0,0.0,3.0,27.0,19.0,16.0,16.0,16.0,13.0
4,0.0,0.180870,20.0,-3.0,120.0,3.0,0.0,0.0,0.0,-178.0,0.0,-8.0
...,...,...,...,...,...,...,...,...,...,...,...,...
43207,262.0,0.430399,-341.0,838.0,632.0,41.0,-16.0,-3.0,3.0,0.0,3.0,7.0
43208,104.0,0.344320,90.0,-9.0,-383.0,-5.0,7.0,-12.0,0.0,0.0,0.0,0.0
43209,-1008.0,0.275456,-6786.0,-7985.0,-6437.0,-6933.0,-6182.0,-5302.0,-7418.0,-5251.0,-6674.0,-6137.0
43210,199.0,-0.579635,1972.0,328.0,-398.0,-19.0,-32.0,-1.0,0.0,0.0,3.0,-27.0


In [479]:
# x_train
# y_train
temp = vol_hour.copy()
temp = temp.drop(['date_end_time', 'date_start_time', 'within_date_range'], axis=1)
col_names = [('trade_date','Date'),('volatility','Volatility'),('start_time','Start Time'),('end_time','End Time'),('hour','Hour'),('train','Train'),('validate','Validate'),('test','Test')]
temp = rename_column(temp,col_names)
temp = temp[['Date','Start Time','End Time','Hour','Volatility']]
temp
# vol_hour['date_end_time'][0]
# model
# ob_factors['datetime'][0]

Unnamed: 0,Date,Start Time,End Time,Hour,Volatility
0,2020-01-29,13:00:00,14:00:00,13.0,0.000112
1,2020-03-18,13:00:00,14:00:00,13.0,0.000365
2,2020-04-29,13:00:00,14:00:00,13.0,0.000129
3,2020-06-10,13:00:00,14:00:00,13.0,0.000167
4,2020-07-29,13:00:00,14:00:00,13.0,8.8e-05
5,2020-08-26,12:00:00,13:00:00,12.0,7.1e-05
6,2020-09-16,13:00:00,14:00:00,13.0,0.00013
7,2020-11-03,18:00:00,19:00:00,18.0,0.000232
8,2020-12-16,13:00:00,14:00:00,13.0,0.000122
9,2021-01-27,13:00:00,14:00:00,13.0,8.6e-05


In [480]:
temp2 = date_range_ob_factors_df.copy()
# temp = temp.drop(['date_end_time', 'date_start_time', 'within_date_range'], axis=1)
col_names = [('date','Date'),('min_time','Start Time'),('max_time','End Time'),('no_of_ob_updates','# of seconds')]
temp2 = rename_column(temp2,col_names)
# temp = temp[['Trade Date','Start Time','End Time','Hour','Volatility']]
temp2

Unnamed: 0,Date,Start Time,End Time,# of seconds
0,2020-01-28,17:00:00,23:59:59,25200
1,2020-01-29,00:00:00,16:00:00,57601
2,2020-03-17,17:00:00,23:59:57,25198
3,2020-03-18,00:00:00,16:00:00,57601
4,2020-04-29,00:00:00,16:00:00,57601
5,2020-06-09,17:00:00,23:59:59,25200
6,2020-06-10,00:00:00,16:00:00,57601
7,2020-07-28,17:00:00,23:59:57,25198
8,2020-07-29,00:00:00,16:00:00,57601
9,2020-08-25,17:00:00,23:59:59,25200


In [481]:
from datetime import datetime, timedelta

def get_val_data(temp, ob_factors):
    # Get the trade_date, start_time and end_time from the row
    trade_date = temp['Date'][12]
    # trade_date = trade_date.date()
    start_time = temp['Start Time'][12]
    end_time = temp['End Time'][12]
    # print(f"trade_date: {trade_date}")
    # print(f"start_time: {start_time}")
    # # print(f"end_time: {end_time}")
    # start_time = str(pd.to_datetime(str(trade_date) + " " + str(start_time)))
    # end_time = str(pd.to_datetime(str(trade_date) + " " + str(end_time)))

    start_time = str(pd.to_datetime(str(trade_date) + " " + str(start_time)))
    start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') # convert string to datetime object
    start_time = start_time - timedelta(minutes=30) # add 30 minutes
    start_time = start_time.strftime('%Y-%m-%d %H:%M:%S') # convert datetime object back to string

    end_time = str(pd.to_datetime(str(trade_date) + " " + str(end_time)))
    end_time = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S') # convert string to datetime object
    end_time = end_time + timedelta(minutes=30) # add 30 minutes
    end_time = end_time.strftime('%Y-%m-%d %H:%M:%S') # convert datetime object back to string

    validation_data = window_slicer(ob_factors, start_time, end_time)
    validation_data.reset_index(inplace=True)
    validation_data = validation_data.drop('index', axis=1)

    return validation_data

validation_data = get_val_data(temp, ob_factors)
# validation_data

In [482]:
# x_train: Index(['netbuy', 'mean_rev_lag', 'midprice', 'level1ofi', 'level2ofi',
#        'level3ofi', 'level4ofi', 'level5ofi', 'level6ofi', 'level7ofi',
#        'level8ofi', 'level9ofi', 'level10ofi'],
#       dtype='object')
# y_train: Index(['delp'], dtype='object')

In [18]:
replenishment_dict_to_add = {'bids': [{'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0}],
                                    'asks': [{'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0}],
                                    'start_time': start_time,
                                    'end_time': end_time,
                                    'replenishment': 1}

replenishment_dict_to_add

{'bids': [{'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
  {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
  {'price': 0.0, 'quantity': 0, 'orders': 0}],
 'asks': [{'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0},
  {'price': 0.0, 'quantity': 0, 'orders': 0.0},
  {'price': 0.0, 'quantity': 0, 'orders': 0.0},
  {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
  {'price': 0.0, 'quantity': 0.0, 'orders': 0}],
 'start_time': '2020-01-28 17:00:00',
 'end_

In [483]:
def replenished_or_not(replenishment_rates, level_indexes_neg_list, m2_list, start_time, end_time, completed_replenishment_dict):
    # Use these indexes to see if those levels have been replenished or not

    level_replenishments_counts = [0,0,0,0,0,0,0,0,0,0]
    
    # Iterate thru all level_indexes_neg_list[i] = level_indexes_neg which contains list of indexes which has negative quantity aka removing liquidity
    for i in range(len(level_indexes_neg_list)):
        # Extract level_indexes_neg
        level_indexes_neg = level_indexes_neg_list[i]
        for side in ['bids']:
            # Counter to keep track of total replenished levels
            level_replenishment_completed = 0
            # Keep counter for every level individually.
            
            # This list will be of length 10. why? Its pretty obvious
            
            # Iterate over all the levels which have a negative quantity in m2_list[i]
            for level in level_indexes_neg:
                
                # Extract replenish_dict to just check conditions,
                replenish_dict = m2_list[i][0][side][level]['replenish']
                # Check if replenishment is completed at that level
                if replenish_dict['replenishment_done'] == 1:
                    # Keep a cumulattive count of completion for whole side
                    level_replenishment_completed += 1
                    # level_replenishment_dict = replenish_dict
                    if replenish_dict['appended'] == 0:
                        completed_replenishment_dict[side][level].append(replenish_dict)
                        m2_list[i][0][side][level]['replenish']['appended'] = 1
                else:

                    level_replenishments_counts[level] += 1
                    # print(f"level_replenishments_counts[level]: {level_replenishments_counts[level]}")
                    # After the full loop, this will have total count for that level.
                    # We get the count because our replenishment rate is for a single order
                    # We multiply total count with rate to get replenishment quantity

            # If all replenishment completed at all possible levels for that m1_list[0]
            if level_replenishment_completed == len(level_indexes_neg):
                # Mark m1_list as completed
                m2_list[i][0]['replenishment_completed'] = 1
            # If replenishment not completed, check other side
            else:
                continue

    # orders will contain the count values
    # quantity will contain the multiplied value
    # replenishment_dict = replenishment_rates * level_replenishments_counts
    # Template with zeroes
    replenishment_dict_to_add = {'bids': [{'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0}],
                                    'asks': [{'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                    {'price': 0.0, 'quantity': 0.0, 'orders': 0}],
                                    'start_time': start_time,
                                    'end_time': end_time,
                                    'replenishment': 1}
    
    # If no replenishments to do, return dict with zeroes
    if sum(level_replenishments_counts) == 0:
        return replenishment_dict_to_add, completed_replenishment_dict

    for side in ['bids']:
        for index, level_dict in enumerate(replenishment_dict_to_add[side]):
            level_dict['quantity'] = replenishment_rates[index]*level_replenishments_counts[index]
            # print(f"Quantity: {level_dict['quantity']}")
            level_dict['orders'] = level_replenishments_counts[index]
            # print(f"orders: {level_dict['orders']}")

    return replenishment_dict_to_add, completed_replenishment_dict

def extract_incomplete_replenishments(m2_list, incomplete_replenishment_dict):
    # This function is called only once at the end
    # We get the indexes we need
    level_indexes_neg_list, level_count_neg_list = m2_list_neg_level_index(m2_list)
    for i in range(len(level_indexes_neg_list)):
        # Extract level_indexes_neg
        level_indexes_neg = level_indexes_neg_list[i]
        for side in ['bids']:
            # Counter to keep track of total replenished levels
            # level_replenishment_completed = 0
            # Iterate over all the negative levels in m2_list[i]
            for level in level_indexes_neg:
                
                # Extract replenish_dict to just check conditions,
                replenish_dict = m2_list[i][0][side][level]['replenish']
                
                # Check if replenishment is not completed at that level
                # All levels at the end will be only levels which are not fully replenished
                if replenish_dict['replenishment_done']==0:
                    # We check if this has already been added to the dict
                    if replenish_dict['appended'] == 0:
                        incomplete_replenishment_dict[side][level].append(replenish_dict)
                        m2_list[i][0][side][level]['replenish']['appended'] = 1

    return m2_list, incomplete_replenishment_dict

def replenish_past_algo_trades(level_indexes_neg_list, m2_list, m3_list, replenishment_quantity_dict, completed_replenishment_dict, incomplete_replenishment_dict):
    
    # Keep an original copy because if there is excess quantity after one m1_list
    # we will maintain this copy by reducing quantity everytime we replenish
    replenishment_quantity_dict_copy = replenishment_quantity_dict.copy()
    # This copy is to maintain the dictionary that will be added to the snapshot later
    
    # Pick the indexes list
    # i is also the index of m1_list in m2_list
    # You should be able to intuitively understand why it is the same
    for i in range(len(level_indexes_neg_list)):
        # Counter to keep track if there was excess quantity that can be used if there is a next loop. Else, we dont worry about it
        counter_for_excess_qty = 0

        # Extract negative_level_index
        negative_level_index = level_indexes_neg_list[i]
        for side in ['bids']:
            
            # print(f"\nWe are working on the {side} side.")
            # Iterate over all the negative levels in m2_list[i]
            for index, level in enumerate(negative_level_index):

                # print(f"\nWe are working on level: {index+1}...")
                # We enter a new level
                # print("\nWe enter a new level...")
                
                replenish_dict = m2_list[i][0][side][level]['replenish']
                
                if replenish_dict['replenishment_done'] == 1:
                    # print("Level already replenished, going to next level...")
                    # Skip that level since it has already been replenished
                    continue
            
                # print(replenish_dict)
                initial_quantity_of_negative_changes_sign_flipped = m2_list[i][0][side][level]['quantity']*(-1)
                # First will be 0. We access using last item in list. Understand intuition
                # Since we are accessing last item in the list, it will always be the latest replenishment percentage for that level
                percentage_before_replenishment = round(replenish_dict['replenishment_percentage'][-1], 3)
                # We multiply by -1 because quantity is negative in m2_list for these levels
                quantity_before_replenishment = round((percentage_before_replenishment*initial_quantity_of_negative_changes_sign_flipped))
                # this is the balance/excess of that level after replenishment so can be negative, 0 or positive.
                quantity_if_positive_changes_fully_added_as_replenishment = quantity_before_replenishment + replenishment_quantity_dict_copy[side][level]['quantity']
                
                # We again multiply be -1 because quantity is negative
                percentage_after_replenishment = round((quantity_if_positive_changes_fully_added_as_replenishment / (1*initial_quantity_of_negative_changes_sign_flipped)), 3)
                
                # # We use print to debug
                # print(f"initial_quantity_of_negative_changes_sign_flipped: {initial_quantity_of_negative_changes_sign_flipped}")

                # # We use print to debug
                # print(f"Percentage_before_replenishment: {percentage_before_replenishment}")

                # # We use print to debug
                # print(f"quantity_before_replenishment: {quantity_before_replenishment}")

                # # We use print to debug
                # print(f"positive_changes: {changes_copy[side][level]['quantity']}")

                # # We use print to debug
                # print(f"quantity_if_positive_changes_fully_added_as_replenishment: {quantity_if_positive_changes_fully_added_as_replenishment}\n")

                # print("We now check if we have less/perfect/excess replenishment...")
                if percentage_after_replenishment < 1: 
                # It means we need more qty to replenish that level, so we just create a new key with the replenishment qty and % details    
                # At that level, the positive change has fully replenished the first m1_list in m2_list
                    
                    # We use print to debug
                    # print(f"Replenishment INSUFFICIENT,Percentage_after_replenishment: {percentage_after_replenishment}")

                    # Replenished quantity is the difference
                    replenished_quantity = quantity_if_positive_changes_fully_added_as_replenishment - quantity_before_replenishment
                    
                    # We store the quantity that was replenished at that second
                    replenish_dict['replenishment_quantity'].append(replenished_quantity)

                    # We reduce the replenished qty from the changes_copy
                    # We do this because we want to keep track of how much is getting replenished from the current positive changes
                    replenishment_quantity_dict_copy[side][level]['quantity'] -= replenished_quantity
                    # The balance quantity that can be used to replenish older trades ranges from 0 to initial_quantity at that level in changes. 
                    # replenishment_quantity exhausted before negative change exhausted at that level
                    # Understand intuition

                    # Append time in the replenish_dict inside m1_list in m2_list
                    replenish_dict['time'].append(replenishment_quantity_dict['start_time']) 

                    # Append the percentage replenished
                    replenish_dict['replenishment_percentage'].append(percentage_after_replenishment)

                    # Make all changes on m2_list
                    m2_list[i][0][side][level]['replenish'] = replenish_dict
                    
                    # print(f"After changes to replenish_dict: {replenish_dict}")
                    # we go to the next level
                    continue

                elif percentage_after_replenishment == 1: 
                # it means quantity replenished by new change perfectly replenishes the negative quantity
                # key word: perfectly

                    # print(f"Replenishment PERFECT, Percentage_after_replenishment: {percentage_after_replenishment}")

                    # Replenished quantity is the difference
                    replenished_quantity = quantity_if_positive_changes_fully_added_as_replenishment - quantity_before_replenishment

                    # We store the quantity that was replenished at that second
                    replenish_dict['replenishment_quantity'].append(replenished_quantity)

                    # We reduce the replenished qty from the changes_copy
                    # We do this because we want to keep track of how much is getting replenished from the current positive changes
                    replenishment_quantity_dict_copy[side][level]['quantity'] -= replenished_quantity
                    # The balance quantity that can be used to replenish older trades is 0. 
                    # replenishment_quantity exhausted perfectly when negative change exhausted at that level
                    # Understand intuition

                    # Mark replenishment as done for that level
                    replenish_dict['replenishment_done'] = 1
                    
                    # Append time in the replenish_dict inside m1_list in m2_list
                    replenish_dict['time'].append(replenishment_quantity_dict['start_time']) 

                    # Append the percentage replenished
                    replenish_dict['replenishment_percentage'].append(percentage_after_replenishment)

                    # Make all changes on m2_list
                    m2_list[i][0][side][level]['replenish'] = replenish_dict

                    # We use print to debug
                    # print(f"After changes to replenish_dict: {replenish_dict}")

                    # we go to the next level
                    continue
                
                elif percentage_after_replenishment > 1: 
                # it means we have excess quantity after filling that level in the previous snapshot

                    # print(f"Replenishment EXCESS, Percentage_after_replenishment: {percentage_after_replenishment}")
                    # We keep whenever there is an overfill during replenishment
                    counter_for_excess_qty += 1

                    # Replenished quantity is the difference
                    # replenished_quantity = quantity_if_positive_changes_fully_added_as_replenishment - quantity_before_replenishment
                    # if we calculate using same way, excess replenishment_quantity will also be added to replenished_quantity
                    
                    # This is the positive quantity that is required to fill that level to 100% replenishment
                    # initial_quantity_of_negative_changes is negative so we just add
                    excess_quantity = replenishment_quantity_dict_copy[side][level]['quantity'] + initial_quantity_of_negative_changes_sign_flipped
                    # Understand intuition

                    # excess quantity_to_fully_replenish
                    quantity_to_fully_replenish = replenishment_quantity_dict_copy[side][level]['quantity'] - excess_quantity

                    # We use print to debug
                    # print(f"Excess quantity after replenishment: {excess_quantity}")
                    
                    # This is the quantity which will be replenished at that level
                    replenished_quantity = quantity_to_fully_replenish
                    # Understand intuition. That level is fully exhausted and there is excess quantity

                    # We store the quantity that was replenished at that second
                    replenish_dict['replenishment_quantity'].append(replenished_quantity)

                    # We reduce the replenished qty from the changes_copy
                    # We do this because we want to keep track of how much is getting replenished from the current positive changes
                    replenishment_quantity_dict_copy[side][level]['quantity'] -= replenished_quantity
                    # The balance quantity that can be used to replenish older trades is always greater than 0
                    # negative change exhausted before replenishment_quantity exhausted at that level
                    # Understand intuition

                    # Percentage calculated earlier will be larger than 1. You should intuitively know why that's the case
                    # So we set it to 1. signifying 100% replenishment
                    percentage_after_replenishment = 1.

                    # Mark replenishment as done for that level
                    replenish_dict['replenishment_done'] = 1

                    # Append time in the replenish_dict inside m1_list in m2_list
                    replenish_dict['time'].append(replenishment_quantity_dict['start_time']) 

                    # Append the percentage replenished
                    replenish_dict['replenishment_percentage'].append(percentage_after_replenishment)

                    # Make all changes on m2_list
                    m2_list[i][0][side][level]['replenish'] = replenish_dict

                    # This is the quantity that will be put on the the last quantity_replenished
                    quantity_after_replenishment = -1* initial_quantity_of_negative_changes_sign_flipped

                    # We use print to debug
                    # print(f"quantity_after_replenishment: {quantity_after_replenishment}")
                    # print(f"After changes to replenish_dict: {replenish_dict}")
                    # go to next level
                    continue
            
            # print(f"All levels over for {side}")
        

        # If there is no excess quantity at any level after repleneshing the first m1_list
        if counter_for_excess_qty==0:
            # print(f"We dont have any excess quantity left, so we don't iterate to next m1_list, we start checking if we can update m2_list and m3_list...")
            # Don't iterate to next common_level_index
            break
        
    # We have come out of the for loop, we have finished replenishment for all m1_lists[0] in m2_list
        
    # we check if any m1_list[0] in m2_list was fully replenished(all levels)
    # m3_list contains the fully replenished lists. it will be cut from m2_list and pasted to m3_list.
    # this way, m2_list only contains the lists than need to be replenished
    m2_list, m3_list, completed_replenishment_dict = check_for_fully_replenished_list(m2_list, m3_list, level_indexes_neg_list, completed_replenishment_dict)

    # We don't need this because we are not replenshing anything. We are just making list of all the replenishments that happen. That is in m3_list
    # replenishment_dict_to_be_added_to_snapshot = replenish_dict_calculator(ob_changes, temp_changes)
    
    return m2_list, m3_list, completed_replenishment_dict, incomplete_replenishment_dict

def create_dict_for_algo(snapshot, trade_signal, order_quantity, total_trades, avg_price, total_qty_liquidated):
    
    order_quantity_copy = order_quantity
    algo_dict_to_add = {'bids': [{'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0}, # add quantity for this and above levels, make sure at that second, the quantity wipes out the levels
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0}],
                    'asks': [{'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0},
                    {'price': 0, 'quantity': 0, 'orders': 0}],
                    'start_time': snapshot['transacttime'],
                    'end_time': snapshot['transacttime'],
                    'trade': 0}
    if trade_signal == 0:
        # No trade, return zeros
        return algo_dict_to_add, total_trades, avg_price, total_qty_liquidated
    else:
        algo_dict_to_add['trade'] = 1
        for side in ['bids']:
            if snapshot['bids'][0]['quantity']==0:
                break
            for index, side_list in enumerate(algo_dict_to_add[side]):

                # see if the current level has enough qty,
                # if it does, execute,
                # if it does not,
                # execute all qty at level, then go to next level, keep trackof how much qty is left to take to next level
                quantity_at_level = snapshot[side][index]['quantity']
                price_at_level = snapshot[side][index]['price']
                if order_quantity <= quantity_at_level:
                    # -1 because we remove liquidity
                    side_list['quantity'] = -1*order_quantity
                    side_list['orders'] = 1
                    total_qty_liquidated.append(order_quantity)
                    print(f"price_at_level oq < ql: {price_at_level}")
                    avg_price.append(price_at_level)
                    break
                elif order_quantity > quantity_at_level:
                    balance_quantity = order_quantity_copy - quantity_at_level #(snapshot[side][index]['quantity'] - order_quantity_copy)*(1)
                    side_list['quantity'] = -1*order_quantity_copy
                    side_list['orders'] = 1
                    total_qty_liquidated.append(order_quantity_copy)
                    avg_price.append(price_at_level)
                    print(f"price_at_level oq < ql: {price_at_level}")
                    continue                    
    
    return algo_dict_to_add, total_trades, avg_price, total_qty_liquidated

def algo(snapshot, model, order_quantity, total_trades, avg_price, total_qty_liquidated, delp, model_choice): # should model be passed as input? or can it be directly called inside the function?

    trade_signal = 0 #default
    scalar_predicted_value = 0 #default
    # We get the snapshot
    # We extract the factors dict and put it in a array
    # print(f"snapshot factors: {snapshot['factors']}\n")
    factors = pd.DataFrame([snapshot['factors']], index=[0])
    # # We extract y and put it in a variable
    # print(f"factors: {factors.info()}\n")
    x = factors.drop('delp', axis=1)
    y = factors[['delp']]
    print(f"x: {x}\ny: {y}\n")
    print(f"x: {len(x['netbuy'])}\ny: {len(y['delp'])}\n")
    # The model would already be trained
    # x = x.values
    # We directly call .predict and decide if the algo is going to trade or not
    # print(f"x: {x}")
    # print(model.input_shape)
    predicted_value = model.predict(x)
    print(f"predicted_value rf: {predicted_value}\n")
    if model_choice == 'linear_regression':
        scalar_predicted_value = predicted_value[0][0]
        delp.append(scalar_predicted_value)
        print(f"predicted_value lin reg: {scalar_predicted_value}\n")
    elif model_choice == 'rf':
        delp.append(predicted_value)
        print(f"predicted_value rf: {predicted_value}\n")
    # delp.append(scalar_predicted_value)
    # If trade, quantity is fixed,
    # so write a function which takes in the snapshot and gives an output of the dictionary after seeing how many levels it goes to
    # make sure number of orders increases in the corresponding levels


    # window_sizes = 3600
    # dy_vol = pd.DataFrame(columns=[f"realised_volatility_{window_sizes}"])
    # max_index, dy_vol = dynamic_vol(dy_vol, df, window_sizes)
    # df_test = df.loc[0:len(df['transacttime'])+1, ['Mean Reversion Lag', 'Level 1 Order Flow Imbalance', 'Level 2 Order Flow Imbalance', 'Level 3 Order Flow Imbalance', 'Level 4 Order Flow Imbalance', 'Level 5 Order Flow Imbalance', 'Level 6 Order Flow Imbalance', 'Level 7 Order Flow Imbalance', 'Level 8 Order Flow Imbalance', 'Level 9 Order Flow Imbalance', 'Level 10 Order Flow Imbalance']]
    # delp = baseline_random_forest_model(df, max_index, window_sizes, df_test)

    # delp = pd.DataFrame(delp, columns = ['delp'])
    # result = dict.fromkeys(df['transacttime'], None)
    # for i in range(len(delp['delp'])):
    if scalar_predicted_value < 0: ## liquidate
        trade_signal = 1
        algo_dict_to_add, total_trades, avg_price, total_qty_liquidated = create_dict_for_algo(snapshot, trade_signal, order_quantity, total_trades, avg_price, total_qty_liquidated)
        # result[df['transacttime'][i]] = {'Trade': single_signal, 'Current price' : }
    else: #if scalar_predicted_value <= 0: ## hold
        trade_signal = 0
        algo_dict_to_add, total_trades, avg_price, total_qty_liquidated = create_dict_for_algo(snapshot, trade_signal, order_quantity, total_trades, avg_price, total_qty_liquidated)
        # trade_signal = 1
    return trade_signal, algo_dict_to_add, total_trades, avg_price, total_qty_liquidated, delp
      

In [484]:
model_choice = ['linear_regression','rf']
train_test_validation_date_indexes = 0
date_index_level = 12
hours = 1

model, vol_hour, x_train, y_train, other_df = model_building(ob_factors, model_choice[0], date_range_ob_factors_df, train_test_validation_date_indexes, hours, date_index_level)
vol_hour

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1199800 entries, 0 to 1199799
Data columns (total 79 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1199800 non-null  int64  
 1   datetime               1199800 non-null  object 
 2   date                   1199800 non-null  object 
 3   transacttime           1199800 non-null  object 
 4   level_10_bid_quantity  1199800 non-null  float64
 5   level_10_bid_orders    1199800 non-null  float64
 6   level_10_bid_price     1199800 non-null  float64
 7   level_9_bid_quantity   1199800 non-null  float64
 8   level_9_bid_orders     1199800 non-null  float64
 9   level_9_bid_price      1199800 non-null  float64
 10  level_8_bid_quantity   1199800 non-null  float64
 11  level_8_bid_orders     1199800 non-null  float64
 12  level_8_bid_price      1199800 non-null  float64
 13  level_7_bid_quantity   1199800 non-null  float64
 14  level_7_bid_orders

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

Length of xy: 43212

xy.columns: Index(['Unnamed: 0', 'datetime', 'date', 'transacttime',
       'level_10_bid_quantity', 'level_10_bid_orders', 'level_10_bid_price',
       'level_9_bid_quantity', 'level_9_bid_orders', 'level_9_bid_price',
       'level_8_bid_quantity', 'level_8_bid_orders', 'level_8_bid_price',
       'level_7_bid_quantity', 'level_7_bid_orders', 'level_7_bid_price',
       'level_6_bid_quantity', 'level_6_bid_orders', 'level_6_bid_price',
       'level_5_bid_quantity', 'level_5_bid_orders', 'level_5_bid_price',
       'level_4_bid_quantity', 'level_4_bid_orders', 'level_4_bid_price',
       'level_3_bid_quantity', 'level_3_bid_orders', 'level_3_bid_price',
       'level_2_bid_quantity', 'level_2_bid_orders', 'level_2_bid_price',
       'level_1_bid_quantity', 'level_1_bid_orders', 'level_1_bid_price',
       'level_1_ask_price', 'level_1_ask_orders', 'level_1_ask_quantity',
       'level_2_ask_price', 'level_2_ask_orders', 'level_2_ask_quantity',
       'level_3_ask

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xy_date_rows.loc[:, 'delp'] = temp5


Unnamed: 0,trade_date,volatility,start_time,end_time,hour,date_end_time,date_start_time,within_date_range,train,validate,test
0,2020-01-29,0.000112,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
1,2020-03-18,0.000365,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
2,2020-04-29,0.000129,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
3,2020-06-10,0.000167,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
4,2020-07-29,8.8e-05,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
5,2020-08-26,7.1e-05,12:00:00,13:00:00,12.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
6,2020-09-16,0.00013,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
7,2020-11-03,0.000232,18:00:00,19:00:00,18.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
8,2020-12-16,0.000122,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0
9,2021-01-27,8.6e-05,13:00:00,14:00:00,13.0,2021-11-03 14:00:00,2021-11-03 13:00:00,Yes,1,0,0


In [486]:
# Replenishment rates for 10 levels of bid
# replenishment_rates = avg_replenishment_quantity_per_second_all_levels_all_dates.copy()

def model_selection(validation_data, order_quantity, replenishment_rates, model_choice):
    # Initialize a variable to hold the previous snapshot
    previous_snapshot = None

    snapshot_gen = order_book_snapshot_generator(validation_data, 10)
    c = 0
    m2_list = []
    m3_list = []
    total_trades = 0
    # List to store all txn prices
    avg_price = []
    # List to store all the trades the algo made
    total_qty_liquidated = []
    # List to store all predicted price change value
    delp = []
    # Initialize empty list of 10 lists for every level for both bid and ask. Bid first, then ask
    # This is the format we append and maintain just the replenished dicts for very level
    completed_replenishment_dict = {'bids': [[],[],[],[],[],[],[],[],[],[]],'asks': [[],[],[],[],[],[],[],[],[],[]]}
    incomplete_replenishment_dict = {'bids': [[],[],[],[],[],[],[],[],[],[]],'asks': [[],[],[],[],[],[],[],[],[],[]]}
    # This is how u access it
    # completed_replenishment_dict[side][i][j]['replenishment_done', 'time', 'replenishment_percentage']

    for snapshot in snapshot_gen:

        # If this is not the first snapshot, find the changes and print them
        if previous_snapshot is not None:

            # print(f"snapshot factors: {len(previous_snapshot['factors']['midprice'])}\n")
            # trade_signal = 0 # default
            
            trade_signal, algo_dict_to_add, total_trades, avg_price, total_qty_liquidated, delp = algo(previous_snapshot, model, order_quantity, total_trades, avg_price, total_qty_liquidated, delp, model_choice)
            total_trades += trade_signal
            delp
            # if c == 4 or c == 10:
            #     algo_dict_to_add = {'bids': [{'price': 0.0, 'quantity': -122, 'orders': 1.0},
            #                             {'price': 0.0, 'quantity': -583, 'orders': 1.0},
            #                             {'price': 0.0, 'quantity': -1044, 'orders': 1.0},
            #                             {'price': 0.0, 'quantity': -1000, 'orders': 1.0}, # add quantity for this and above levels, make sure at that second, the quantity wipes out the levels
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
            #                             {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0}],
            #                             'asks': [{'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0.0},
            #                             {'price': 0.0, 'quantity': 0, 'orders': 0.0},
            #                             {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
            #                             {'price': 0.0, 'quantity': 0.0, 'orders': 0}],
            #                             'start_time': '17:00:03',
            #                             'end_time': '17:00:04',
            #                             'trade': 1}
            # else:
            #     # Template with zeroes
            #     algo_dict_to_add = {'bids': [{'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                            # {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0}],
                                            # 'asks': [{'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0.0},
                                            # {'price': 0.0, 'quantity': 0, 'orders': 0.0},
                                            # {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                                            # {'price': 0.0, 'quantity': 0.0, 'orders': 0}],
                                            # 'start_time': previous_snapshot['transacttime'],
                                            # 'end_time': previous_snapshot['transacttime'],
                                            # 'trade': 0}

            replenishment_dict_to_add = {'bids': [{'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0}, # add quantity for this and above levels, make sure at that second, the quantity wipes out the levels
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0}],
                        'asks': [{'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0},
                        {'price': 0, 'quantity': 0, 'orders': 0}],
                        'start_time': snapshot['transacttime'],
                        'end_time': snapshot['transacttime'],
                        'replenishment': 0}
            # We use print to debug
            # print(f"transacttime: {previous_snapshot['transacttime']}")
            # print(f"Algo trade? {trade_signal}\n")
            # print(f"Snapshot bids: {previous_snapshot['bids']}")
        
            changes = find_order_book_changes(previous_snapshot, snapshot)
            
            # print(f"start_time: {changes['start_time']}\nend_time: {changes['end_time']}")
            # print(f"Changes by market data: {changes}\n")

            # # algo_dict_to_add = algorithm(snapshot, metrics)

            if algo_dict_to_add['trade'] == 1:
                # print(f"trade_signal: {str(trade_signal)}")
                # print(f"Algo dict: {algo_dict_to_add}\n")
                t2, created_or_not = check_negative_quantity_add_keys(algo_dict_to_add)
            else:
                created_or_not = False
                
            # else:
                # print(f"trade_signal: {str(trade_signal)}")
                # print("Algo did not Trade so dict full of zeroes gets added with replenishment!!!\n")

            # m2_list contains the all the trades done by algorithm which are not fully replenished yet
            # if m2_list is not empty,
            if m2_list:
                # print(f"We can now check if we can replenish negative quantity changes that happened in the past...")
                # Get the levels of negative quantity in m1_list[0] for all m1_list in m2_list
                level_indexes_neg_list, level_count_neg_list = m2_list_neg_level_index(m2_list)

                # We call this function which calculates the dict which needs to be added to the snapshot
                # completed_replenishment_dict is just the list we maintain to see later
                replenishment_dict_to_add, completed_replenishment_dict = replenished_or_not(replenishment_rates, level_indexes_neg_list, m2_list, changes['start_time'], changes['end_time'], completed_replenishment_dict)
                # print(f"replenishment_dict_to_add: {replenishment_dict_to_add}\n")
                m2_list, m3_list, completed_replenishment_dict, incomplete_replenishment_dict = replenish_past_algo_trades(level_indexes_neg_list, m2_list, m3_list, replenishment_dict_to_add, completed_replenishment_dict, incomplete_replenishment_dict)

            # We put this condition here because we want to append it to the list after the past trades have been replenished
            # Intuition is that current second trade cannot be replenished immediately
            # If we added some new key because there was a negative change, we add that to m2_list
            if created_or_not == True:
                m2_list = check_negative_quantity_append_list(t2, m2_list)
                    # printf(f"Length of m3_list: ")
            # print(changes)

            # else:
            #     continue

            # So now we have replenishment_dict_to_add, algo_dict_to_add, changes. We add them to previous snapshot one by one

            # Find the addition of 2 snapshots of the orderbook
            def add_2_dicts(order_book_snapshot1, order_book_snapshot2):
                
                added_dict = order_book_snapshot1.copy()

                # Loop through the bids in the first order book snapshot
                for side in ['bids']:
                    for i in range(len(order_book_snapshot1[side])):
                        # Calculate the difference in quantity for this bid level between the two snapshots
                        added_dict[side][i]['quantity'] += order_book_snapshot2[side][i]['quantity']
                        # Calculate the difference in number of orders for this bid level between the two snapshots
                        added_dict[side][i]['orders'] = 1 #order_book_snapshot2[side][i]['orders']

                return added_dict
            replenishment_plus_algo_dict = {}
            replenishment_plus_algo_dict = add_2_dicts(replenishment_dict_to_add, algo_dict_to_add)
            # print(f"replenishment_plus_algo_dict: {replenishment_plus_algo_dict}\n")
            changes_plus_replenishment_plus_algo_dict = add_2_dicts(changes, replenishment_plus_algo_dict)
            # print(f"changes_plus_replenishment_plus_algo_dict: {changes_plus_replenishment_plus_algo_dict}\n")
            # snapshot = add_2_dicts(snapshot, changes_plus_replenishment_plus_algo_dict)
            # print(f"Next second snapshot by market data: {snapshot}\n")




        # Set the current snapshot as the previous snapshot for the next iteration
        previous_snapshot = snapshot


        c += 1
        # if c == 150:
            # print("Entered BREAK!!!!!!!!!!!!!!!!!!!!!!!!!\n!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
            # break
        if c == 7200 - 1:
            m2_list, incomplete_replenishment_dict = extract_incomplete_replenishments(m2_list, incomplete_replenishment_dict)
            # trade_signal += trade_signal

        # else:
        #     print("No trade during 1st snapshot for period. Understand intuition...\n")
        


    tot_qty_liq = sum(total_qty_liquidated)
    # avg_price = [x for x in avg_price if x != 0]
    if avg_price == 0:
        avg_pr = 0
    # avg_pr = sum(avg_price) / len(avg_price)
    # print(f"Total Quantity Liquidated: {tot_qty_liq}")
    # print(f"Average Price: {avg_pr}")
    
    # r2 = r2_score(validation_data['delp'][1:], delp)
    r2 = 0
    return tot_qty_liq, avg_pr, r2


In [458]:
order_quantity = 100
replenishment_rates = [0.38548875866534427, 0.2968284478761956, 0.19869853940874252, 0.2935878038702436, 0.09307537261535587, 
                       0.1541179378052512, 0.13174772956517022, 0.054074532365372664, 0.15, 0.2]

tot_qty_liq, avg_pr, r2 = model_selection(validation_data, order_quantity, replenishment_rates, model_choice)

x:    netbuy  mean_rev_lag  level1ofi  level2ofi  level3ofi  level4ofi  \
0     0.0 -2.220446e-15     -148.0      -51.0      -50.0      -18.0   

   level5ofi  level6ofi  level7ofi  level8ofi  level9ofi  level10ofi  
0      -22.0       49.0       -2.0       -2.0       -2.0        50.0  
y:    delp
0   0.0

x: 1
y: 1

x:    netbuy  mean_rev_lag  level1ofi  level2ofi  level3ofi  level4ofi  \
0     0.0 -2.220446e-15      -72.0        0.0        2.0        4.0   

   level5ofi  level6ofi  level7ofi  level8ofi  level9ofi  level10ofi  
0       -1.0        0.0      -10.0        0.0        0.0         0.0  
y:    delp
0   0.0

x: 1
y: 1

x:    netbuy  mean_rev_lag  level1ofi  level2ofi  level3ofi  level4ofi  \
0     0.0 -2.220446e-15        0.0       -9.0      -10.0        0.0   

   level5ofi  level6ofi  level7ofi  level8ofi  level9ofi  level10ofi  
0        0.0        0.0       10.0        0.0        0.0         0.0  
y:    delp
0   0.0

x: 1
y: 1

x:    netbuy  mean_rev_lag  level1ofi  leve

UnboundLocalError: local variable 'avg_pr' referenced before assignment

In [256]:
# completed_replenishment_dict

In [369]:
print(f"Total Quantity Liquidated: {sum(total_qty_liquidated)}")
print(f"Average Price: {sum(avg_price) / len(avg_price)}")

Total Quantity Liquidated: 45060
Average Price: 132.1189074289836


In [371]:
print(f"Total Quantity Liquidated: {sum(total_qty_liquidated)}")
print(f"Average Price: {sum(avg_price) / len(avg_price)}")

Total Quantity Liquidated: 453500
Average Price: 132.11852604740903


In [373]:
print(f"Total Quantity Liquidated: {sum(total_qty_liquidated)}")
print(f"Average Price: {sum(avg_price) / len(avg_price)}")

Total Quantity Liquidated: 6067000
Average Price: 132.1352398219878


In [389]:
print(f"Total Quantity Liquidated: {sum(total_qty_liquidated)}")
print(f"Average Price: {sum(avg_price) / len(avg_price)}")

Total Quantity Liquidated: 2553000
Average Price: 131.97612490207598


In [408]:
print(f"Total Quantity Liquidated: {sum(total_qty_liquidated)}")
print(f"Average Price: {sum(avg_price) / len(avg_price)}")

Total Quantity Liquidated: 193000
Average Price: 131.93600226683938


In [409]:
r2 = r2_score(validation_data['delp'][1:], delp)
r2

-0.7846262463503082

In [255]:
total_trades

4506

In [94]:
# order_book_snapshot2

In [114]:
# order_book_snapshot2['asks'][0]['price']

In [115]:
# order_book_snapshot.update(order_book_snapshot2)

In [116]:
# order_book_snapshot

In [112]:
# order_book_snapshots = []
# order_book_snapshots.append(order_book_snapshot)
# order_book_snapshots.append(order_book_snapshot2)
# order_book_snapshots

In [142]:
# order_book_snapshots[0]

In [271]:
# order_book_snapshot2['asks'][0]['price']
# time_diff = datetime.strptime(order_book_snapshot2['transacttime'], '%H:%M:%S') - datetime.strptime(order_book_snapshot['transacttime'], '%H:%M:%S')
# time_diff.total_seconds()


#### Transcripts of meeting April 14 2023

make sure replenish function is removing the
example: if algo is buying, quantity is going to be removed from the sell side.\

Participation rate is how much the volume is being transacted every minute and taking 10%
This is a parameter we can use.
Vary parameter and get a plot

Participation is key
Higher rate of participation mens it inceases the price impact. If we participate quickly, we can get out of the market quickly.

We can use the avg order size we found
Because the market is reactionless, u never gonna know how much participation ratio is gonna impact the market and etc.

in the test harness, dissalowwing the algorithm in throwing everything in the bed at one go.

question:
is also allowed to wipe out more than 1 level
answer:
yes, that's what happens in reality
algo is making a decision to trade, based on net change, not based on absolute price.
we have to come up with a price

question:
we know how to arrive at quantity, how do we arrive at price?
answer:
version 1 of this can be made where it only does market orders

market indicators are an overlay on what the model is deciding
ideally u wanna have as many indicators to help algo make decision because the market exhibits diff characteristics at diff time of the day
each indicator extracts a certain characteristics
to start with, dont use the indicator
see the strength of the model
and then try overlaying these indicators on top to see how it performs

question:
model tells us delp, could be +ve or -ve, but it doesnt tell us if we need to trade or not. How do we decide?
answer:
if we have a positive position, and if the model predicts an increase in price, it makes sense to hold off
if we have a positive position, and if the model predicts an decrease in price, it makes sense to liquidate
exact vice versa for negative position


cost to trade
q from inder: how are u using cost to trade

the model is predicting a certain price change in the future in a certain amt of time, and based on my positon, i will use that price change as a guide to trade a certain amt of qty driven by the participation rate. we factored this into the model right?

basically liquidate it by splitting the order and get what is the price change and the vwap and we know how much we've spent/recieved.
if we sweep the book using this formula, that same 10k qty which is unlikely happen in a practical situation, we get the price for that and that is a very predictable price and we know how much we're going to incur.

We can compare these 2 and see which performs better by splitting the order rather than executing it in one stretch.
other than that, we dont see the cost to trade when we are executing.


ctt increases or decreases when we sweep the book on the respective sides
question:
do we incorporate ctt to decide?
answer:
no

ctt is changing with every market data update
to keep ur test really simple
just use the price change outcome with the accomodation of participation rate which will give order qty, we do mkt order
see how ur algo performs
how do we know pwerformance of algo
at the end

using the data, figure ut a price x quantity and come up woth a PnL number

We start with a PnL of zero. and we compare it with the PnL of different versions of the model. In reality we need to build a position first and then see the performance. But a simple approach would be to start with a PnL of zero. Version 2 3 could build the position.


replenish function

how do we come up with 120 seconds to replenish?

walk thru the data we have and see how long it takes to replenish.,
what is the avg time it takes to replenish. if it never replenishes, dont replenish it.
if we didnt replenish, our algo will eventually eat up the orderbook, that's why we want the replenish function

thresholds will be determined once and we use that same value.
keep it simple.
we want a deterministic harness. so try to keep it as simple and linear as possible
our goal is to test the model, be concisous about the determinism of the harness

q: if we keep replenishing it based on the threshold and fill rates, wont the algo go and continously remove that loquidity off the book?
a: yes the idea is that we want to replenish it, but we dont want to do it immediately too, because in real life, we have liquidity providers who will provide liquidity eventually. latest week is 68 million txns. even when there is such a high volume, there are people providign liquidity. in the abscense of any agents acting as liquidity providers, we need to somehow procide this to the model.

no open orders, we only do market orders

one time excersice to find participation rate
go thru whole data and find a scalar number
it is fixed throughout
it is deterministic

If the resolution of algorithm decision is in seconds, keep ur participation rate in the same resolution

how changing hyper parameters affected the performance of the model
One is the participation rate.
This is valid when we liquidate 10 to 20x top of the book.
q what is thelarge order qty
we can get the avg top of the book liqiodity fromthe cme website. we can take an avg of the level1 qty from that tool

other graph i would add is maybe do a graph of how the rf do vs the models we tried earlier.
2 3 harnesses will be interesting for the consumers and for us


In [346]:
# create_replenish_keys function helps us add a few keys to the output_from_algo_copy dictionary. The keys will be used in the replenishment function.
def create_replenish_keys(output_from_algo_copy, replenishment_frequency, current_time):
    
    # We flip signs
    keys = ['bids','asks']

    for key in keys:
        for i in range(10):
            # output_from_algo_copy[key][i]['balance_to_replenish'] = (-1)*trade_dict[0][key][i]['quantity']
            
            # output_from_algo_copy[key][i]['price'] = 0 
            output_from_algo_copy[key][i]['orders'] = 0 

            # We divide by replenishment frequency. This value is basically the quantity that will be replenished for replenishment_frequency number of times.
            output_from_algo_copy[key][i]['replenishment_size'] = ((-1)*output_from_algo_copy[key][i]['quantity'])/replenishment_frequency

            # Keep track of how many replenishments has been done. Initially count = 0
            output_from_algo_copy[key][i]['replenishment_count'] = 0

            # Max replenishment count
            output_from_algo_copy[key][i]['max_replenishment_count'] = replenishment_frequency
            # This can be the same for the whole trade, or it can be different for different levels on the orderbook.

            # Seconds since trade happened. 0 because trade just happened
            output_from_algo_copy['seconds_since_txn'] = 0 #time_diff.total_seconds()

    return output_from_algo_copy

In [347]:
# t = create_replenish_keys(order_book_snapshot2, replenishment_frequency, current_time)
# t

In [364]:
def final_dict_replenish_for_that_second(trade_dict_replenish, snapshot, current_time):

    snapshot_replenish = snapshot.copy()
    # snapshot_replenish
    keys = ['bids','asks']

    for key in keys:
        for i in range(10):
            # output_from_algo_copy[key][i]['balance_to_replenish'] = (-1)*trade_dict[0][key][i]['quantity']
            
            snapshot_replenish[key][i]['price'] = 0 
            snapshot_replenish[key][i]['quantity'] = 0
            # print (snapshot_replenish[key][i]['quantity'])
            snapshot_replenish[key][i]['orders'] = 0 

    # we got emtpy changes dict to be added 
    # go to first dict
    # check if seconds is 120
    # if 120, go inside
    # if rep_count <= max_rep_count
        # for loop
        # add the dict[key][i][quantity] += snapshot_replenish[key][i]['quantity']
        # increase the rep_count by 1 for all levels
    # Set seconds_since_txn as current time - transacttime
    # total_replenishments_remaining
    # Iterate through all dicts inside trade_dict_replenish
    for i in range(len(trade_dict_replenish)):

        temp_dict = trade_dict_replenish[i].copy()
        # print(snapshot_replenish['bids'])
        # If it has been 120 seconds since trade or last replenish
        if temp_dict['seconds_since_txn']%120 == 0:
            
            for key in range(len(keys)):
                
                for level in range(len(temp_dict[keys[key]])):
                    # If replenishment count is not yet max replenishment count at that key and level
                    # print(temp_dict[key][level]['replenishment_count'])
                    # print(key)
                    if temp_dict[keys[key]][level]['replenishment_count']<=temp_dict[keys[key]][level]['max_replenishment_count']:
                        # Increase count of replenishment by 1
                        temp_dict[keys[key]][level]['replenishment_count'] += 1
                        
                        # Iterate through every level in bid and ask to take the replenishment quantity
                        snapshot_replenish[keys[key]][level]['quantity'] += temp_dict[keys[key]][level]['replenishment_size']
                        
            # Find out how many replenishments are left ? we can further increase the complexity by allowing different levels to replenish at different times
            replenishments_remaining = temp_dict['bids'][0]['max_replenishment_count'] - temp_dict['bids'][0]['max_replenishment_count']
            if replenishments_remaining==0:
                trade_dict_replenish = trade_dict_replenish[1:]
                i -= 1
                continue

            # for key in keys:
            #     for i in range(10):
                    # Add the quantity to a dict
                        # snapshot_replenish[key][i]['quantity'] += snapshot_replenish[key][i][key]['replenishment_size']
    
        # Intuitively, this is going to increase this value by 1 every time current_time increases by a second
        temp_dict['seconds_since_txn'] = (current_time - datetime.strptime(temp_dict['transacttime'], '%H:%M:%S')).total_seconds()
    
    return snapshot_replenish, trade_dict_replenish

In [365]:
current_time = '17:00:01'

# x = final_dict_replenish_for_that_second(trade_dict_replenish, snapshot, current_time)

xc = order_book_snapshot.copy()
xc['trade']=1

# trade_dict_replenish = []


sample_output_from_algo = {'bids': [{'price': 0.0, 'quantity': 13.0, 'orders': 0},
                        {'price': 0.0, 'quantity': 73.0, 'orders': 1},
                        {'price': 0.0, 'quantity': 75.0, 'orders': 1},
                        {'price': 0.0, 'quantity': 55.0, 'orders': 1},
                        {'price': 0.0, 'quantity': 54.0, 'orders': 1},
                        {'price': 0.0, 'quantity': 23.0, 'orders': 1},
                        {'price': 0.0, 'quantity': 33.0, 'orders': 1},
                        {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                        {'price': 0.0, 'quantity': 0.0, 'orders': 0.0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0}],
                      'asks': [{'price': 0.0, 'quantity': -137.0, 'orders': -2.0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0.0},
                        {'price': 0.0, 'quantity': 0.0, 'orders': 0.0}],
                      'transacttime': '17:00:01',
                      'trade':1}


trade_dict_replenish = [{'bids': [{'price': 0.0, 'quantity': 13.0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':2},
                        {'price': 0.0, 'quantity': 73.0, 'orders': 1, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':5},
                        {'price': 0.0, 'quantity': 75.0, 'orders': 1, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':6},
                        {'price': 0.0, 'quantity': 55.0, 'orders': 1, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':2},
                        {'price': 0.0, 'quantity': 54.0, 'orders': 1, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':25},
                        {'price': 0.0, 'quantity': 23.0, 'orders': 1, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':1},
                        {'price': 0.0, 'quantity': 33.0, 'orders': 1, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':8},
                        {'price': 0.0, 'quantity': 0.0, 'orders': 0.0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0.0, 'orders': 0.0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0}],
                      'asks': [{'price': 0.0, 'quantity': -137.0, 'orders': -2.0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0, 'orders': 0.0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0},
                        {'price': 0.0, 'quantity': 0.0, 'orders': 0.0, 'replenishment_count':1, 'max_replenishment_count':4, 'replenishment_size':0}],
                      'transacttime': '17:00:01',
                      'seconds_since_txn': 120,
                      }]



# d = replenish(order_book_snapshot, xc, trade_dict_replenish)
# d
snapshot_traded_replenished, trade_dict_replenish = replenish(order_book_snapshot, sample_output_from_algo, trade_dict_replenish)


KeyError: 'replenishment_count'

In [350]:
snapshot_traded_replenished

{'bids': [{'price': 0, 'quantity': -6.5, 'orders': 0},
  {'price': 0, 'quantity': -36.5, 'orders': 0},
  {'price': 0, 'quantity': -37.5, 'orders': 0},
  {'price': 0, 'quantity': -27.5, 'orders': 0},
  {'price': 0, 'quantity': -27.0, 'orders': 0},
  {'price': 0, 'quantity': -11.5, 'orders': 0},
  {'price': 0, 'quantity': -16.5, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0}],
 'asks': [{'price': 0, 'quantity': 68.5, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0},
  {'price': 0, 'quantity': 0.0, 'orders': 0}],
 'start_time': '17:00:01',
 'end_time': '17:00:0

In [None]:
snapshot_replenish['bids'][i]['quantity']

In [309]:
order_book_snapshots = []
trade_dict = []
# trade_dict_replenish = []

def replenish(snapshot, output_from_algo, trade_dict_replenish):
    
    # Get current time in a datetime format
    current_time =  datetime.strptime(snapshot['transacttime'], '%H:%M:%S')
    
    # logger(output_from_algo)
    
    # call logger function
    # keeps track of transacttime and quantity
    # create a dict order_book_snapshot['transacttime']
    # order_book_snapshot2['asks'][0]['price']


    # If a trade happens, add that trade to the list of trades in the trade_dict.
    if output_from_algo['trade']==1:
        # This is basically a log of all the changes the algo makes to the orderbook
        trade_dict.append(output_from_algo)

        temp = output_from_algo.copy()
        # Create new keys to be used during replenishing
        temp = create_replenish_keys(temp, replenishment_frequency, current_time)
        # Append the latest modified trade dict to the master dict containing all the trade data that needs to be replenished
        trade_dict_replenish.append(temp)

    # replenish_dict is the final dict that needs to be added to the snapshot.
    # Trade_dict_replenish contains the replenishment information for past trades.
    replenish_dict, trade_dict_replenish = final_dict_replenish_for_that_second(trade_dict_replenish, snapshot, current_time)
    

    # take the output from the function and add it to the snapshot to get the traded, replenished snapshot
    # snapshot_traded_replenished = find_order_book_addition(snapshot, replenish_dict)

    # return snapshot_traded_replenished, trade_dict_replenish
    return replenish_dict, trade_dict_replenish
    # return trade_dict_replenish



In [233]:
# The current structure of this loop is not final and is going to change as we progress. We might have to call functions in a different manner.

# Initialize a variable to hold the previous snapshot
previous_snapshot = None
mid_prices = []

snapshot_gen = order_book_snapshot_generator(sliced_data2, 10)

# Iterate over the snapshots and find the changes
for snapshot in snapshot_gen:
    
    # Send snapshot to algorithm. (Call algo function)
    # Call the Algorithm
    # _ = midprice(snapshot)
        # Participation ratio limits the number of orders an algorithm places in a minute.
        # Cost to trade will be determined by the current snapshot
        # Boundary conditions
        # Extreme conditions (sudden drop in orders or qty indicating drastic price change)
        # What else decides?
        # 
        # Return a dict saying Yes/No, Qty, Min Price allowed to go to to liquidate)

        # quantity_list = [2,3,4,5,6,10] # dynamic list genrated by algorithm

        # for quanity_amount in quantity_list:
        #     CTT = cost to trade(snapshot, quanity_amount)
        #     CTT.append

        # CTT = [qt123, 548, 6564, 546, 54]

        # CTT.min()

        # Decision
        

        # dict = {'Trade': 0/1, 'transacttime': '17:00:01', 'quantity': 9717, 'min_price': 081.145, 'trade_number':1}
        # return dict
    
    # If this is not the first snapshot, find the changes
    if previous_snapshot is not None:
        changes = find_order_book_changes(previous_snapshot, snapshot)
        # print(changes)
    

    


    # Output of the algorithm is going to be taken into a logger which will log the algorithm's order if any.
    # Logger will note down algorithm's trade order
    # Logger will call Is_order_valid_function which will check if the order is valid and can be executed.
    # If it can be executed, it logs the trade price and quantities.
    # # if it cannot be executed, it logs the no fill msg
    # # Increase the quantity in the orderbook by 1
    snapshot_traded_replenished, trade_dict_replenish = replenish(snapshot, output_from_algo)

    
    # Add the output dict values from the Order_qty_reduction_function to our market orderbook changes dict
    # Use this new net change and add it to the previous snapshot, and get the new snapshot of the data.
    # This new snapshot will become the previous snapshot and the loop will continue
    snapshot = find_order_book_addition(snapshot_traded_replenished, changes)


    # Set the current snapshot as the previous snapshot for the next iteration
    previous_snapshot = snapshot
    
    print(snapshot)

# liquidated quantity, avg price = pnl_calculator(trade_dict_replenish)

NameError: name 'output_from_algo' is not defined

KeyError: 'bids'

IndexError: list index out of range

In [None]:
# import threading
# import queue

# # Define an Order class to represent an order
# class Order:
#     def __init__(self, side, price, quantity):
#         self.side = side
#         self.price = price
#         self.quantity = quantity

# # Define an OrderBook class to represent the order matching engine
# class OrderBook:
#     def __init__(self):
#         self.bids = []
#         self.asks = []

#     # Define a method to add an order to the order book
#     def add_order(self, order):
#         if order.side == 'BUY':
#             self.bids.append(order)
#             self.bids.sort(key=lambda x: x.price, reverse=True)
#         elif order.side == 'SELL':
#             self.asks.append(order)
#             self.asks.sort(key=lambda x: x.price)

#         # Match orders
#         while len(self.bids) > 0 and len(self.asks) > 0 and self.bids[0].price >= self.asks[0].price:
#             bid = self.bids[0]
#             ask = self.asks[0]
#             if bid.quantity > ask.quantity:
#                 fill = Order('FILL', ask.price, ask.quantity)
#                 self.bids[0].quantity -= ask.quantity
#                 self.asks.pop(0)
#             elif bid.quantity < ask.quantity:
#                 fill = Order('FILL', bid.price, bid.quantity)
#                 self.asks[0].quantity -= bid.quantity
#                 self.bids.pop(0)
#             else:
#                 fill = Order('FILL', bid.price, bid.quantity)
#                 self.bids.pop(0)
#                 self.asks.pop(0)
#             # Print fills
#             print(fill.side, fill.price, fill.quantity)

# # Define an Algorithm class to represent the algorithm
# class Algorithm:
#     def __init__(self, orderbook):
#         self.orderbook = orderbook
#         self.orders = queue.Queue()

#     # Define a method to submit an order to the algorithm
#     def submit_order(self, order):
#         self.orders.put(order)

#     # Define a method to process orders
#     def process_orders(self):
#         while not self.orders.empty():
#             order = self.orders.get()
#             self.orderbook.add_order(order)

# # Create an OrderBook instance and an Algorithm instance
# orderbook = OrderBook()
# algorithm = Algorithm(orderbook)

# # Submit some orders to the algorithm
# algorithm.submit_order(Order('BUY', 100, 10))
# algorithm.submit_order(Order('SELL', 90, 5))
# algorithm.submit_order(Order('SELL', 110, 15))
# algorithm.submit_order(Order('BUY', 120, 7))

# # Process the orders
# algorithm.process_orders()


In [None]:
<!-- import math
def divide_quantity_by_orders(t5):
    order_book = t5.copy()
    for side in ['bids', 'asks']:
        for level in order_book[side]:
            if level['quantity'] != 0:
                if level['orders'] != 0:
                    if level['quantity'] < 0 and level['orders'] < 0:
                        # making sure the qty stays negative because it becomes positive when dividing by a negative number
                        quotient = level['quantity'] / ((-1)*level['orders'])
                    elif level['quantity'] >0 and level['orders'] >0:
                        quotient = level['quantity'] / level['orders']
                    elif level['quantity'] >0 and level['orders'] <0:
                        quotient = level['quantity'] / ((-1)*level['orders'])
                    elif level['quantity']<0 and level['orders']>0:
                        quotient = level['quantity'] / level['orders']
                            
                    if math.isfinite(quotient) and not math.isnan(quotient):
                        level['quantity'] = round(quotient)
                    else:
                        level['quantity'] = 0
                    level['orders'] = 1
                elif level['orders']== 0:
                    level['quantity'] = 0
                    level['orders'] = 0
    del order_book['asks'] # we do this because we are just modelling replenishment on the bid side of the book
    return order_book

def check_negative_quantity_add_keys(t6):
    order_book = t6.copy()
    for side in ['bids']:
        for level in order_book[side]:
            if level['quantity'] < 0:
                level.setdefault('0', 0)
    return order_book

def check_negative_quantity_append_list(order_book, m2_list):
    m1_list = []
    c = 0
    for side in ['bids']:
        for level in order_book[side]:
            if level['quantity'] < 0:
                c += 1
                # m1_list.append(order_book)
                # m2_list.append(m1_list)
    
    if c>0:
        m1_list.append(order_book)
        m2_list.append(m1_list)
    return m2_list

# def wrap_with_list(lst):
#     return [[d] for d in lst]

def get_indexes_of_lists(temp_changes, m2_list)
    indexes = []
    # If the negative change happened in the past
    for i in range(len(m2_list)):
        if m2_list[i]['end_time'] < ob_changes['end_time']:
             indexes.append(i)
    return indexes
         
def positive_values_in_dict(dict):
    indexes = []
    count = 0
    # Counts how many positive values there are in the next changes
    for side in ['bids']:
        for level in dict[side]:
            if level['quantity'] > 0:
                # indexes has the indexes(level) of all the positive quantity changes
                indexes.append(count)
                count += 1
    
    return indexes, count

def negative_values_in_dict(dict):
    indexes = []
    count = 0
    # Counts how many negative values there are in the next changes
    for side in ['bids']:
        for level in dict[side]:
            if level['quantity'] < 0:
                # indexes has the indexes(level) of all the positive quantity changes
                indexes.append(count)
                count += 1
    
    return indexes, count

def check_common_index(list1, list2):
    common_index = []
    for item in list1:
        if item in list2:
            common_index.append(item)
    return common_index

def check_for_fully_replenished_list(m2_list, m3_list):
    
    # iterate thru m2_list
    for m1_list in m2_list:
        # Get indexes and total count of negative quantity levels in that dict
        indexes_neg_temp, count = negative_values_in_dict(m1_list[-1])
        sum_total_qty = 0
        for side in ['bids']:
            for level in indexes_neg_temp:
                # Here we can check in 2 ways.
                # 1. Check for '100' key in the dict
                # 2. For all keys, sum of all values == 0 ?
                
                # We use the second approach
                # total_qty range (-inf, 0].
                # 0 means fully replenished.
                total_qty = 0
                for key in m1_list[-1][side][level]:
                    total_qty += m1_list[-1][side][level][key]
                    # Cumulative sum of all the negative quantity dict values
                    sum_total_qty += total_qty

        # If this is 0, it means all negative quantities has been fully replenished
        if sum_total_qty == 0:
            # Extract that m1_list and add it to m3_list.
            m3_list.append(m1_list)

            # Make sure to delete that m1_list from m2_list
            m2_list.remove(m1_list)

    return m2_list, m3_list

def replenish(m2_list, ob_changes):
    # c2 = 0
    # c3 = []
    # ob changes positive value at any level?
    
    # Keep a original copy because iwe might change it inside the loop
    ob_changes_original_copy = ob_changes.copy()

    # index means level. we say index because it is the index of the list
    indexes_pos, count_pos = positive_values_in_dict(ob_changes_original_copy)

    # if there is some positive quantity in the new changes, create temp_changes which is a copy of ob_changes.
    # temp_changes will be used when there is excess quantity after replenishment
    if count_pos >0:
        # extract the positive changes and store them in a dict called temp_changes. 
        # This is the quantity reduced by a single trade at that level at that second
        # As we replenish, temp_changes will have the respective quantites at each level reduced in a specific manner.
        temp_changes = ob_changes_original_copy.copy()
        # We just keep the positive quantities and make everything else 0
        for side in ['bids']:
            for level in temp_changes[side]:
                if level['quantity'] <= 0:
                    temp_changes[side][level]['quantity'] = 0
                    temp_changes[side][level]['orders'] = 0

    indexes_neg_list = []
    count_neg_list = []
    common_indexes_dict = {}
    common_indexes_list = []
    m1_list_index = []

    # We extract a list of negative indexes for every m2_list[i]
    for i in range(len(m2_list)):# indexes:     
        indexes_neg, count_neg = negative_values_in_dict(m2_list[i][0]) # we check negative quantity in first dict in m1_list because all the negative quantity will remain at the same level
        indexes_neg_list.append(indexes_neg)
        count_neg_list.append(count_neg)
        m1_list_index.append(i)
        
        # If there is some positive change in the new changes
        if count_pos > 0:
            # if negative indexes share any values with positive indexes, get that into a common_index
            common_indexes = check_common_index(indexes_neg,indexes_pos)
            # If there are some common indexes,
            if len(common_indexes)>0:
                # Assign the indexes and m1_list index number in m2_list as values to keys in a dict
                common_indexes_dict['common_index'] = common_indexes
                common_indexes_dict['m1_index_in_m2'] = i

                # common_indexes_dict helps us access indexes of
                # 1. m1_list in m2_list
                # 2. the levels which satisfy our checking criteria
                # Because of this, we know exactly which level and trade the positive quantity can replenish if it had large enough quantity
                # In reality, all these levels in all trades of m2_list might not get replenished during the current second.
                # The whole list might be empty if the new positive quantities dont share any common levels with
                common_indexes_list.append(common_indexes_dict)
        
    # If there are common indexes,
    if len(common_indexes_list) > 0:
        for common_index_dict in common_indexes_list:

            if counter_for_excess_qty > 0:
                ob_changes_original_copy = temp_changes.copy()    
                # We do this because when we have excess quantity, we want the next common set of indexes to use the balance quantity after first replenishment
                # We keep reducing the qty everytime we replenish, so that it keeps track of what's left
                # sometimes, the common_index_dict might get over even if there is excess quantity.
                # In that situation, the for loop will get over before all the positive quantity changes are exhausted
            
            m1_index = common_index_dict['m1_index_in_m2']
            # Make a copy because we will add new keys to this dictionary and append it to m1_list to keep track of when it was replenished also.
            temp_m1_sublist = m2_list[m1_index][-1].copy()
            
            # You'll understand purpose of this counter when u see where it is being used below
            counter_for_excess_qty = 0

            for side in ['bids']:
                for level in common_index_dict['common_index']:
                    # Summing all the values for all keys at that level
                    # total_qty bw (-inf, 0].
                    # 0 means fully replenished. We will be exporting the replenished list to antoher dict, so we wont come to that condition or requirement
                    total_qty = 0
                    for key in m2_list[m1_index][-1][side][level]:
                        total_qty += m2_list[m1_index][-1][side][level][key]
                    
                    # Intuitively undestand boundaries
                    not_fill_percentage_before_replenishment = total_qty / m2_list[m1_index][-1][side][level]['quantity']
                    fill_percentage_before_replenishment = 1 - not_fill_percentage_before_replenishment
                    quantity_before_replenishment = fill_percentage_before_replenishment*(-1)*(m2_list[m1_index][-1][side][level]['quantity'])
                    
                    # if fully replenished
                    if not_fill_percentage_before_replenishment == 0 :
                        # go to next level because the quantity at this level is fully replenished
                        continue

                    # if not fully replenished
                    if total_qty < 0:
                        
                        # this is the balance/excess of that level after replenishment so can be negative, 0 or positive.
                        quantity_after_replenishment = quantity_before_replenishment + ob_changes_original_copy[side][level]['quantity']
                        replenished_quantity = quantity_after_replenishment - quantity_before_replenishment
                        fill_percentage_after_replenishment = quantity_after_replenishment/ m2_list[m1_index][-1][side][level]['quantity']
                        
                        if fill_percentage_after_replenishment < 1.: # it means we need more to to replenish, so we just create a new key with the replenishment qty and % details    
                            
                            # Make the quantity 0 because we have replenished using all the quantity from the ob_changes
                            temp_changes[side][level] = 0

                            # Mark the replenishment time
                            # temp_m1_sublist['replenishment_time'] = ob_changes['end_time']
                            
                            # Creating a new key with the the 2 values we found. Key = 'percentage_replenished', value = new_qty_replenished
                            temp_m1_sublist[side][level][str(fill_percentage_after_replenishment)] = quantity_after_replenishment

                            # we go to the next level
                            continue

                        elif fill_percentage_after_replenishment == 1: 
                        # it means quantity replenished by new change perfectly replenishes the negative quantity
                        # key word: perfectly

                            # Make the quantity 0 because we have replenished using all the quantity from the ob_changes
                            temp_changes[side][level] = 0

                            # Creating a new key with the cumulative_replenished_qty 100% at that level
                            temp_m1_sublist[side][level]['100'] = quantity_after_replenishment

                            # we go to the next level
                            continue

                            # we add the time of ob_changes as a key to temp itself because we want to know when it got replenished
                            # temp_m1_sublist['time_of_replenishment'] = ob_changes['end_time']

                            # We append the new temp to the top of the m2_list[i].
                            # m2_list[i].insert(0, temp)
                        
                        elif fill_percentage_after_replenishment > 1: # it means we have excess quantity after filling that level in the previous snapshot

                            counter_for_excess_qty += 1

                            replenished_quantity = temp_m1_sublist[side][level]['quantity'] - quantity_before_replenishment
                            
                            # Reduce the quantity in temp_changes by replenished_quantity
                            temp_changes[side][level] = temp_changes[side][level] - replenished_quantity

                            # Creating a new key with the cumulative_replenished_qty 100% at that level
                            temp_m1_sublist[side][level]['100'] = (-1)*temp_m1_sublist[side][level]['quantity']     


                            # excess qty and updated m2_list(with new temp added) is passed into a function.
                            # we add the time of ob_changes as a key to temp itself because we want to know when it got replenished
                            # temp_m1_sublist['time_of_replenishment'] = ob_changes['end_time']

                            # recursive call???
                            # or just manually do it. it is not that complex to do recursion, just few loops only before it is fully replenished.

            # Now all the level's have been iterated over and replenishment done wherever applicable for the common_index_dict[i]

            # Mark the replenishment time
            temp_m1_sublist['replenishment_time'] = ob_changes_original_copy['end_time']

            # We append the new temp_m1_sublist to m2_list[m1_index]
            # After we have completed our replenishment, 
            m2_list[m1_index].append(temp_m1_sublist)

            # m3_list contains the fully replenished lists. it will be cut from m2_list and pasted to m3_list.
            # this way, m2_list only contains the lists than need to be replenished
            m2_list, m3_list = check_for_fully_replenished_list(m2_list, m3_list)

            # We don't need this because we are not replenshing anything
            # replenishment_dict_to_be_added_to_snapshot = replenish_dict_calculator(ob_changes, temp_changes)

    return m2_list, m3_list -->
