In [1]:
import pandas as pd

def calculate_time_since_start(df):
    # Assuming 'time' column is in datetime format
    start_time = df['time'].iloc[0]
    df['time_since_start'] = (df['time'] - start_time).dt.total_seconds() * 1000
    

def split_range_into_categories(min_val, max_val, num_categories=10):
    # Calculate the range
    range_size = max_val - min_val
    
    # Calculate the size of each category
    category_size = range_size / num_categories
    
    # Initialize a list to store the category boundaries
    categories = []
    
    # Iterate to determine the boundaries of each category
    for i in range(num_categories):
        category_min = min_val + i * category_size
        category_max = min_val + (i + 1) * category_size
        categories.append((category_min, category_max))
    
    # Ensure that the last category ends exactly at max_val
    categories[-1] = (categories[-1][0], max_val)

    return categories
    

def find_category_index(value, categories):
    for i, (category_min, category_max) in enumerate(categories):
        if category_min <= value <= category_max:
            return i
    return None  # If value is not within any category


def calculate_time_since_last_event(df):
    # Initialize variables to store last event time and mark
    last_event_time = None
    last_event_mark = None
    
    min_percentage_change = min(df['pctc'])
    max_percentage_change = max(df['pctc'])
    
    categories = split_range_into_categories(min_percentage_change, max_percentage_change)
    
    
    # Iterate through each row
    time_since_last_event = []
    for _, row in df.iterrows():
        # Check if the mark has changed since the last event
        # if row['close'] > row['open']:  # Assuming 1 indicates close price higher than open price
        #     mark = 1
        # else:
        #     mark = 0
        mark = find_category_index(row['pctc'], categories)

        # If it's the first event or the mark has changed, update the last event time and mark
        if last_event_time is None or mark != last_event_mark:
            last_event_time = row['time']
            last_event_mark = mark
            time_since_last_event.append(0)
        else:
            # Calculate the time difference since the last event with the same mark
            time_diff = (row['time'] - last_event_time).total_seconds() * 1000
            time_since_last_event.append(time_diff)

    # Add the 'time_since_last_event' column to the DataFrame
    df['time_since_last_event'] = time_since_last_event


In [17]:
df = pd.read_csv('NVDA.csv')
df['time'] = pd.to_datetime(df['time'])
df['pctc'] = df['close'].pct_change()
df = df.fillna(0)
df.head()

Unnamed: 0,time,open,high,low,close,Volume,Volume MA,pctc
0,2019-05-01 13:30:00+00:00,182.89,184.5,181.71,182.08,87009.0,58317.8,0.0
1,2019-05-01 14:30:00+00:00,182.25,183.31,181.39,181.48,32466.0,54655.85,-0.003295
2,2019-05-01 15:30:00+00:00,181.28,183.5,181.06,183.02,41981.0,53202.35,0.008486
3,2019-05-01 16:30:00+00:00,182.94,183.11,182.1,182.86,24737.0,51067.2,-0.000874
4,2019-05-01 17:30:00+00:00,182.88,184.77,182.77,184.71,33681.0,49363.5,0.010117


In [18]:
# Marking events as 1 if close price is higher than open price, else 0
min_percentage_change = min(df['pctc'])
max_percentage_change = max(df['pctc'])
categories = split_range_into_categories(min_percentage_change, max_percentage_change)
df['mark'] = df['pctc'].apply(lambda x: find_category_index(x, categories))

In [19]:
df

Unnamed: 0,time,open,high,low,close,Volume,Volume MA,pctc,mark
0,2019-05-01 13:30:00+00:00,182.890,184.50,181.71,182.08,87009.0,58317.80,0.000000,4
1,2019-05-01 14:30:00+00:00,182.250,183.31,181.39,181.48,32466.0,54655.85,-0.003295,4
2,2019-05-01 15:30:00+00:00,181.280,183.50,181.06,183.02,41981.0,53202.35,0.008486,4
3,2019-05-01 16:30:00+00:00,182.940,183.11,182.10,182.86,24737.0,51067.20,-0.000874,4
4,2019-05-01 17:30:00+00:00,182.880,184.77,182.77,184.71,33681.0,49363.50,0.010117,4
...,...,...,...,...,...,...,...,...,...
3630,2021-05-24 15:30:00+00:00,625.290,626.64,623.82,625.50,11932.0,40323.50,0.001120,4
3631,2021-05-24 16:30:00+00:00,626.195,627.67,625.04,627.67,11776.0,40224.95,0.003469,4
3632,2021-05-24 17:30:00+00:00,627.620,629.64,627.42,628.35,20193.0,40390.30,0.001083,4
3633,2021-05-24 18:30:00+00:00,628.860,629.51,626.32,626.48,9739.0,39795.35,-0.002976,4


In [20]:
# Calculating time since starting time in seconds
starting_time = df['time'].iloc[0]
df['time_since_start'] = (df['time'] - starting_time)

In [21]:
df

Unnamed: 0,time,open,high,low,close,Volume,Volume MA,pctc,mark,time_since_start
0,2019-05-01 13:30:00+00:00,182.890,184.50,181.71,182.08,87009.0,58317.80,0.000000,4,0 days 00:00:00
1,2019-05-01 14:30:00+00:00,182.250,183.31,181.39,181.48,32466.0,54655.85,-0.003295,4,0 days 01:00:00
2,2019-05-01 15:30:00+00:00,181.280,183.50,181.06,183.02,41981.0,53202.35,0.008486,4,0 days 02:00:00
3,2019-05-01 16:30:00+00:00,182.940,183.11,182.10,182.86,24737.0,51067.20,-0.000874,4,0 days 03:00:00
4,2019-05-01 17:30:00+00:00,182.880,184.77,182.77,184.71,33681.0,49363.50,0.010117,4,0 days 04:00:00
...,...,...,...,...,...,...,...,...,...,...
3630,2021-05-24 15:30:00+00:00,625.290,626.64,623.82,625.50,11932.0,40323.50,0.001120,4,754 days 02:00:00
3631,2021-05-24 16:30:00+00:00,626.195,627.67,625.04,627.67,11776.0,40224.95,0.003469,4,754 days 03:00:00
3632,2021-05-24 17:30:00+00:00,627.620,629.64,627.42,628.35,20193.0,40390.30,0.001083,4,754 days 04:00:00
3633,2021-05-24 18:30:00+00:00,628.860,629.51,626.32,626.48,9739.0,39795.35,-0.002976,4,754 days 05:00:00


In [22]:
# Convert time_since_start column to timedelta datatype
df['time_since_start'] = pd.to_timedelta(df['time_since_start'])

# Convert timedelta to seconds
df['time_since_start_seconds'] = df['time_since_start'].dt.total_seconds() / 60

In [23]:
df

Unnamed: 0,time,open,high,low,close,Volume,Volume MA,pctc,mark,time_since_start,time_since_start_seconds
0,2019-05-01 13:30:00+00:00,182.890,184.50,181.71,182.08,87009.0,58317.80,0.000000,4,0 days 00:00:00,0.0
1,2019-05-01 14:30:00+00:00,182.250,183.31,181.39,181.48,32466.0,54655.85,-0.003295,4,0 days 01:00:00,60.0
2,2019-05-01 15:30:00+00:00,181.280,183.50,181.06,183.02,41981.0,53202.35,0.008486,4,0 days 02:00:00,120.0
3,2019-05-01 16:30:00+00:00,182.940,183.11,182.10,182.86,24737.0,51067.20,-0.000874,4,0 days 03:00:00,180.0
4,2019-05-01 17:30:00+00:00,182.880,184.77,182.77,184.71,33681.0,49363.50,0.010117,4,0 days 04:00:00,240.0
...,...,...,...,...,...,...,...,...,...,...,...
3630,2021-05-24 15:30:00+00:00,625.290,626.64,623.82,625.50,11932.0,40323.50,0.001120,4,754 days 02:00:00,1085880.0
3631,2021-05-24 16:30:00+00:00,626.195,627.67,625.04,627.67,11776.0,40224.95,0.003469,4,754 days 03:00:00,1085940.0
3632,2021-05-24 17:30:00+00:00,627.620,629.64,627.42,628.35,20193.0,40390.30,0.001083,4,754 days 04:00:00,1086000.0
3633,2021-05-24 18:30:00+00:00,628.860,629.51,626.32,626.48,9739.0,39795.35,-0.002976,4,754 days 05:00:00,1086060.0


In [24]:
df['time_since_last_event'] = df.groupby('mark')['time_since_start'].diff().fillna(0)
df

Unnamed: 0,time,open,high,low,close,Volume,Volume MA,pctc,mark,time_since_start,time_since_start_seconds,time_since_last_event
0,2019-05-01 13:30:00+00:00,182.890,184.50,181.71,182.08,87009.0,58317.80,0.000000,4,0 days 00:00:00,0.0,0
1,2019-05-01 14:30:00+00:00,182.250,183.31,181.39,181.48,32466.0,54655.85,-0.003295,4,0 days 01:00:00,60.0,0 days 01:00:00
2,2019-05-01 15:30:00+00:00,181.280,183.50,181.06,183.02,41981.0,53202.35,0.008486,4,0 days 02:00:00,120.0,0 days 01:00:00
3,2019-05-01 16:30:00+00:00,182.940,183.11,182.10,182.86,24737.0,51067.20,-0.000874,4,0 days 03:00:00,180.0,0 days 01:00:00
4,2019-05-01 17:30:00+00:00,182.880,184.77,182.77,184.71,33681.0,49363.50,0.010117,4,0 days 04:00:00,240.0,0 days 01:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
3630,2021-05-24 15:30:00+00:00,625.290,626.64,623.82,625.50,11932.0,40323.50,0.001120,4,754 days 02:00:00,1085880.0,0 days 01:00:00
3631,2021-05-24 16:30:00+00:00,626.195,627.67,625.04,627.67,11776.0,40224.95,0.003469,4,754 days 03:00:00,1085940.0,0 days 01:00:00
3632,2021-05-24 17:30:00+00:00,627.620,629.64,627.42,628.35,20193.0,40390.30,0.001083,4,754 days 04:00:00,1086000.0,0 days 01:00:00
3633,2021-05-24 18:30:00+00:00,628.860,629.51,626.32,626.48,9739.0,39795.35,-0.002976,4,754 days 05:00:00,1086060.0,0 days 01:00:00


In [25]:
df['time_since_last_event'] = pd.to_timedelta(df['time_since_last_event'])

df['time_since_last_event_seconds'] = df['time_since_last_event'].dt.total_seconds() / 60
df

Unnamed: 0,time,open,high,low,close,Volume,Volume MA,pctc,mark,time_since_start,time_since_start_seconds,time_since_last_event,time_since_last_event_seconds
0,2019-05-01 13:30:00+00:00,182.890,184.50,181.71,182.08,87009.0,58317.80,0.000000,4,0 days 00:00:00,0.0,0 days 00:00:00,0.0
1,2019-05-01 14:30:00+00:00,182.250,183.31,181.39,181.48,32466.0,54655.85,-0.003295,4,0 days 01:00:00,60.0,0 days 01:00:00,60.0
2,2019-05-01 15:30:00+00:00,181.280,183.50,181.06,183.02,41981.0,53202.35,0.008486,4,0 days 02:00:00,120.0,0 days 01:00:00,60.0
3,2019-05-01 16:30:00+00:00,182.940,183.11,182.10,182.86,24737.0,51067.20,-0.000874,4,0 days 03:00:00,180.0,0 days 01:00:00,60.0
4,2019-05-01 17:30:00+00:00,182.880,184.77,182.77,184.71,33681.0,49363.50,0.010117,4,0 days 04:00:00,240.0,0 days 01:00:00,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3630,2021-05-24 15:30:00+00:00,625.290,626.64,623.82,625.50,11932.0,40323.50,0.001120,4,754 days 02:00:00,1085880.0,0 days 01:00:00,60.0
3631,2021-05-24 16:30:00+00:00,626.195,627.67,625.04,627.67,11776.0,40224.95,0.003469,4,754 days 03:00:00,1085940.0,0 days 01:00:00,60.0
3632,2021-05-24 17:30:00+00:00,627.620,629.64,627.42,628.35,20193.0,40390.30,0.001083,4,754 days 04:00:00,1086000.0,0 days 01:00:00,60.0
3633,2021-05-24 18:30:00+00:00,628.860,629.51,626.32,626.48,9739.0,39795.35,-0.002976,4,754 days 05:00:00,1086060.0,0 days 01:00:00,60.0


In [26]:
df.to_csv('NVDA_event.csv', index=False)