In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Switch to the project folder on Drive; later cells open and write files by relative path.
%cd /content/drive/MyDrive/Simulation/

/content/drive/MyDrive/Simulation


In [4]:
import os
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools
from statsmodels.tsa.stattools import adfuller

import seaborn as sns
from sklearn.preprocessing import StandardScaler
import pickle


In [5]:
def data_read_manip(symbol,year,month):
  """Load one month of 1-minute bars for a symbol into a DataFrame.

  Reads ``<year>/<symbol>/<year>_<month>.json`` relative to the current
  working directory and extracts the close price and volume of every entry
  stored under the ``'Time Series (1min)'`` key.

  Args:
    symbol: Ticker directory name, e.g. ``'VOO'``.
    year: Year used for both the directory and the file name, e.g. ``'2010'``.
    month: Month part of the file name, e.g. ``'10'``. It is used verbatim;
      no zero-padding is applied.

  Returns:
    A DataFrame with columns ``Time`` (parsed as datetime), ``Close`` (float)
    and ``Volume`` (int). Rows are in the reverse of the file's entry order.

  Raises:
    FileNotFoundError: If the JSON file does not exist.
    KeyError: If the time-series key or a row's ``'4. close'`` /
      ``'5. volume'`` field is missing.
    ValueError: If a close/volume value or a timestamp cannot be parsed.
  """
  use_tag = 'Time Series (1min)'

  file_path = f"{year}/{symbol}/{year}_{month}.json"
  with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  # Collect in file order with O(1) appends; the reversal happens once below
  # instead of paying for list.insert(0, ...) on every row.
  times = []
  closes = []
  volumes = []
  for date_time, price_data in data[use_tag].items():
    times.append(date_time)
    closes.append(float(price_data['4. close']))
    volumes.append(int(price_data['5. volume']))

  # Rows are emitted in reverse file order.
  # NOTE(review): presumably the files list the newest bar first, which would
  # make the result chronological -- confirm against the data source.
  df = pd.DataFrame({
    'Time': times[::-1],
    'Close': closes[::-1],
    'Volume': volumes[::-1],
  })

  df['Time'] = pd.to_datetime(df['Time'], format='%Y-%m-%d %H:%M:%S')

  return df

In [6]:
# Ticker and first year to load; both are strings because they are spliced into file paths.
symbol, year = 'VOO', '2010'

In [7]:
# Load months 10-12 of `year` and stack them with a single concat instead of
# re-copying the accumulated frame once per month.
data = pd.concat(
  [data_read_manip(symbol, year, month) for month in ['10', '11', '12']]
)

In [8]:
# Append every month of 2011-2024 to what is already in `data`.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies the whole (growing) frame on every month.
# `data` is only rebound after all files have loaded, so a missing file leaves
# it untouched rather than partially extended.
monthly_frames = [data]
for year in [str(y) for y in range(2011, 2025)]:
  print("We are in year: ",year)
  for month in [f"{m:02d}" for m in range(1, 13)]:
    monthly_frames.append(data_read_manip(symbol,year,month))

data = pd.concat(monthly_frames)

We are in year:  2011
We are in year:  2012
We are in year:  2013
We are in year:  2014
We are in year:  2015
We are in year:  2016
We are in year:  2017
We are in year:  2018
We are in year:  2019
We are in year:  2020
We are in year:  2021
We are in year:  2022
We are in year:  2023
We are in year:  2024


In [9]:
# Append the available 2025 months (01-03) to `data` with a single concat.
symbol = 'VOO'
year = '2025'
data = pd.concat(
  [data] + [data_read_manip(symbol, year, month) for month in ['01', '02', '03']]
)

In [10]:
# 1. Sort chronologically and index by timestamp.
# Guarded so the cell can be re-run: after the first run 'Time' is already the
# index, and set_index('Time') would raise KeyError.
if 'Time' in data.columns:
    data = data.sort_values('Time').set_index('Time')

# 2. Define Market Hours (adjust if your data is not in Eastern Time)
# VOO trades 09:30 to 16:00
market_open = pd.Timestamp("09:30").time()
market_close = pd.Timestamp("16:00").time()

# 3. Filter for Intraday minutes only
# This removes pre-market and post-market activity if present.
# between_time includes both endpoints by default, so the 16:00 bar is kept.
data_intraday = data.between_time(market_open, market_close).copy()

# 4. Calculate returns grouped by date
# This prevents the first minute of Monday from calculating a return against Friday close
data_intraday['date'] = data_intraday.index.date
data_intraday['return'] = data_intraday.groupby('date')['Close'].pct_change()

# 5. Separate the "Overnight Gaps" into a different series for later use
# Gap = (Today's Open / Yesterday's Close) - 1
# Built from the market-hours frame so that "open"/"close" are the first and
# last regular-session bars; using the unfiltered frame would pick up
# pre-/post-market bars whenever the files contain them.
# The open is approximated by the Close of the day's first bar, because only
# Close and Volume are loaded.
daily_open_close = (
    data_intraday.groupby(data_intraday.index.date)['Close'].agg(['first', 'last'])
)
overnight_gaps = (daily_open_close['first'] / daily_open_close['last'].shift(1)) - 1
overnight_gaps = overnight_gaps.dropna()

# 6. Final cleaning for the Markov Model
# We drop NaNs (the first minute of every day) to get clean transitions
clean_data = data_intraday.dropna(subset=['return']).copy()

print(f"Total Intraday Minutes: {len(clean_data)}")
print(f"Total Overnight Gaps Captured: {len(overnight_gaps)}")

Total Intraday Minutes: 1295208
Total Overnight Gaps Captured: 3645


In [11]:
# Saving as Parquet
# Create the output folder first so the write does not fail on a fresh
# checkout / new Drive folder after the long load above.
output_path = 'Intraday_Clean_Data/VOO_clean_intraday_2010_2025.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
clean_data.to_parquet(output_path)