In [None]:
import os
import polars as pl
import dotenv
from tqdm import tqdm
import plotly.graph_objects as go
# Load variables from a local .env file into the process environment *before*
# reading them: os.getenv() only sees what is already in the environment, so
# calling it first would yield None for values defined only in .env.
dotenv.load_dotenv()

# Root folder of the data; the cells below read/write "<FOLDER_PATH>/<stock>/<file>".
FOLDER_PATH = os.getenv("FOLDER_PATH")
if FOLDER_PATH is None:
    # Fail early with a clear message instead of building paths like "None/INTC/...".
    raise RuntimeError(
        "FOLDER_PATH is not set; define it in the environment or in a .env file."
    )

In [3]:
# Load the INTC parquet file dated 2024-07-22 (per its file name) into a polars DataFrame.
df = pl.read_parquet(f"{FOLDER_PATH}/INTC/INTC_2024-07-22.parquet")

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [14]:
# Row count per publisher_id, largest group first.
num_entries_by_publisher = (
    df.group_by("publisher_id")
    .len()
    .sort("len", descending=True)
)

# When more than one publisher is present, keep only the rows of publisher 41
# (hard-coded id).
if num_entries_by_publisher.height > 1:
    df = df.filter(pl.col("publisher_id").eq(41))

In [None]:
# Per-publisher row counts (computed before the publisher filter was applied).
# A bare last expression lets the notebook render the frame with its rich display.
num_entries_by_publisher

In [16]:
# Keep only rows whose ts_event clock time lies between 09:30 and 16:00
# (every event inside the 16:00 minute is retained).
# NOTE(review): hour/minute are read from ts_event as stored — confirm the
# column's time zone matches the intended trading session.
ts_hour = pl.col("ts_event").dt.hour()
ts_minute = pl.col("ts_event").dt.minute()

from_open_to_ten = (ts_hour == 9) & (ts_minute >= 30)
ten_to_close = (ts_hour > 9) & (ts_hour < 16)
closing_minute = (ts_hour == 16) & (ts_minute == 0)

df = df.filter(from_open_to_ten | ten_to_close | closing_minute)

In [None]:
# Mid price = average of the best ask and the best bid.
# Non-finite values (NaN, +/-inf) are first turned into nulls, then every null
# is forward-filled with the last valid mid price. Unlike a single shift(1),
# this also repairs runs of consecutive bad values and covers infs and nulls.
mid_expr = (pl.col("ask_px_00") + pl.col("bid_px_00")) / 2
mid_price = df.select(
    pl.when(mid_expr.is_finite())
    .then(mid_expr)
    .otherwise(None)
    .fill_null(strategy="forward")
    .alias("mid_price")
).to_series()

In [None]:

# Plot best bid, best ask and mid price against event time.
fig = go.Figure()

# (y values, legend label, line colour) — traces are added in this order.
price_lines = [
    (df['bid_px_00'], 'Best Bid', 'blue'),
    (df['ask_px_00'], 'Best Ask', 'red'),
    (mid_price, 'Mid Price', 'black'),
]

for y_values, label, colour in price_lines:
    fig.add_trace(
        go.Scatter(
            x=df['ts_event'],
            y=y_values,
            mode='lines',
            name=label,
            line={'color': colour},
        )
    )

# Titles and legend
layout_options = {
    'title': 'Order Book and bid/ask',
    'xaxis_title': 'Time',
    'yaxis_title': 'Price',
    'showlegend': True,
}
fig.update_layout(**layout_options)

fig.show()



In [None]:
# Persist the filtered frame. polars has no module-level `save_parquet`;
# writing is done through DataFrame.write_parquet.
df.write_parquet(f"{FOLDER_PATH}/INTC/INTC_2024-07-22_curated.parquet")

In [None]:

# Batch curation: for each raw parquet file of every processed stock folder,
# apply the publisher filter and the time-window filter, add a cleaned
# `mid_price` column and write the result next to the input as
# "<original name>_curated.parquet".
# NOTE(review): `[0:1]` limits the run to the first entry returned by
# os.listdir — presumably a debugging limit; confirm before a full run.
for stock in tqdm(os.listdir(FOLDER_PATH)[0:1], desc="Stock treatment"):
    for file in tqdm(os.listdir(f"{FOLDER_PATH}/{stock}"), desc="File treatment"):
        # Skip anything that is not a raw parquet file, including the
        # "_curated" outputs written by a previous run of this cell.
        if not file.endswith(".parquet") or file.endswith("_curated.parquet"):
            continue

        # Load the file being treated, so each iteration starts from its own
        # data instead of whatever `df` was left in the notebook namespace.
        df = pl.read_parquet(f"{FOLDER_PATH}/{stock}/{file}")

        # Series.n_unique() returns a plain int; pl.col(...).n_unique() is only
        # an expression and cannot be used as the condition of an `if`.
        if df["publisher_id"].n_unique() > 1:
            df = df.filter(pl.col("publisher_id") == 41)
        else:
            # NOTE(review): assumes a single-publisher file always uses id 2 — confirm.
            df = df.filter(pl.col("publisher_id") == 2)

        if stock == "GOOGL":
            # Keep events whose hour is within 13..20 inclusive.
            # NOTE(review): this window differs from the 09:30-16:00 one used
            # for the other stocks — presumably a different time zone; confirm.
            df = df.filter(pl.col("ts_event").dt.hour() >= 13)
            df = df.filter(pl.col("ts_event").dt.hour() <= 20)
        else:
            # Keep events between 09:30 and 16:00 (whole 16:00 minute included).
            df = df.filter(
                (
                    (pl.col("ts_event").dt.hour() == 9) & (pl.col("ts_event").dt.minute() >= 30) |
                    (pl.col("ts_event").dt.hour() > 9) & (pl.col("ts_event").dt.hour() < 16) |
                    (pl.col("ts_event").dt.hour() == 16) & (pl.col("ts_event").dt.minute() == 0)
                )
            )

        # Mid price = average of best ask and best bid. NaN / +-inf are turned
        # into nulls and all nulls are forward-filled with the last valid value
        # (polars has no `fill_inf`, and a single shift(1) cannot repair runs
        # of consecutive bad values).
        mid_expr = (pl.col("ask_px_00") + pl.col("bid_px_00")) / 2
        df = df.with_columns(
            pl.when(mid_expr.is_finite())
            .then(mid_expr)
            .otherwise(None)
            .fill_null(strategy="forward")
            .alias("mid_price")
        )

        # Strip the ".parquet" extension properly (slicing 9 characters also
        # removed the last character of the file stem).
        stem = os.path.splitext(file)[0]
        df.write_parquet(f"{FOLDER_PATH}/{stock}/{stem}_curated.parquet")
        






