In [1]:
import os
import json
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt


In [8]:
# Fonction pour charger les données JSON
def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Charger les fichiers JSON
condition = load_json('/users/eleves-a/2022/janis.aiad/3A/EAP1/HFT_QR_RL/HFT_QR_RL/data/smash2/data/dbn/condition.json')
manifest = load_json('/users/eleves-a/2022/janis.aiad/3A/EAP1/HFT_QR_RL/HFT_QR_RL/data/smash2/data/dbn/manifest.json')
metadata = load_json('/users/eleves-a/2022/janis.aiad/3A/EAP1/HFT_QR_RL/HFT_QR_RL/data/smash2/data/csv/metadata.json')

# Fonction pour charger les données CSV
def load_csv(stock, date):
    file_path = f'/users/eleves-a/2022/janis.aiad/3A/EAP1/HFT_QR_RL/HFT_QR_RL/data/smash2/data/csv/{stock}/{date}.csv'
    return pd.read_csv(file_path)

# Spécifier les dates et les stocks
dates = ["20240624", "20240625", "20240626", "20240627", "20240628", "20240701", "20240702", "20240703", "20240705", "20240708", "20240709", "20240710", "20240711", "20240712", "20240715", "20240716", "20240717", "20240718", "20240719", "20240722", "20240723", "20240724", "20240725", "20240726", "20240729", "20240730", "20240731", "20240801", "20240802", "20240805", "20240806", "20240807", "20240808"]
stocks = ["HL"]
# Charger les données pour chaque stock et chaque date dans des datasets différents
data_dict = {}
for stock in stocks:
    data_dict[stock] = {}
    for date in dates:
        data_dict[stock][date] = load_csv(stock, date).sample(frac=0.1, random_state=1)
# Concaténer toutes les données
data_list = [data_dict[stock][date] for stock in stocks for date in dates]
data = pd.concat(data_list, ignore_index=True)

# Filtrer par publisher_id = 39
data = data[data['publisher_id'] == 39]

# Convertir ts_event en datetime
data['ts_event'] = pd.to_datetime(data['ts_event'], utc=True)
data = data.sort_values(by='ts_event')


In [10]:
data.head()

Unnamed: 0,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,flags,...,ask_sz_08,bid_ct_08,ask_ct_08,bid_px_09,ask_px_09,bid_sz_09,ask_sz_09,bid_ct_09,ask_ct_09,symbol
5419,2024-06-24 13:29:35.778062086+00:00,10,39,7733,A,B,0,5.11,1200,130,...,0,0,0,,,0,0,0,0,HL
4004,2024-06-24 13:30:00.974347194+00:00,10,39,7733,A,A,0,5.23,600,130,...,0,0,0,,,0,0,0,0,HL
143,2024-06-24 13:30:01.001463467+00:00,10,39,7733,A,B,0,5.21,600,130,...,0,0,0,,,0,0,0,0,HL
4188,2024-06-24 13:30:01.192537221+00:00,10,39,7733,A,A,0,5.23,600,130,...,0,0,0,,,0,0,0,0,HL
5694,2024-06-24 13:30:03.612878176+00:00,10,39,7733,C,B,0,5.2,600,130,...,0,0,0,,,0,0,0,0,HL


In [14]:
from scipy.stats import pearsonr

# Fonction pour calculer la corrélation avec la fonction identité
def calculate_correlation(data, bucket_number):
    data['imbalance_bucket'] = data['imbalance'].apply(lambda x: round(bucket_number * x) if pd.notna(x) else None)
    data = data.dropna(subset=['imbalance_bucket'])
    mean_delta_mid_price = data.groupby('imbalance_bucket')['delta_mid_price'].mean().reset_index()
    correlation, _ = pearsonr(mean_delta_mid_price['imbalance_bucket'], mean_delta_mid_price['delta_mid_price'])
    return correlation

# Fonction pour tracer et sauvegarder les graphiques pour chaque jour
def plot_and_save(data, date, bucket_number):
    data['imbalance_bucket'] = data['imbalance'].apply(lambda x: round(bucket_number * x) if pd.notna(x) else None)
    data = data.dropna(subset=['imbalance_bucket'])
    mean_delta_mid_price = data.groupby('imbalance_bucket')['delta_mid_price'].mean().reset_index()
    
    plt.figure(figsize=(14, 7))
    plt.plot(mean_delta_mid_price['imbalance_bucket'], mean_delta_mid_price['delta_mid_price'], marker='o', label='Mean Delta Mid Price')
    plt.title(f"Mean Delta Mid Price in Horizon 10 Trades vs Imbalance Buckets for {date}")
    plt.xlabel(f"Imbalance Buckets (round({bucket_number}*imbalance))")
    plt.ylabel("Mean Delta Mid Price in Horizon 10 Trades")
    plt.legend()
    plt.savefig(f"imbalance_plot_{date}.png")
    plt.close()

# Calculer l'imbalance des meilleures offres et demandes
for df in data_list:
    df['imbalance'] = (df['bid_sz_00'] - df['ask_sz_00']) / (df['bid_sz_00'] + df['ask_sz_00'])

# Calculer le prix moyen
for df in data_list:
    df['mid_price'] = (df['bid_px_00'] + df['ask_px_00']) / 2

# Calculer le delta du prix moyen sur un horizon de 10 transactions
for df in data_list:
    df['delta_mid_price'] = df['mid_price'].diff(periods=100)

# Filtrer les données pour ne conserver que les lignes où delta_mid_price n'est pas NaN
data_filtered_list = [df.dropna(subset=['delta_mid_price']) for df in data_list]

# Liste pour stocker les meilleurs bucket_numbers
best_buckets = []

# Appliquer la fonction pour chaque jour
for date, df in zip(dates, data_filtered_list):
    if not df.empty:
        best_correlation = -1
        best_bucket_number = 5
        for bucket_number in range(5, 11):
            correlation = calculate_correlation(df, bucket_number)
            if correlation > best_correlation:
                best_correlation = correlation
                best_bucket_number = bucket_number
        best_buckets.append((date, best_bucket_number))
        plot_and_save(df, date, best_bucket_number)

# Afficher la liste des meilleurs bucket_numbers
print(best_buckets)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

[('20240624', 7), ('20240625', 5), ('20240626', 8), ('20240627', 6), ('20240628', 9), ('20240701', 6), ('20240702', 6), ('20240703', 8), ('20240705', 6), ('20240708', 5), ('20240709', 5), ('20240710', 5), ('20240711', 5), ('20240712', 5), ('20240715', 10), ('20240716', 6), ('20240717', 6), ('20240718', 5), ('20240719', 8), ('20240722', 5), ('20240723', 6), ('20240724', 5), ('20240725', 8), ('20240726', 5), ('20240729', 5), ('20240730', 6), ('20240731', 6), ('20240801', 5), ('20240802', 5), ('20240805', 9), ('20240806', 5), ('20240807', 6), ('20240808', 5)]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [5]:
print(data)

                                 ts_event  rtype  publisher_id  instrument_id  \
30    2024-06-26 13:30:01.145614335+00:00     10            39           7733   
63    2024-06-26 13:30:04.253249658+00:00     10            39           7733   
77    2024-06-26 13:30:05.756353339+00:00     10            39           7733   
88    2024-06-26 13:30:06.033855178+00:00     10            39           7733   
140   2024-06-26 13:30:10.860379015+00:00     10            39           7733   
...                                   ...    ...           ...            ...   
73218 2024-06-26 19:59:59.022678553+00:00     10            39           7733   
73230 2024-06-26 19:59:59.987236162+00:00     10            39           7733   
73234 2024-06-26 19:59:59.988039341+00:00     10            39           7733   
73235 2024-06-26 19:59:59.988079522+00:00     10            39           7733   
73237 2024-06-26 19:59:59.993085356+00:00     10            39           7733   

      action side  depth  p