Flow Chart For Equity Transparency Calculations

In [0]:
pip install git+https://github.com/hadrilec/esma_data_py.git

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting git+https://github.com/hadrilec/esma_data_py.git
  Cloning https://github.com/hadrilec/esma_data_py.git to /tmp/pip-req-build-s6z8p4du
  Running command git clone --filter=blob:none --quiet https://github.com/hadrilec/esma_data_py.git /tmp/pip-req-build-s6z8p4du
  Resolved https://github.com/hadrilec/esma_data_py.git to commit fc1e65e8ddd13a7726ecc45725ca40c3e3e96edc
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import esma_data_py as edp

In [0]:
# Project-local helper called with the "REPO_FITRS" key before the other
# `src.*` imports; presumably it puts the repository on the import path —
# TODO confirm against src/repo_utils/utils.py.
from src.repo_utils.utils import append_repo_path
append_repo_path("REPO_FITRS")

# from src.mifid.utils.bundle_plots_and_send2me import bundle_plots_and_send2me

In [0]:
from src.mifid.utils.spark_interaction import save_spark_dynamic, save_spark_df

import os
from pathlib import Path
import plotly.express as px
import re
import plotly.graph_objects as go
from matplotlib.colors import to_rgb
import pandas as pd

In [0]:
def hex_to_rgba(hex_color, alpha=1.0):
    """Convert a hexadecimal colour string to an ``rgba(r, g, b, a)`` string.

    Parameters
    ----------
    hex_color : str
        Colour such as ``'#1f77b4'`` or ``'1f77b4'``. The CSS shorthand forms
        ``'#abc'`` / ``'#abcd'`` are expanded to ``'#aabbcc'`` / ``'#aabbccdd'``.
        Any alpha digits present in the string are ignored.
    alpha : float, default 1.0
        Opacity written as the fourth component of the result.

    Returns
    -------
    str
        ``'rgba(r, g, b, alpha)'`` with integer channels in 0-255.

    Raises
    ------
    ValueError
        If the string has too few digits or contains non-hex characters.
    """
    digits = hex_color.strip().lstrip('#')
    if len(digits) in (3, 4):
        # Shorthand notation: every digit stands for a doubled pair.
        digits = ''.join(ch * 2 for ch in digits)
    if len(digits) < 6:
        raise ValueError(f"Invalid hex colour: {hex_color!r}")
    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return f'rgba({r}, {g}, {b}, {alpha})'

def rgb_to_rgba(rgb_color, alpha=1.0):
    """Convert an ``rgb(...)`` / ``rgba(...)`` colour string to ``rgba`` with a new alpha.

    Parameters
    ----------
    rgb_color : str
        Colour such as ``'rgb(228,26,28)'``. An ``'rgba(r, g, b, a)'`` string
        is accepted too (its own alpha is discarded), because
        ``convert_to_rgba`` dispatches every string starting with ``'rgb'``
        to this function. Channels may be written as floats (``'10.0'``).
    alpha : float, default 1.0
        Opacity written as the fourth component of the result.

    Returns
    -------
    str
        ``'rgba(r, g, b, alpha)'`` with integer channels.

    Raises
    ------
    ValueError
        If the string does not contain three or four numeric components.
    """
    inner = rgb_color[rgb_color.find('(') + 1:rgb_color.rfind(')')]
    components = [part.strip() for part in inner.split(',')]
    if len(components) not in (3, 4):
        raise ValueError(f"Invalid rgb colour: {rgb_color!r}")
    # Go through float() so that channels such as '10.0' are accepted.
    r, g, b = (int(round(float(part))) for part in components[:3])
    return f'rgba({r}, {g}, {b}, {alpha})'

def named_to_rgba(named_color, alpha=1.0):
    """Turn a colour understood by matplotlib's ``to_rgb`` into an ``rgba(...)`` string.

    ``to_rgb`` yields three floats; each is scaled by 255 and truncated to an
    integer before being formatted together with ``alpha``.
    """
    red, green, blue = (int(level * 255) for level in to_rgb(named_color))
    return f'rgba({red}, {green}, {blue}, {alpha})'

def convert_to_rgba(color, alpha=1.0):
    """Return ``color`` as an ``rgba(...)`` string with the given ``alpha``.

    The converter is chosen from the prefix of ``color``: ``'#'`` selects the
    hex converter, ``'rgb'`` the rgb converter, and anything else is treated
    as a named colour.
    """
    if color.startswith('#'):
        converter = hex_to_rgba
    elif color.startswith('rgb'):
        converter = rgb_to_rgba
    else:
        converter = named_to_rgba
    return converter(color, alpha)

# Default palette: several plotly qualitative colour sets chained together.
colors = (
    px.colors.qualitative.Set1
    + px.colors.qualitative.Set2
    + px.colors.qualitative.Set3
    + px.colors.qualitative.Pastel1
    + px.colors.qualitative.Pastel2
)

# Semi-transparent counterpart of every palette entry.
opacity = 0.3
rgba_colors = [convert_to_rgba(shade, alpha=opacity) for shade in colors]

def make_sankey_plot(df, id1, id2, value, title='', colors=colors, title_left="<b>Year-1<b>", title_right = "<b>Year<b>"):
    """Build a two-column Sankey diagram of transitions from ``id1`` to ``id2``.

    Every distinct value found in ``df[id1]`` or ``df[id2]`` becomes one node
    in the left column (``id1``) and one node in the right column (``id2``).
    The width of a link is the number of rows of ``df`` (after dropping exact
    duplicates) sharing that ``(id1, id2)`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id1``, ``id2`` and ``value``. The frame
        passed in is not modified.
    id1, id2 : str
        Columns holding the category on the left / right side of the diagram.
    value : str
        Column that is counted per ``(id1, id2)`` pair.
    title : str, default ''
        Figure title; a caption with matching/changing shares is appended.
    colors : sequence of str, optional
        Node colours (hex, ``rgb(...)`` or named). Recycled when there are
        more labels than colours. The sequence passed in is never modified.
    title_left, title_right : str
        Headings placed above the left and right column.

    Returns
    -------
    (plotly.graph_objects.Figure, pandas.DataFrame)
        The figure and the aggregated link table: one row per ``(id1, id2)``
        pair with its count in ``value``, the node ids, the link colours and
        a constant ``matching_ratio`` column (percentage of rows where
        ``id1 == id2``).

    Raises
    ------
    ValueError
        If ``colors`` is empty although there is at least one label to colour.
    """
    # drop_duplicates returns a new frame; the explicit copy keeps the column
    # assignments below away from the caller's data.
    df = df.drop_duplicates().copy()
    for column in (id1, id2, value):
        df[column] = df[column].astype(str)

    list_list_thrs = sorted(set(df[id1]) | set(df[id2]))
    n_labels = len(list_list_thrs)

    # Node ids: 0..n-1 for the id1 (left) column, n..2n-1 for the id2 (right) column.
    df_lis_label = pd.DataFrame({id1: list_list_thrs,
                                 id1 + '_id': range(n_labels)})
    df_lis_fitrs_label = pd.DataFrame({id2: list_list_thrs,
                                       id2 + '_id': range(n_labels, 2 * n_labels)})

    # Work on a private copy: the default for `colors` is the module-level
    # palette, and growing it in place would leak into every later call.
    palette = list(colors)
    if n_labels and not palette:
        raise ValueError("colors must contain at least one colour")
    if n_labels:
        repeats = -(-n_labels // len(palette))  # ceiling division
        palette = (palette * repeats)[:n_labels]
    else:
        palette = []

    opacity = 0.3
    rgba_palette = [convert_to_rgba(color, alpha=opacity) for color in palette]

    df_colors = pd.DataFrame({'list_id': list_list_thrs,
                              "color": palette,
                              "color_light": rgba_palette})

    # One row per node, ordered by node id, carrying its label and colour.
    df_colors2 = (pd.concat([df_lis_fitrs_label.rename(columns = {id2: 'list_id', id2 + '_id': 'id_'}),
                            df_lis_label.rename(columns = {id1: 'list_id', id1 + '_id': 'id_'})])
                .merge(df_colors, on="list_id", how='left')
                .sort_values(['id_'], ascending=True)
                )

    # One row per link; links take the colour of their id1 (left) node.
    data = (df[[id1, id2, value]]
       .groupby([id1, id2], as_index=False)
       .count()
       .merge(df_lis_fitrs_label, on=id2, how='left')
       .merge(df_lis_label, on=id1, how='left')
       .merge(df_colors.rename(columns = {'list_id': id1}), on=id1, how='left')
       )

    # A boolean mask instead of DataFrame.query so that column names which are
    # not valid Python identifiers keep working.
    same_bucket = data[id1] == data[id2]
    n_isin_total = int(data[value].sum())
    n_isin_same_bucket = int(data.loc[same_bucket, value].sum())
    n_isin_different_bucket = n_isin_total - n_isin_same_bucket
    if n_isin_total:
        pct_isin_same_bucket = n_isin_same_bucket / n_isin_total * 100
        pct_isin_different_bucket = n_isin_different_bucket / n_isin_total * 100
    else:
        # Empty input: report zero shares instead of dividing by zero.
        pct_isin_same_bucket = 0.0
        pct_isin_different_bucket = 0.0

    add_caption = f"<br>Instruments whose indicator matches : <b>{pct_isin_same_bucket:.1f}%</b> ({n_isin_same_bucket:,.0f})"
    add_caption += f"<br>Instruments whose indicator changes : <b>{pct_isin_different_bucket:.1f}%</b> ({n_isin_different_bucket:,.0f})"
    add_caption += f"<br>Total number of instruments: {n_isin_total:,.0f}"
    data['matching_ratio'] = pct_isin_same_bucket

    # Close the bold tag so that only the title itself, not the caption, is bold.
    title = "<b>" + title + "</b>" + add_caption

    # Sankey draws sources on the left: id1 is the left column (title_left),
    # id2 the right column (title_right).
    source = list(data[id1 + '_id'])
    target = list(data[id2 + '_id'])
    values = list(data[value])
    color = list(data['color_light'])

    fig = go.Figure(data=[go.Sankey(
        node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = list(df_colors2['list_id']),
        color = list(df_colors2['color']),
        ),
        link = dict(
        source = source,
        target = target,
        value = values,
        label =  values,
        color =  color
    ))])

    fig = fig.update_layout(
        font_size=10,  width=700, height=700, plot_bgcolor='white',
        xaxis={
        'showgrid': False,
        'zeroline': False,
        'visible': False,
        },
        yaxis={
        'showgrid': False,
        'zeroline': False,
        'visible': False,
        },
        title={
            'text' : title,
            'y': 0.98
        })

    # Column headings: annotations at x = 0 and x = 1 on the hidden x axis.
    for x_coordinate, column_name in enumerate([title_left, title_right]):
        fig = fig.add_annotation(
                x=x_coordinate,
                y=1.05,
                xref="x",
                yref="paper",
                text=column_name,
                showarrow=False,
                font=dict(
                    size=16,
                    ),
                align="center",
                )

    return fig, data

In [0]:
#
# FYI: all information about the data and its classification can be found at the following link:
# https://www.esma.europa.eu/sites/default/files/2023-12/ESMA65-8-5240_FIRDS_Transparency_Download_Instructions.pdf

In [0]:
# Keep only the full equity ("_E_") file(s) created between 2024-03-01 and
# 2024-03-03, then take the download link of the first match.
files = (
    edp.mifid.get_mifid_file_list()
    .query("instrument_type == 'Equity Instruments'")
    .query("file_type == 'Full'")
    .query("file_name.str.contains('_E_')")
    .query("creation_date >= '2024-03-01'")
    .query("creation_date <= '2024-03-03'")
    .reset_index(drop=True)
)

eqt_esma_file = files["download_link"].tolist()[0]
print(eqt_esma_file)

http://fitrs.esma.europa.eu/fitrs/FULECR_20240302_E_1of1.zip


In [0]:
# Download the file selected above via esma_data_py; the result is filtered
# as a DataFrame in the next cell.
df = edp.download_file(eqt_esma_file)

1/1 files


In [0]:
# Keep only the records and columns relevant for this analysis: yearly
# methodology, periods starting in 2022 or later, columns 3 to 15 by
# position, minus the technical record id.
df = (
    df.query("EqtyTrnsprncyData_Mthdlgy == 'YEAR'")
    .query("FrDtToDt_FrDt >= '2022-01-01'")
    .iloc[:, 3:16]
    .drop(columns='EqtyTrnsprncyData_TechRcrdId')
)
df

Unnamed: 0,FrDtToDt_ToDt,FrDtToDt_FrDt,EqtyTrnsprncyData_FinInstrmClssfctn,EqtyTrnsprncyData_Id,RlvntMkt_AvrgDalyNbOfTxs,RlvntMkt_Id,Sttstcs_AvrgDalyNbOfTxs,Sttstcs_LrgInScale,Sttstcs_AvrgDalyTrnvr,EqtyTrnsprncyData_Lqdty,Sttstcs_StdMktSz,Sttstcs_AvrgTxVal
11327,2022-12-31,2022-01-01,SHRS,LU2263803020,0.07393,XGAT,2.19845,15000,5111.86928,false,,
11328,2023-12-31,2023-01-01,SHRS,NL0010872420,614.176,XNCM,45.49804,15000,39385.45454,false,,
11329,2022-12-31,2022-01-01,SHRS,NL0010872420,1992.24,XGAT,60.28405,30000,90978.93887,false,,
11331,2023-12-31,2023-01-01,SHRS,NO0012953720,140.95313,XOAS,147.50782,30000,79195.1991,false,,
11333,2023-12-31,2023-01-01,SHRS,PR67103X1020,0.01177,FRAB,0.36079,15000,453.44441,false,,
...,...,...,...,...,...,...,...,...,...,...,...,...
157455,2023-12-31,2023-01-01,SHRS,US1258968379,0,FRAB,0,15000,0,false,,
157458,2022-12-31,2022-01-01,SHRS,US72942A1079,0.34631,XGAT,2.40078,15000,3754.12716,false,,
157459,2023-12-31,2023-01-01,SHRS,US72942A1079,0.27235,HAMN,8.6383,15000,1495.03474,false,,
157460,2022-12-31,2022-01-01,SHRS,US70465T1079,87.2,XGAT,18.1751,15000,21735.53947,false,,


In [0]:
def create_sankey_datasets(df, year_from=2022, year_to=2023):
    """Build, per column, the year-over-year comparison table and its Sankey plot.

    Rows are assigned to a year by the prefix of ``FrDtToDt_ToDt``. For every
    column other than the two date columns and ``EqtyTrnsprncyData_Id``, the
    ``year_from`` and ``year_to`` values are aligned on
    ``EqtyTrnsprncyData_Id``; only instruments with a non-null value in both
    years are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``FrDtToDt_ToDt``, ``FrDtToDt_FrDt`` and
        ``EqtyTrnsprncyData_Id``. The frame is not modified.
    year_from, year_to : int or str, default 2022 and 2023
        Years to compare; they also name the ``Year_<year>`` columns of the
        resulting tables.

    Returns
    -------
    (dict, dict)
        ``sankey_datasets`` maps each processed column to its merged table.
        ``sankey_plots`` maps each column to its figure; columns whose merged
        table is empty have no entry, since there is nothing to draw.
    """
    id_column = 'EqtyTrnsprncyData_Id'
    skipped = ('FrDtToDt_ToDt', 'FrDtToDt_FrDt', id_column)
    columns_to_process = [col for col in df.columns if col not in skipped]

    # String view of the period end date, kept local so the caller's frame
    # is not changed as a side effect.
    period_end = df['FrDtToDt_ToDt'].astype(str)
    df_from = df[period_end.str.startswith(str(year_from))]
    df_to = df[period_end.str.startswith(str(year_to))]

    label_from = f'Year_{year_from}'
    label_to = f'Year_{year_to}'

    sankey_datasets = {}
    sankey_plots = {}

    for column in columns_to_process:
        from_subset = df_from[[id_column, column]].rename(columns={column: label_from})
        to_subset = df_to[[id_column, column]].rename(columns={column: label_to})

        # Outer join on the instrument id, then keep only instruments that
        # have a value in both years.
        merged_df = pd.merge(from_subset, to_subset, on=id_column, how='outer')
        merged_df = merged_df.dropna(subset=[label_from, label_to])

        sankey_datasets[column] = merged_df

        if merged_df.empty:
            # Nothing to plot, e.g. a column that is entirely null.
            continue

        sankey_plot, _ = make_sankey_plot(
            df=merged_df,
            id1=label_from,
            id2=label_to,
            value=id_column,
            title=f"Sankey Plot for {column}",
            title_left=f"<b>Year {year_from}<b>",
            title_right=f"<b>Year {year_to}<b>",
        )
        sankey_plots[column] = sankey_plot

    return sankey_datasets, sankey_plots

# Run on the filtered frame `df` built above; the previous argument `x` is not
# defined anywhere in this notebook and fails on a fresh kernel.
sankey_datasets, sankey_plots = create_sankey_datasets(df)

In [0]:
# Columns whose year-over-year Sankey plots are displayed.
keys = [
    "RlvntMkt_Id",
    "Sttstcs_LrgInScale",
    "EqtyTrnsprncyData_Lqdty",
]
for key in keys:
    print(f"Display Sankey plot for: {key}")
    display(sankey_plots[key])

Display Sankey plot for: RlvntMkt_Id


Display Sankey plot for: Sttstcs_LrgInScale


Display Sankey plot for: EqtyTrnsprncyData_Lqdty
