In [2]:
import pandas as pd
import os

In [3]:
def summarize_statistics(df, columns):
    """
    Calculate descriptive statistics for one or more columns in a DataFrame.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the data.
        columns (str or iterable): Column name(s) for which statistics are
            calculated. A single column may be given as a plain string.

    Returns:
        pd.DataFrame: A summary DataFrame with one row per requested column
            (index named "Variable") and the columns "mean", "median", "std",
            "min", "25%", "50%", "75%" and "max". "median" comes from
            ``Series.median()`` and "50%" from ``Series.describe()``; both are
            kept so the output layout stays unchanged.

    Raises:
        ValueError: If a requested column is not in ``df``, or if pandas does
            not produce numeric summary statistics for a column (as happens
            for text or categorical data).
    """
    # A bare string would otherwise be iterated character by character, and a
    # one-shot iterable would be exhausted by the validation pass below.
    columns = [columns] if isinstance(columns, str) else list(columns)

    # Ensure the specified columns exist in the DataFrame
    missing_cols = [col for col in columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"The following columns are not in the DataFrame: {missing_cols}")

    # Statistics read from describe(); median and std are computed directly.
    described_keys = ("mean", "min", "25%", "50%", "75%", "max")
    stat_names = ("mean", "median", "std", "min", "25%", "50%", "75%", "max")
    stats_dict = {name: [] for name in stat_names}

    # Calculate statistics for each column
    for col in columns:
        series = df[col]
        described = series.describe()

        # For non-numeric data describe() returns count/unique/top/freq, which
        # previously surfaced as an opaque KeyError: 'mean'.
        absent = [key for key in described_keys if key not in described.index]
        if absent:
            raise ValueError(
                f"Column {col!r} has no numeric summary statistics "
                f"(missing {absent}); only numeric columns can be summarized."
            )

        stats_dict["median"].append(series.median())
        stats_dict["std"].append(series.std())
        for key in described_keys:
            stats_dict[key].append(described[key])

    # Create a summary DataFrame
    summary_df = pd.DataFrame(stats_dict, index=columns)
    summary_df.index.name = "Variable"

    return summary_df

In [4]:
# Load the pickled DataFrame from the relative path data/block_trade_with_greeks.pkl
# (resolved against the kernel's current working directory).
# NOTE(security): unpickling can execute arbitrary code, so pd.read_pickle
# should only be used on files from a trusted source.
data_path = os.path.join("data", "block_trade_with_greeks.pkl")
df = pd.read_pickle(data_path)

In [5]:
# Display the DataFrame's column labels.
df.columns

Index(['id', 'index', 'date', 'date_unixtime', 'contract_size', 'action',
       'contract_name', 'iv', 'premium', 'index_price', 'expiry', 'strike',
       'type', 'current_date', 'time_to_maturity', 'risk_free_rate',
       'unique_id', 'forward_price', 'Action_multiplier', 'total_premium',
       'Delta', 'Gamma', 'Vega', 'Theta'],
      dtype='object')

In [7]:
# Columns of df to include in the descriptive-statistics table.
columns_to_summarize = [
    "contract_size",
    "total_premium",
    "time_to_maturity",
    "Delta",
    "Gamma",
    "Vega",
]
summary = summarize_statistics(df, columns_to_summarize)
# Transpose so each statistic is a row and each variable is a column.
print(summary.T)

Variable  contract_size  total_premium  time_to_maturity        Delta  \
mean          44.097681   3.775338e+03          0.117595     0.426740   
median        25.000000  -3.254100e+01          0.049019     0.063457   
std           77.517658   6.805667e+05          0.165138    30.034537   
min            0.100000  -1.451001e+08          0.000004 -1949.944619   
25%           12.500000  -2.165762e+04          0.019288    -6.816164   
50%           25.000000  -3.254100e+01          0.049019     0.063457   
75%           50.000000   2.561362e+04          0.141776     7.209121   
max         3000.000000   1.361051e+08          1.019064  1594.734993   

Variable     Gamma           Vega  
mean      0.000198     173.823717  
median   -0.000002      -0.156047  
std       0.009256    7895.422678  
min      -0.314616 -387749.474748  
25%      -0.001286   -1005.221734  
50%      -0.000002      -0.156047  
75%       0.001489    1149.560947  
max       0.404951  379024.188208  


In [9]:
# Write the transposed summary (statistics as rows, variables as columns) to
# summary_stat.csv, a relative path; index=True keeps the statistic labels as
# the first CSV column.
summary.T.to_csv("summary_stat.csv", index=True)