In [1]:
import pandas as pd

### Video transcript & metadata dataset statistics

In [4]:
# Read the semicolon-separated video metadata CSV and list its columns.
video_metadata_path = "../data/yt_metadata/video_metadata.csv"
df = pd.read_csv(video_metadata_path, sep=";")
df.columns

Index(['uploader_id', 'video_id', 'upload_date', 'yt_video_type', 'view_count',
       'duration', 'language', 'title', 'description', 'yt_auto_categories',
       'tags', 'first_three_tags', 'like_count', 'comment_count', 'age_limit',
       'chapters', 'uploader', 'has_chapters', 'transcript'],
      dtype='object')

In [7]:
# Dataset overview: channel/video counts first, then total, mean and median
# of each engagement measure (all rounded to whole numbers).
video_type = df["yt_video_type"]
print("General")
print(f"unique channels: {df['uploader_id'].nunique()}")
print(f"unique videos: {df['video_id'].nunique()}")
print(f"n normal videos: {df[video_type == 'video'].shape[0]}")
print(f"n shorts: {df[video_type == 'short'].shape[0]}")
print("Engagement")
for label, column in [("views", "view_count"), ("likes", "like_count"), ("comments", "comment_count")]:
    metric = df[column]
    print(f"total {label}: {round(metric.sum())}")
    print(f"mean video {label}: {round(metric.mean())}")
    print(f"median video {label}: {round(metric.median())}")

General
unique channels: 231
unique videos: 45968
n normal videos: 38985
n shorts: 6983
Engagement
total views: 10066546816
mean video views: 218990
median video views: 35984
total likes: 453588875
mean video likes: 9899
median video likes: 1290
total comments: 17806533
mean video comments: 396
median video comments: 157


### Asset Data Statistics

In [12]:
# load price data dfs
import pandas as pd

name_list = ["stocks", "etfs", "cryptos", "commodities"]
source_list = ["eodhd", "eodhd", "eodhd", "yahoo finance & eodhd"]
exchanges_list = ["NYSE, NASDAQ", "NYSE, NASDAQ, NYSE Arca", "n/a", "n/a"]
asset_dir = "../data/asset_data/names_and_tickers_with_price_availability"

# One names-and-tickers table per asset class, keyed by the asset class name.
dfs = {}
for asset_class, source, exchanges in zip(name_list, source_list, exchanges_list):
    table = pd.read_csv(f"{asset_dir}/{asset_class}.csv", sep=";")
    dfs[asset_class] = table
    print(asset_class)
    print(f"data source: {source}")
    print(f"exchanges covered: {exchanges}")
    # print(f"total assets: {table.shape[0]}")
    n_with_prices = table["has_price_data"].sum()
    print(f"assets with (any) price data during observation period: {n_with_prices}")
    # print(f"assets with likely data errors: {table['price_data_errors_likely'].sum()}")
    n_with_returns = table["has_returns"].sum()
    print(f"assets meeting data quality standard: {n_with_returns}")

stocks
data source: eodhd
exchanges covered: NYSE, NASDAQ
assets with (any) price data during observation period: 13139
assets meeting data quality standard: 10947
etfs
data source: eodhd
exchanges covered: NYSE, NASDAQ, NYSE Arca
assets with (any) price data during observation period: 3730
assets meeting data quality standard: 3182
cryptos
data source: eodhd
exchanges covered: n/a
assets with (any) price data during observation period: 6781
assets meeting data quality standard: 4576
commodities
data source: yahoo finance & eodhd
exchanges covered: n/a
assets with (any) price data during observation period: 26
assets meeting data quality standard: 26


In [3]:
# Inspect which columns the stocks names-and-tickers table provides.
dfs["stocks"].columns

Index(['Code', 'Name', 'Country', 'Exchange', 'Currency', 'Type', 'Isin',
       'delisted_as_of_may_2024', 'in_sp500_as_of_may_2024',
       'earliest_price_date', 'latest_price_date', 'has_price_data',
       'n_trading_days_between_earliest_and_latest',
       'n_prices_between_earliest_and_latest',
       'n_missing_prices_between_earliest_and_latest', 'missing_prices_ratio',
       'longest_missing_price_sequence', 'price_data_errors_likely',
       'has_returns'],
      dtype='object')

### Extractions Overview

In [1]:
# load video-level matching data
import pandas as pd
import json

path = "../data/matched/VIDEOS_inf_llama3_ft_v4_q8_0_llamacpp_guided.csv"
df = pd.read_csv(path, sep=";")
# the trade_info_incl_neutrals column contains the essential extraction information
# (ticker, asset_type, sentiment) as a JSON list per video.
# Parse it while passing missing cells through untouched: json.loads raises a
# TypeError on NaN, so missing values have to survive parsing and be dropped below.
df["trade_info_incl_neutrals"] = df["trade_info_incl_neutrals"].map(json.loads, na_action="ignore")
# remove rows without extraction information (missing cell or JSON null)
df = df[~df["trade_info_incl_neutrals"].isna()]
# explode json list in trade_info_incl_neutrals column (1 row per extraction);
# a video with an empty list is kept as a single row holding NaN
df = df.explode("trade_info_incl_neutrals")
print(df.shape)
# pull the extraction attributes into their own columns (NA where there is no extraction)
for attr in ["ticker", "asset_type", "sentiment"]:
    df[attr] = df["trade_info_incl_neutrals"].apply(lambda x: pd.NA if pd.isna(x) else x[attr])

# crosstabs (absolute counts and percentages of all extractions, both with margins)
ct_totals = pd.crosstab(df['asset_type'], df['sentiment'], margins=True)
ct_perc = pd.crosstab(df['asset_type'], df['sentiment'], normalize='all', margins=True) * 100

# fixed row order for display, margin row last
row_order = ["stock", "crypto", "etf", "commodity", "All"]

print("totals:")
print(ct_totals.reindex(row_order))

print("\npercentages:")
print(ct_perc.round(1).reindex(row_order))

# combined totals and percentages
#ct_combined = pd.concat([ct_totals, ct_perc], keys=["totals", "percentages"])
#print("\nCombined crosstab:")
#print(ct_combined)

Index(['video_id', 'extractions_all', 'extractions_dedup_retain_unmatched',
       'extractions_dedup', 'trade_info_incl_neutrals',
       'trade_info_no_neutrals'],
      dtype='object')


In [6]:
# load video-level matching data
import pandas as pd
import json

path = "../data/matched/VIDEOS_inf_llama3_ft_v4_q8_0_llamacpp_guided.csv"
df = pd.read_csv(path, sep=";")

# flag videos whose extraction cell is the literal empty JSON list "[]"
df["is_empty"] = df["trade_info_incl_neutrals"] == "[]"
# absolute counts first, then the same split as percentages
empty_counts = df["is_empty"].value_counts()
empty_shares = df["is_empty"].value_counts(normalize=True) * 100
print(empty_counts)
print(empty_shares)

is_empty
True     27919
False    18048
Name: count, dtype: int64
is_empty
True     60.73705
False    39.26295
Name: proportion, dtype: float64


### Rec preceding and following return analysis

In [29]:
### buy_asset_class_... two tables
# load
import pandas as pd

df = pd.read_csv("../data/rec_analysis/buy_asset_class_abs_prefol_analysis_results.csv", sep=";")
print(df.columns)
# drop the columns that are not shown in the table
df = df.drop(columns=["sentiment", "pre_or_post"])
# rounding and formatting (numbers become strings)
q_cols = ["q10", "q33", "q50", "q66", "q90"]
test_cols = ["p_two_sided", "p_greater", "p_less"]
# quantiles: scale to percent, 1 decimal place
df[q_cols] = df[q_cols].mul(100).map("{:.1f}".format)
# test p-values: 5 decimal places
df[test_cols] = df[test_cols].map("{:.5f}".format)
print(df.to_latex(index=False))

Index(['pre_or_post', 'sentiment', 'asset_class', 'time_period', 'n', 'q10',
       'q33', 'q50', 'q66', 'q90', 'p_two_sided', 'p_greater', 'p_less'],
      dtype='object')
\begin{tabular}{llrllllllll}
\toprule
asset_class & time_period & n & q10 & q33 & q50 & q66 & q90 & p_two_sided & p_greater & p_less \\
\midrule
stock & 1w & 24157 & -8.7 & -1.5 & 0.9 & 3.3 & 14.7 & 0.00000 & 0.00000 & 1.00000 \\
stock & 1m & 24049 & -16.4 & -2.5 & 2.7 & 8.3 & 36.1 & 0.00000 & 0.00000 & 1.00000 \\
stock & 1y & 21956 & -24.7 & 10.0 & 32.3 & 67.7 & 469.0 & 0.00000 & 0.00000 & 1.00000 \\
etf & 1w & 1501 & -4.5 & -0.5 & 0.4 & 1.4 & 5.2 & 0.00000 & 0.00000 & 1.00000 \\
etf & 1m & 1497 & -7.9 & -0.1 & 1.7 & 3.7 & 12.4 & 0.00000 & 0.00000 & 1.00000 \\
etf & 1y & 1461 & -8.8 & 8.8 & 18.3 & 31.0 & 79.9 & 0.00000 & 0.00000 & 1.00000 \\
crypto & 1w & 11171 & -15.3 & -2.3 & 3.5 & 11.0 & 37.3 & 0.00000 & 0.00000 & 1.00000 \\
crypto & 1m & 11075 & -24.5 & -0.5 & 17.4 & 37.5 & 137.1 & 0.00000 & 0.00000 & 1.00000 \

In [34]:
### buy_sell_... two tables
# load
import pandas as pd

df = pd.read_csv("../data/rec_analysis/buy_sell_abs_prefol_analysis_results.csv", sep=";")
print(df.columns)
# drop the column that is not shown in the table
df = df.drop(columns=["pre_or_post"])
# rounding and formatting (numbers become strings)
q_cols = ["q10", "q33", "q50", "q66", "q90"]
test_cols = ["p_two_sided", "p_greater", "p_less"]
# quantiles: scale to percent, 1 decimal place
df[q_cols] = df[q_cols].mul(100).map("{:.1f}".format)
# test p-values: 5 decimal places
df[test_cols] = df[test_cols].map("{:.5f}".format)
print(df.to_latex(index=False))

Index(['pre_or_post', 'sentiment', 'time_period', 'n', 'q10', 'q33', 'q50',
       'q66', 'q90', 'p_two_sided', 'p_greater', 'p_less'],
      dtype='object')
\begin{tabular}{llrllllllll}
\toprule
sentiment & time_period & n & q10 & q33 & q50 & q66 & q90 & p_two_sided & p_greater & p_less \\
\midrule
buy & 1w & 37687 & -10.5 & -1.5 & 1.1 & 4.2 & 20.9 & 0.00000 & 0.00000 & 1.00000 \\
buy & 1m & 37478 & -18.4 & -2.0 & 3.9 & 12.0 & 62.2 & 0.00000 & 0.00000 & 1.00000 \\
buy & 1y & 33424 & -20.9 & 18.6 & 52.2 & 153.1 & 1121.5 & 0.00000 & 0.00000 & 1.00000 \\
sell & 1w & 3067 & -14.9 & -5.3 & -1.4 & 1.3 & 11.8 & 0.00000 & 1.00000 & 0.00000 \\
sell & 1m & 3046 & -26.6 & -8.2 & -0.9 & 4.8 & 37.9 & 0.00033 & 0.99983 & 0.00017 \\
sell & 1y & 2810 & -49.1 & -3.3 & 24.2 & 70.5 & 566.6 & 0.00000 & 0.00000 & 1.00000 \\
buy & 1w & 37819 & -12.1 & -2.6 & 0.2 & 2.6 & 13.6 & 0.20319 & 0.10160 & 0.89840 \\
buy & 1m & 37777 & -24.8 & -5.7 & 0.3 & 5.4 & 28.5 & 0.01328 & 0.99336 & 0.00664 \\
buy & 1y & 36984

### Most Common Recommendations

In [1]:
import json
import pandas as pd


def _recommendation_counts(recs, asset_type, names, sentiments):
    """Count recommendations per ticker for one asset type and attach asset names.

    Args:
        recs: recommendation-level frame with "asset_type", "sentiment" and "ticker" columns.
        asset_type: value of "asset_type" to keep (e.g. "stock").
        names: lookup frame with "Code" and "Name" columns.
        sentiments: list of "sentiment" values to keep.

    Returns:
        Frame with columns "count", "pct", "ticker", "Name", ordered by descending count.
        "pct" is each ticker's share (in percent) of the kept recommendations;
        "Name" is missing for tickers without a match in `names` (left join).
    """
    selected = recs[(recs["asset_type"] == asset_type) & recs["sentiment"].isin(sentiments)]
    counts = selected["ticker"].value_counts().reset_index()
    counts["pct"] = counts["count"] / counts["count"].sum() * 100
    counts = counts.merge(names, left_on="ticker", right_on="Code", how="left")
    return counts[["count", "pct", "ticker", "Name"]]


# load extractions data (video-level)
edf = pd.read_csv("../data/matched/VIDEOS_inf_llama3_ft_v4_q8_0_llamacpp_guided.csv", sep=";")

extractions_col = "trade_info_incl_neutrals"
# keep only the extraction column and drop videos without extractions: both the
# literal empty list "[]" and missing cells (json.loads would raise on NaN).
# .copy() makes the result an independent frame so the assignment below is unambiguous.
has_extractions = edf[extractions_col].notna() & (edf[extractions_col] != "[]")
edf = edf.loc[has_extractions, [extractions_col]].copy()
edf[extractions_col] = edf[extractions_col].apply(json.loads)
edf = edf.explode(extractions_col).reset_index(drop=True) # explode rec lists (1 row per rec)
for attr in ["asset_type", "ticker", "sentiment"]:
    edf[attr] = edf[extractions_col].apply(lambda x: pd.NA if pd.isna(x) else x[attr])

# load names and tickers data
nat_path = "../data/asset_data/names_and_tickers_with_price_availability"
nat_stocks = pd.read_csv(f"{nat_path}/stocks.csv", sep=";")[["Code", "Name"]]
# the crypto table is joined on its "Code_clean" column, renamed to match the other tables
nat_cryptos = pd.read_csv(f"{nat_path}/cryptos.csv", sep=";")[["Code_clean", "Name"]].rename(columns={"Code_clean": "Code"})
nat_etfs = pd.read_csv(f"{nat_path}/etfs.csv", sep=";")[["Code", "Name"]]
nat_commodities = pd.read_csv(f"{nat_path}/commodities.csv", sep=";")[["Code", "Name"]]

# get buy recommendation value counts for each asset type and join names
top_n = 25
sents = ["buy"]

all_stocks = _recommendation_counts(edf, "stock", nat_stocks, sents)
all_cryptos = _recommendation_counts(edf, "crypto", nat_cryptos, sents)
all_etfs = _recommendation_counts(edf, "etf", nat_etfs, sents)
all_commodities = _recommendation_counts(edf, "commodity", nat_commodities, sents)

In [4]:
# number of total unique recommended assets
# the asset type is appended to the ticker, so equal tickers of different
# asset types are counted separately
edf["unique_ticker"] = edf["ticker"].add("_").add(edf["asset_type"])
unique_tickers = edf["unique_ticker"].nunique()
print(f"unique tickers: {unique_tickers}")

unique tickers: 5218


In [34]:
# Top-10 table, upper half: stocks and cryptos concatenated horizontally (side by side).
top10table = pd.concat([all_stocks.head(10), all_cryptos.head(10)], axis=1)
# Lower half: ETFs and commodities side by side, stacked underneath the upper half.
top10table = pd.concat([top10table, pd.concat([all_etfs.head(10), all_commodities.head(10)], axis=1)], axis=0)
# format pct columns as strings with two decimals; the label "pct" occurs twice
# after the horizontal concat, so this selects and reformats both columns
top10table["pct"] = top10table["pct"].map(lambda x: f"{x:.2f}")

In [37]:
# Emit the top-10 table as LaTeX without the index column.
print(top10table.to_latex(index=False))

\begin{tabular}{rlllrlll}
\toprule
count & pct & ticker & Name & count & pct & ticker & Name \\
\midrule
1505 & 5.55 & TSLA & Tesla Inc & 2518 & 20.58 & BTC & Bitcoin \\
825 & 3.04 & AAPL & Apple Inc & 1744 & 14.25 & ETH & Ethereum \\
709 & 2.61 & NIO & Nio Inc Class A ADR & 781 & 6.38 & ADA & Cardano \\
468 & 1.72 & AMZN & Amazon.com Inc & 390 & 3.19 & SOL & Solana \\
438 & 1.61 & META & Meta Platforms Inc. & 370 & 3.02 & DOT & Polkadot [IOU] \\
395 & 1.46 & AMC & AMC Entertainment Holdings Inc & 359 & 2.93 & SHIB & Shiba Inu \\
357 & 1.32 & MSFT & Microsoft Corporation & 314 & 2.57 & XRP & XRP \\
353 & 1.30 & PLTR & Palantir Technologies Inc & 272 & 2.22 & LINK & Chainlink \\
344 & 1.27 & BABA & Alibaba Group Holding Ltd & 255 & 2.08 & DOGE & Dogecoin \\
276 & 1.02 & DIS & Walt Disney Company & 190 & 1.55 & MATIC & Polygon \\
194 & 11.28 & SPY & SPDR S&P 500 ETF Trust & 379 & 44.17 & GC=F & Gold \\
73 & 4.24 & VTI & Vanguard Total Stock Market Index Fund ETF Shares & 301 & 35.08 & SI

### Portfolio Stats


##### General Portfolio Stats (for settings: no max pos limit, max holding period 1 month and 1 year)

In [38]:
import pandas as pd

full_df = pd.read_csv("../data/portfolios/portfolio_stats.csv", sep=";")
asset_classes = ["stocks", "cryptos", "etfs", "commodities"]
run_names = ["equal_weight_SPY_hp21_wait1_pos99999", "equal_weight_SPY_hp252_wait1_pos99999"]

# One column of statistics per run; rows are added in display order.
stats = {}
for run_name in run_names:
    df = full_df[full_df["run_name"] == run_name]
    # trade and position counts
    run_stats = {
        "Mean # buy-trades": df["n_buys"].mean(),
        "Median # buy-trades": df["n_buys"].median(),
        "Mean # unique assets in portfolio": df["n_unique_positions"].mean(),
        "Median # unique assets in portfolio": df["n_unique_positions"].median(),
    }
    # share of each asset class in the buys summed over all rows of the run
    for asset_class in asset_classes:
        class_buys = df[f"n_buys_{asset_class}"].sum()
        run_stats[f"Mean % of buys - {asset_class}"] = class_buys / df["n_buys"].sum() * 100
    # share of rows of the run with at least one buy in the asset class
    for asset_class in asset_classes:
        n_with_buy = df[df[f"n_buys_{asset_class}"] > 0].shape[0]
        run_stats[f"% portfolios with at least 1 buy in {asset_class}"] = n_with_buy / df.shape[0] * 100
    stats[run_name] = run_stats
# print as table with two columns (both run names)
stats_df = pd.DataFrame(stats)
print(stats_df.shape)
# print to latex with two decimals
print(stats_df.to_latex(float_format="%.2f"))

(12, 2)
\begin{tabular}{lrr}
\toprule
 & equal_weight_SPY_hp21_wait1_pos99999 & equal_weight_SPY_hp252_wait1_pos99999 \\
\midrule
Mean # buy-trades & 105.55 & 64.45 \\
Median # buy-trades & 32.00 & 24.00 \\
Mean # unique assets in portfolio & 52.61 & 52.61 \\
Median # unique assets in portfolio & 21.00 & 21.00 \\
Mean % of buys - stocks & 70.24 & 72.07 \\
Mean % of buys - cryptos & 21.85 & 19.21 \\
Mean % of buys - etfs & 5.78 & 6.99 \\
Mean % of buys - commodities & 2.13 & 1.74 \\
% portfolios with at least 1 buy in stocks & 81.52 & 81.52 \\
% portfolios with at least 1 buy in cryptos & 65.88 & 65.88 \\
% portfolios with at least 1 buy in etfs & 54.50 & 54.50 \\
% portfolios with at least 1 buy in commodities & 28.91 & 28.91 \\
\bottomrule
\end{tabular}



Returns, Excess Returns, Excess Returns During Active Days (quantiles)

In [23]:
import pandas as pd

# load (non-aggregated) data
df = pd.read_csv("../data/portfolios/portfolio_stats.csv", sep=";")
# rename parameters for display
# raw strings: "\i" is not a valid escape sequence, so a plain "\infty" literal only
# works by accident and triggers a warning on recent Python versions
df["max_positions"] = df["max_positions"].map({99999: r"\infty", 5: "5"})
df["max_holding_period"] = df["max_holding_period"].map({21: "1m", 126: "6m", 252: "1yr", 99999: r"\infty"})
df["neutral_asset"] = df["neutral_asset"].map({"cash": "Cash", "3m_tbills": "T-bills", "SPY": "SPY"})

# statistics computed for every return column, in percent:
# 33%/50%/66% quantiles and the share of strictly positive values
return_stats = [
    ('q33', lambda x: x.quantile(0.33) * 100),
    ('q50', lambda x: x.quantile(0.50) * 100),
    ('q66', lambda x: x.quantile(0.66) * 100),
    ('share_positive', lambda x: (x > 0).mean() * 100),
]
# group and aggregate
grouped = df.groupby(["max_positions", "max_holding_period", "neutral_asset"]).agg({
    "total_return_full": return_stats,
    "total_excess_return_full": return_stats,
    "total_excess_return_active_days": return_stats,
})
# fix colnames: flatten the (column, statistic) pairs to "column_statistic"
grouped.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in grouped.columns.values]
grouped = grouped.reset_index()
# change column order
grouped = grouped[["neutral_asset", "max_positions", "max_holding_period", *grouped.columns[3:]]]
# sort
# custom order for neutral asset: SPY < T-bills < Cash
grouped["neutral_asset"] = pd.Categorical(grouped["neutral_asset"], ["SPY", "T-bills", "Cash"])
# custom order for max_holding_period: 1m < 6m < 1yr < \infty
grouped["max_holding_period"] = pd.Categorical(grouped["max_holding_period"], ["1m", "6m", "1yr", r"\infty"])
# max_positions stays a plain string column and is sorted as such ("5" before "\infty")
grouped = grouped.sort_values(["neutral_asset", "max_positions", "max_holding_period"])

In [24]:
# Display the aggregated return statistics (one row per parameter combination).
grouped

Unnamed: 0,neutral_asset,max_positions,max_holding_period,total_return_full_q33,total_return_full_q50,total_return_full_q66,total_return_full_share_positive,total_excess_return_full_q33,total_excess_return_full_q50,total_excess_return_full_q66,total_excess_return_full_share_positive,total_excess_return_active_days_q33,total_excess_return_active_days_q50,total_excess_return_active_days_q66,total_excess_return_active_days_share_positive
1,SPY,5,1m,120.901627,157.536777,190.129849,91.469194,-51.354169,-14.719019,17.874052,41.706161,-22.082455,-5.376556,8.889275,41.706161
7,SPY,5,6m,84.335172,128.556344,175.85948,89.57346,-87.920625,-43.699452,3.603684,35.545024,-36.996043,-18.976467,1.365701,35.545024
4,SPY,5,1yr,82.22538,123.157857,177.354528,89.099526,-90.030417,-49.097939,5.098731,36.966825,-45.452493,-22.483707,2.703024,36.966825
10,SPY,5,\infty,88.048085,144.14996,180.140387,90.521327,-84.207711,-28.105836,7.88459,37.440758,-51.752812,-17.443141,4.267086,37.440758
13,SPY,\infty,1m,127.459387,159.488554,198.156582,92.417062,-44.79641,-12.767242,25.900785,43.127962,-19.303218,-5.154816,11.188093,43.127962
19,SPY,\infty,6m,98.056645,144.327331,177.678036,91.469194,-74.199151,-27.928466,5.422239,36.492891,-32.004949,-11.950576,2.362491,36.492891
16,SPY,\infty,1yr,89.273953,134.506339,192.378611,90.047393,-82.981844,-37.749458,20.122814,39.810427,-37.550043,-16.427911,9.961757,39.810427
22,SPY,\infty,\infty,117.690564,154.292655,193.543855,94.78673,-54.565233,-17.963141,21.288058,42.180095,-35.884112,-10.913184,13.312045,42.180095
2,T-bills,5,1m,0.103551,20.105207,43.830464,67.298578,-172.152246,-152.15059,-128.425333,11.374408,-22.082455,-5.376556,8.889275,41.706161
8,T-bills,5,6m,-9.432071,19.884209,49.761997,60.663507,-181.687867,-152.371587,-122.4938,9.478673,-36.996043,-18.976467,1.365701,35.545024


In [25]:
# Emit the aggregated table as LaTeX without the index, floats with one decimal place.
print(grouped.to_latex(index=False, float_format="%.1f"))

# TODO: smaller subtable for results section: only show excess returns during active days columns, neutral asset is irrelevant here

\begin{tabular}{lllrrrrrrrrrrrr}
\toprule
neutral_asset & max_positions & max_holding_period & total_return_full_q33 & total_return_full_q50 & total_return_full_q66 & total_return_full_share_positive & total_excess_return_full_q33 & total_excess_return_full_q50 & total_excess_return_full_q66 & total_excess_return_full_share_positive & total_excess_return_active_days_q33 & total_excess_return_active_days_q50 & total_excess_return_active_days_q66 & total_excess_return_active_days_share_positive \\
\midrule
SPY & 5 & 1m & 120.9 & 157.5 & 190.1 & 91.5 & -51.4 & -14.7 & 17.9 & 41.7 & -22.1 & -5.4 & 8.9 & 41.7 \\
SPY & 5 & 6m & 84.3 & 128.6 & 175.9 & 89.6 & -87.9 & -43.7 & 3.6 & 35.5 & -37.0 & -19.0 & 1.4 & 35.5 \\
SPY & 5 & 1yr & 82.2 & 123.2 & 177.4 & 89.1 & -90.0 & -49.1 & 5.1 & 37.0 & -45.5 & -22.5 & 2.7 & 37.0 \\
SPY & 5 & \infty & 88.0 & 144.1 & 180.1 & 90.5 & -84.2 & -28.1 & 7.9 & 37.4 & -51.8 & -17.4 & 4.3 & 37.4 \\
SPY & \infty & 1m & 127.5 & 159.5 & 198.2 & 92.4 & -44.8 & -12.8 & 2

In [26]:
# Re-read the portfolio statistics CSV and list the available columns.
portfolio_stats_path = "../data/portfolios/portfolio_stats.csv"
df = pd.read_csv(portfolio_stats_path, sep=";")
df.columns

Index(['channel_id', 'run_name', 'portfolio_type', 'max_positions',
       'max_holding_period', 'neutral_asset', 'min_days_wait_after_upload',
       'n_days_total_period', 'n_days_active_holdings', 'n_days_active_period',
       'n_buys', 'n_sells', 'n_unique_positions', 'n_buys_stocks',
       'n_buys_cryptos', 'n_buys_etfs', 'n_buys_commodities',
       'total_return_full', 'total_excess_return_full', 'sharpe_full',
       'sortino_full', 'value_at_risk_full', 'max_drawdown_full', 'beta_full',
       'total_return_active_period', 'total_excess_return_active_period',
       'sharpe_active_period', 'sortino_active_period',
       'value_at_risk_active_period', 'max_drawdown_active_period',
       'beta_active_period', 'total_return_active_days',
       'total_excess_return_active_days', 'sharpe_active_days',
       'sortino_active_days', 'value_at_risk_active_days',
       'max_drawdown_active_days', 'beta_active_days'],
      dtype='object')

Performance Metrics (Sharpe, Sortino)

In [40]:
# compute perf. and risk measures for SPY benchmark (full period)
import sys

sys.path.append("../")
from analysis import portfolio_utils as pfu

# load benchmark data, indexed by date
benchmark_path = "../data/asset_data/returns/benchmarks_returns.csv"
benchmark = pd.read_csv(benchmark_path, sep=";")
benchmark = benchmark.set_index("date")
spy, tbills = benchmark["SPY"], benchmark["3m_tbills"]

# performance metrics
spy_sharpe = pfu.sharpe_ratio(spy, tbills)
spy_sortino = pfu.sortino_ratio(spy, tbills, 0)

# risk metrics
# VaR: the first observation is dropped (no return for first day)
spy_without_first_day = spy[1:]
spy_var5 = pfu.value_at_risk(spy_without_first_day, 0.05)
# max drawdown is computed on index values rebuilt from the SPY returns
spy_index_values = (1 + spy).cumprod()
spy_mdd = pfu.max_drawdown(spy_index_values)

In [47]:
# Report the SPY benchmark metrics, three decimals each.
print("SPY benchmark performance and risk metrics:")
for metric_label, metric_value in [
    ("Sharpe ratio", spy_sharpe),
    ("Sortino ratio", spy_sortino),
    ("5% VaR", spy_var5),
    ("Max drawdown", spy_mdd),
]:
    print(f"{metric_label}: {metric_value:.3f}")

SPY benchmark performance and risk metrics:
Sharpe ratio: 0.044
Sortino ratio: 0.067
5% VaR: 0.018
Max drawdown: 0.337


In [42]:
import pandas as pd

# load (non-aggregated) data
df = pd.read_csv("../data/portfolios/portfolio_stats.csv", sep=";")

# rename parameters for display
# raw strings: "\i" is not a valid escape sequence, so a plain "$\infty$" literal only
# works by accident and triggers a warning on recent Python versions
df["max_positions"] = df["max_positions"].map({99999: r"$\infty$", 5: "5"})
df["max_holding_period"] = df["max_holding_period"].map({21: "1m", 126: "6m", 252: "1yr", 99999: r"$\infty$"})
df["neutral_asset"] = df["neutral_asset"].map({"cash": "Cash", "3m_tbills": "T-bills", "SPY": "SPY"})
# group and aggregate
# full-period ratios are also compared against the SPY benchmark values
# (spy_sharpe / spy_sortino must already be defined by the benchmark cell)
grouped = df.groupby(["max_positions", "max_holding_period", "neutral_asset"]).agg({
    "sharpe_full": [
        ('mean', lambda x: x.mean()),
        ('median', lambda x: x.median()),
        ('share_beating_spy', lambda x: (x > spy_sharpe).mean() * 100)
    ],
    "sortino_full": [
        ('mean', lambda x: x.mean()),
        ('median', lambda x: x.median()),
        ('share_beating_spy', lambda x: (x > spy_sortino).mean() * 100)
    ],
    "sharpe_active_days": [
        ('mean', lambda x: x.mean()),
        ('median', lambda x: x.median()),
    ],
    "sortino_active_days": [
        ('mean', lambda x: x.mean()),
        ('median', lambda x: x.median()),
    ],
})

# fix colnames: flatten the (column, statistic) pairs to "column_statistic"
grouped.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in grouped.columns.values]
grouped = grouped.reset_index()
# change column order
grouped = grouped[["neutral_asset", "max_positions", "max_holding_period", *grouped.columns[3:]]]
# sort
# custom order for neutral asset: SPY < T-bills < Cash
grouped["neutral_asset"] = pd.Categorical(grouped["neutral_asset"], ["SPY", "T-bills", "Cash"])
# custom order for max_holding_period: 1m < 6m < 1yr < $\infty$
grouped["max_holding_period"] = pd.Categorical(grouped["max_holding_period"], ["1m", "6m", "1yr", r"$\infty$"])
# max_positions stays a plain string column and is sorted as such ("$\infty$" before "5")
grouped = grouped.sort_values(["neutral_asset", "max_positions", "max_holding_period"])

In [46]:
# Emit the aggregated table as LaTeX without the index, floats with three decimal places.
print(grouped.to_latex(index=False, float_format="%.3f"))

\begin{tabular}{lllrrrrrrrrrr}
\toprule
neutral_asset & max_positions & max_holding_period & sharpe_full_mean & sharpe_full_median & sharpe_full_share_beating_spy & sortino_full_mean & sortino_full_median & sortino_full_share_beating_spy & sharpe_active_days_mean & sharpe_active_days_median & sortino_active_days_mean & sortino_active_days_median \\
\midrule
SPY & 5 & 1m & 0.033 & 0.036 & 25.592 & 0.055 & 0.059 & 32.227 & 0.024 & 0.031 & 0.065 & 0.052 \\
SPY & 5 & 6m & 0.029 & 0.031 & 17.536 & 0.047 & 0.049 & 24.171 & 0.018 & 0.021 & 0.034 & 0.034 \\
SPY & 5 & 1yr & 0.028 & 0.029 & 16.588 & 0.046 & 0.047 & 21.327 & 0.019 & 0.020 & 0.035 & 0.035 \\
SPY & 5 & \infty & 0.030 & 0.030 & 16.588 & 0.049 & 0.049 & 22.275 & 0.024 & 0.025 & 0.043 & 0.043 \\
SPY & \infty & 1m & 0.035 & 0.038 & 27.014 & 0.057 & 0.059 & 33.175 & 0.027 & 0.036 & 0.070 & 0.059 \\
SPY & \infty & 6m & 0.031 & 0.032 & 21.327 & 0.050 & 0.051 & 25.118 & 0.021 & 0.026 & 0.038 & 0.041 \\
SPY & \infty & 1yr & 0.031 & 0.032 & 

Risk Metrics (VaR, MDD, Beta)

In [62]:
import pandas as pd

# load (non-aggregated) data
df = pd.read_csv("../data/portfolios/portfolio_stats.csv", sep=";")

# suffix of the value_at_risk_* / max_drawdown_* columns to aggregate;
# the data also has "full" and "active_period" variants
period = "active_days"

# rename parameters for display
# raw strings: "\i" is not a valid escape sequence, so a plain "$\infty$" literal only
# works by accident and triggers a warning on recent Python versions
df["max_positions"] = df["max_positions"].map({99999: r"$\infty$", 5: "5"})
df["max_holding_period"] = df["max_holding_period"].map({21: "1m", 126: "6m", 252: "1yr", 99999: r"$\infty$"})
df["neutral_asset"] = df["neutral_asset"].map({"cash": "Cash", "3m_tbills": "T-bills", "SPY": "SPY"})

# statistics computed for both risk measures, in percent: mean and selected quantiles.
# share_beating_spy is currently only a "---" placeholder; the disabled computations are
# (x < spy_var5).mean() * 100 for VaR and (x < spy_mdd).mean() * 100 for max drawdown.
risk_stats = [
    ('mean', lambda x: x.mean() * 100),
    ('q10', lambda x: x.quantile(0.10) * 100),
    ('q33', lambda x: x.quantile(0.33) * 100),
    ('q50', lambda x: x.quantile(0.50) * 100),
    ('q66', lambda x: x.quantile(0.66) * 100),
    ('q90', lambda x: x.quantile(0.90) * 100),
    ('share_beating_spy', lambda x: "---"),
]
# group and aggregate
grouped = df.groupby(["max_positions", "max_holding_period", "neutral_asset"]).agg({
    f"value_at_risk_{period}": risk_stats,
    f"max_drawdown_{period}": risk_stats,
})

# fix colnames: flatten the (column, statistic) pairs to "column_statistic"
grouped.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in grouped.columns.values]
grouped = grouped.reset_index()
# change column order
grouped = grouped[["neutral_asset", "max_positions", "max_holding_period", *grouped.columns[3:]]]
# sort
# custom order for neutral asset: SPY < T-bills < Cash
grouped["neutral_asset"] = pd.Categorical(grouped["neutral_asset"], ["SPY", "T-bills", "Cash"])
# custom order for max_holding_period: 1m < 6m < 1yr < $\infty$
grouped["max_holding_period"] = pd.Categorical(grouped["max_holding_period"], ["1m", "6m", "1yr", r"$\infty$"])
# max_positions stays a plain string column and is sorted as such ("$\infty$" before "5")
grouped = grouped.sort_values(["neutral_asset", "max_positions", "max_holding_period"])

In [63]:
# Emit the aggregated table as LaTeX without the index, floats with two decimal places.
print(grouped.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{lllrrrrrrlrrrrrrl}
\toprule
neutral_asset & max_positions & max_holding_period & value_at_risk_active_days_mean & value_at_risk_active_days_q10 & value_at_risk_active_days_q33 & value_at_risk_active_days_q50 & value_at_risk_active_days_q66 & value_at_risk_active_days_q90 & value_at_risk_active_days_share_beating_spy & max_drawdown_active_days_mean & max_drawdown_active_days_q10 & max_drawdown_active_days_q33 & max_drawdown_active_days_q50 & max_drawdown_active_days_q66 & max_drawdown_active_days_q90 & max_drawdown_active_days_share_beating_spy \\
\midrule
SPY & $\infty$ & 1m & 5.70 & 2.08 & 3.80 & 4.67 & 6.43 & 10.30 & --- & 42.39 & 11.15 & 27.76 & 42.05 & 53.81 & 77.02 & --- \\
SPY & $\infty$ & 6m & 5.04 & 2.21 & 3.29 & 4.01 & 5.53 & 9.06 & --- & 55.00 & 22.24 & 41.32 & 54.35 & 66.96 & 87.90 & --- \\
SPY & $\infty$ & 1yr & 4.67 & 2.20 & 3.13 & 3.84 & 5.20 & 8.34 & --- & 57.13 & 25.88 & 44.20 & 56.64 & 68.31 & 88.05 & --- \\
SPY & $\infty$ & $\infty$ & 4.06 & 1.90 & 2.5

In [53]:
# Display the aggregated risk statistics.
# NOTE(review): the stored output below has *_full column names, so it appears to come
# from an earlier run that aggregated the "full" columns — re-run to refresh.
grouped

Unnamed: 0,neutral_asset,max_positions,max_holding_period,value_at_risk_full_mean,value_at_risk_full_q10,value_at_risk_full_q33,value_at_risk_full_q50,value_at_risk_full_q66,value_at_risk_full_q90,value_at_risk_full_share_beating_spy,max_drawdown_full_mean,max_drawdown_full_q10,max_drawdown_full_q33,max_drawdown_full_q50,max_drawdown_full_q66,max_drawdown_full_q90,max_drawdown_full_share_beating_spy
4,SPY,$\infty$,1m,2.265897,1.767529,1.847233,2.07122,2.225051,2.992924,7.109005,49.898488,33.717374,33.923004,46.193325,54.288655,77.620244,5.21327
10,SPY,$\infty$,6m,2.687196,1.797736,2.087089,2.348834,2.69186,4.049699,8.056872,58.03311,33.717374,43.068815,54.638858,68.120346,88.545682,2.843602
7,SPY,$\infty$,1yr,2.886521,1.797736,2.186592,2.509318,2.91335,4.520157,6.635071,59.25324,33.717374,46.091129,57.935959,68.757947,88.772607,2.843602
1,SPY,$\infty$,$\infty$,2.998495,1.764121,2.178602,2.513563,3.04029,4.926952,9.952607,57.490041,33.717374,42.167166,52.010996,67.563026,87.121481,3.317536
16,SPY,5,1m,2.31224,1.767529,1.847251,2.085575,2.279688,3.033272,6.635071,50.756615,33.717374,34.253495,46.984883,56.556134,75.723963,5.21327
22,SPY,5,6m,2.829425,1.842016,2.131049,2.443137,2.956465,4.297351,5.687204,59.54406,33.717374,47.162057,57.618144,70.467116,89.678129,3.791469
19,SPY,5,1yr,3.082683,1.849307,2.270704,2.678604,3.333251,4.727463,5.21327,61.712061,33.717374,49.617663,61.571781,71.458122,91.069962,3.791469
13,SPY,5,$\infty$,3.367317,1.922433,2.424521,3.022811,3.716035,5.251592,5.21327,63.674246,33.717374,51.231142,65.538688,76.441637,90.522647,2.369668
5,T-bills,$\infty$,1m,0.781055,-8.2e-05,-4.1e-05,0.05966877,0.864227,2.101873,84.834123,42.336861,11.771764,27.470188,41.962536,53.471498,76.754761,39.336493
11,T-bills,$\infty$,6m,1.821967,-1.4e-05,1.068694,1.623742,2.177457,3.796965,53.554502,54.985324,22.237226,41.316983,54.348676,66.95771,88.545682,19.905213
