In [1]:
import pandas as pd
import rpy2 as rpy2
import rpy2.robjects as robjects

from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from Helpers.FunctHelpers import *
from Helpers.PlotHelpers import *

In [2]:
# Download data

tickers = pd.read_excel('Data/SandP500list.xlsx')
tickers = tickers[tickers['sector'] == 'Financials']
names = tickers['symbol'].tolist()

start_date = pd.to_datetime('2022-01-01') # train 12 months
end_date = pd.to_datetime('2023-07-01') # test 6 months

data = download_data(names, start_date, end_date)

[*********************100%***********************]  65 of 65 completed


6 Failed downloads:
['RE', 'BRK.B', 'WLTW', 'FRC', 'PBCT']: Exception('%ticker%: No timezone found, symbol may be delisted')
['SIVB']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")





In [3]:
# Split into train and test data

train_start = pd.to_datetime('2022-01-01')
train_end = pd.to_datetime('2022-12-31')
test_start = pd.to_datetime('2023-01-01')
test_end = pd.to_datetime('2023-07-01')

train_data, test_data = train_test_split(data, train_start, train_end, test_start, test_end)
print(train_data.shape, test_data.shape)

(251, 59) (124, 59)


In [4]:
print(train_data.head())    

                           AFL        AIG         AIZ         AJG         ALL  \
Date                                                                            
2022-01-03 00:00:00  55.144173  54.488396  149.792618  162.668793  111.111961   
2022-01-04 00:00:00  56.951717  56.016994  151.201721  163.371338  113.798851   
2022-01-05 00:00:00  56.857086  55.247948  149.830963  160.307465  113.695503   
2022-01-06 00:00:00  57.907539  56.197392  149.370880  162.054062  114.992004   
2022-01-07 00:00:00  58.825500  57.669022  152.160233  161.351562  118.327141   

                            AMP         AON         AXP        BAC        BEN  \
Date                                                                            
2022-01-03 00:00:00  293.325745  288.268799  162.605362  43.460335  30.134270   
2022-01-04 00:00:00  306.649139  289.281250  167.825394  45.163734  30.864849   
2022-01-05 00:00:00  303.621948  288.032806  166.037048  44.401436  30.540146   
2022-01-06 00:00:00  307.72

In [5]:
# Calculate SSD and select 20 pairs with lowest SSD

train_data_cum = train_data.pct_change().cumsum().dropna(axis=0)

ssd_train = find_ssd(train_data_cum)
pairs = select_lowest_ssd(ssd_train, train_data_cum)

print(f'Choosen pairs: {len(pairs)} from {round(len(data.columns) * (len(data.columns) - 1) / 2)} posssible pairs')

Choosen pairs: 20 from 1711 posssible pairs


In [6]:
#plot_min_ssd_pairs(pairs, train_data_cum)

In [7]:
import pandas as pd
import numpy as np
from scipy.stats import norm, logistic, genextreme

def fit_distributions_and_calculate_aic(data, pairs):
    """
    Fits distributions to stock returns and calculates AIC values for each distribution for each stock in each pair.
    """
    # Calculate daily returns from cumulative returns
    returns = data.pct_change().dropna(axis=0)

    def fit_distribution(rdata, dist_name):
        dist = {'norm': norm, 'logistic': logistic, 'genextreme': genextreme}[dist_name]
        params = dist.fit(rdata)
        nll = -np.sum(dist.logpdf(rdata, *params))
        aic = 2 * len(params) + 2 * nll
        return params, aic

    results = {}
    for stock1, stock2 in pairs:
        results[stock1] = {}
        results[stock2] = {}
        for stock in [stock1, stock2]:
            if stock in returns.columns:
                results[stock] = {}
                for dist_name in ['norm', 'logistic', 'genextreme']:
                    params, aic = fit_distribution(returns[stock], dist_name)
                    results[stock][dist_name] = {'params': params, 'aic': aic}
    return results

# Example stock data and pairs
# Run the fitting function
fit_results = fit_distributions_and_calculate_aic(train_data, pairs)

In [8]:
def transform_returns_to_uniform(data, gev_params):
    """
    Transforms returns to uniform distributions using the GEV distribution CDF.
    """
    returns = data.pct_change().dropna()
    uniform_data = {stock: genextreme.cdf(returns[stock], *params) for stock, params in gev_params.items() if stock in returns}
    return pd.DataFrame(uniform_data)


# Correctly extract GEV parameters
gev_params = {}
for stock, dist_results in fit_results.items():
    if 'genextreme' in dist_results:
        gev_params[stock] = dist_results['genextreme']['params']

uniform_data = transform_returns_to_uniform(train_data, gev_params)
print(uniform_data.head())


        MCO      SPGI      NTRS       USB      FITB       KEY       AFL  \
0  0.493615  0.329498  0.945461  0.533142  0.963320  0.981927  0.976153   
1  0.085225  0.228671  0.404501  0.350469  0.473327  0.427856  0.539379   
2  0.285101  0.236342  0.710681  0.541133  0.960683  0.979205  0.893467   
3  0.292721  0.188646  0.891882  0.460304  0.664538  0.766843  0.865622   
4  0.054466  0.048627  0.464746  0.403473  0.523585  0.517857  0.816139   

        MET       AIG       HIG  ...       TFC         C        GL       ALL  \
0  0.945632  0.889873  0.855227  ...  0.868540  0.699432  0.566059  0.573407   
1  0.410549  0.257497  0.334981  ...  0.209989  0.331805  0.365096  0.367995   
2  0.952345  0.777262  0.815955  ...  0.910285  0.927731  0.488620  0.451736   
3  0.778425  0.874234  0.663914  ...  0.661924  0.775473  0.484266  0.634288   
4  0.607462  0.690205  0.416447  ...  0.464367  0.635974  0.378969  0.279207   

        TRV       AON       MMC        CB       BAC        BK  
0  0

In [13]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
import pandas as pd
import os

def fit_copulas_for_pairs(data, pairs):
    pandas2ri.activate()
    copula = importr('copula')

    results = {}

    ro.r('options(warn=-1)')  # Suppress R warnings
    ro.r('sink("/dev/null")')  # Suppress all R output

    try:
        for stock1, stock2 in pairs:
            if stock1 in data.columns and stock2 in data.columns:
                df_pair = data[[stock1, stock2]].dropna()
                if df_pair.empty:
                    continue

                r_dataframe = pandas2ri.py2rpy(df_pair)
                ro.globalenv['df'] = r_dataframe

                r_script = """
                library(copula)
                tryCatch({
                    gumbelCop = gumbelCopula(dim = 2, dispstr = "un")
                    claytonCop = claytonCopula(dim = 2, dispstr = "un")
                    normalCop = normalCopula(dim = 2, dispstr = "un")
                    fit_gumbel = suppressWarnings(fitCopula(gumbelCop, data=df, method="ml"))
                    fit_clayton = suppressWarnings(fitCopula(claytonCop, data=df, method="ml"))
                    fit_normal = suppressWarnings(fitCopula(normalCop, data=df, method="ml"))
                    list(gumbel = fit_gumbel, clayton = fit_clayton, normal = fit_normal)
                }, error=function(e) list(error=toString(e)))
                """
                result = ro.r(r_script)
                results[(stock1, stock2)] = result

    finally:
        ro.r('sink(NULL)')  # Close the sink
        ro.r('options(warn=0)')  # Restore default warning level

    return results


copula_results = fit_copulas_for_pairs(uniform_data, pairs)


dict_values([<rpy2.robjects.vectors.ListVector object at 0x12e2af450> [19]
R classes: ('list',)
[StrSexpVector]
  error: <class 'rpy2.rinterface_lib.sexp.StrSexpVector'>
  <rpy2.rinterface_lib.sexp.StrSexpVector object at 0x12e21ead0> [16], <rpy2.robjects.vectors.ListVector object at 0x12e2aea90> [19]
R classes: ('list',)
[StrSexpVector]
  error: <class 'rpy2.rinterface_lib.sexp.StrSexpVector'>
  <rpy2.rinterface_lib.sexp.StrSexpVector object at 0x12c33be50> [16], <rpy2.robjects.vectors.ListVector object at 0x12e2ae010> [19]
R classes: ('list',)
[StrSexpVector]
  error: <class 'rpy2.rinterface_lib.sexp.StrSexpVector'>
  <rpy2.rinterface_lib.sexp.StrSexpVector object at 0x12bd62110> [16], <rpy2.robjects.vectors.ListVector object at 0x12e2ad710> [19]
R classes: ('list',)
[StrSexpVector]
  error: <class 'rpy2.rinterface_lib.sexp.StrSexpVector'>
  <rpy2.rinterface_lib.sexp.StrSexpVector object at 0x12cb6c190> [16], <rpy2.robjects.vectors.ListVector object at 0x12e2acb90> [19]
R classes: ('