# Case Study 1

Test whether the number of nonrandom eigenvalues is a good proxy for the number of independent trials.

For each experiment the number of nonrandom eigenvalues should be equal to or greater than the number of independent trials.

The runtime should decrease when the max_scan parameter is added.

In [20]:
# Imports

import time 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from marchenko_pastur import Signal

## Experiment 1: Randomly generated block correlation data

100 trials with 10 and 20 independent trials

250 trials with 10 and 25 independent trials

500 trials with 10 and 50 independent trials
 
1000 trials with 10 and 20 independent trials

### 100 trials with 10 and 20 independent trials

In [22]:
# Simulated dataset for the "100 trials, 10 independent trials" case (per the section heading).
data_100_10 = pd.read_csv('Data/simulated_data_100_10p.csv')

In [23]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_100_10 = Signal(data_100_10)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# Per the notebook intro, nFacts0 is presumably the number of nonrandom
# eigenvalues -- TODO confirm against marchenko_pastur.Signal.
eMax, var, nFacts0 = model_100_10.findMaxEval(data_100_10)

# Hypothesis under test: this should be >= 10, the number of independent trials.
print(nFacts0)

10


In [24]:
# Simulated dataset for the "100 trials, 20 independent trials" case (per the section heading).
data_100_20 = pd.read_csv('Data/simulated_data_100_20p.csv')

In [25]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_100_20 = Signal(data_100_20)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# NOTE: eMax, var and nFacts0 are notebook globals and are overwritten by every
# findMaxEval cell, so they always reflect the most recently run cell.
eMax, var, nFacts0 = model_100_20.findMaxEval(data_100_20)

# Hypothesis under test: this should be >= 20, the number of independent trials.
print(nFacts0)

20


### 250 trials with 10 and 25 independent trials

In [26]:
# Simulated dataset for the "250 trials, 10 independent trials" case (per the section heading).
data_250_10 = pd.read_csv('Data/simulated_data_250_10p.csv')

In [27]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_250_10 = Signal(data_250_10)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# NOTE: eMax, var and nFacts0 are notebook globals and are overwritten by every
# findMaxEval cell, so they always reflect the most recently run cell.
eMax, var, nFacts0 = model_250_10.findMaxEval(data_250_10)

# Hypothesis under test: this should be >= 10, the number of independent trials.
print(nFacts0)

10


In [28]:
# Simulated dataset for the "250 trials, 25 independent trials" case (per the section heading).
# NOTE(review): this file name lacks the 'p' suffix used by most other simulated
# files (e.g. simulated_data_250_10p.csv) -- confirm this is the intended file.
data_250_25 = pd.read_csv('Data/simulated_data_250_25.csv')

In [29]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_250_25 = Signal(data_250_25)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# NOTE: eMax, var and nFacts0 are notebook globals and are overwritten by every
# findMaxEval cell, so they always reflect the most recently run cell.
eMax, var, nFacts0 = model_250_25.findMaxEval(data_250_25)

# Hypothesis under test: this should be >= 25, the number of independent trials.
print(nFacts0)

25


### 500 trials with 10 and 50 independent trials

In [30]:
# Simulated dataset for the "500 trials, 10 independent trials" case (per the section heading).
data_500_10 = pd.read_csv('Data/simulated_data_500_10p.csv')

In [31]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_500_10 = Signal(data_500_10)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# NOTE: eMax, var and nFacts0 are notebook globals and are overwritten by every
# findMaxEval cell, so they always reflect the most recently run cell.
eMax, var, nFacts0 = model_500_10.findMaxEval(data_500_10)

# Hypothesis under test: this should be >= 10, the number of independent trials.
print(nFacts0)

10


In [32]:
# Simulated dataset for the "500 trials, 50 independent trials" case (per the section heading).
data_500_50 = pd.read_csv('Data/simulated_data_500_50p.csv')

In [33]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_500_50 = Signal(data_500_50)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# NOTE: eMax, var and nFacts0 are notebook globals and are overwritten by every
# findMaxEval cell, so they always reflect the most recently run cell.
eMax, var, nFacts0 = model_500_50.findMaxEval(data_500_50)

# Hypothesis under test: this should be >= 50, the number of independent trials.
print(nFacts0)

50


### 1000 trials with 10 and 20 independent trials

In [34]:
# Simulated dataset for the "1000 trials, 10 independent trials" case (per the section heading).
# NOTE(review): this file name lacks the 'p' suffix used by most other simulated
# files (e.g. simulated_data_500_10p.csv) -- confirm this is the intended file.
data_1000_10 = pd.read_csv('Data/simulated_data_1000_10.csv')

In [35]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_1000_10 = Signal(data_1000_10)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# NOTE: eMax, var and nFacts0 are notebook globals and are overwritten by every
# findMaxEval cell, so they always reflect the most recently run cell.
eMax, var, nFacts0 = model_1000_10.findMaxEval(data_1000_10)

# Hypothesis under test: this should be >= 10, the number of independent trials.
print(nFacts0)

10


In [36]:
# Simulated dataset for the "1000 trials, 20 independent trials" case (per the section heading).
# NOTE(review): this file name lacks the 'p' suffix used by most other simulated
# files (e.g. simulated_data_100_20p.csv) -- confirm this is the intended file.
data_1000_20 = pd.read_csv('Data/simulated_data_1000_20.csv')

In [37]:
# Wrap the simulated data in Signal (imported from marchenko_pastur).
model_1000_20 = Signal(data_1000_20)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# NOTE: eMax, var and nFacts0 are notebook globals and are overwritten by every
# findMaxEval cell, so they always reflect the most recently run cell.
eMax, var, nFacts0 = model_1000_20.findMaxEval(data_1000_20)

# Hypothesis under test: this should be >= 20, the number of independent trials.
print(nFacts0)

20


## Experiment 2: S&P500 data

There should be a difference in runtime when using the max_scan parameter as part of the ONC algorithm.

In [21]:
# Load the real strategy data with the first column as a parsed date index,
# replacing missing values with 0 in the same expression so `data` is only
# ever bound to the cleaned frame.
data = (
    pd.read_csv('Data/real_strategy_data.csv', index_col=0, parse_dates=True)
    .fillna(0)
)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1030,1031,1032,1033,1034,1035,1036,1037,1038,1039
2013-03-18,-0.009287,-0.009287,-0.009287,-0.009287,-0.009287,-0.009287,-0.009287,-0.009287,-0.009287,-0.009287,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2013-03-19,0.000000,-0.002909,-0.002909,-0.002909,0.000000,-0.002909,-0.002909,-0.002909,0.000000,-0.002909,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2013-03-20,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2013-03-21,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2013-03-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-16,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,-0.014068,-0.014068,0.0,0.0,-0.014068,-0.014068
2023-02-17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,-0.002988,-0.002988,0.0,0.0,-0.002988,-0.002988
2023-02-21,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,-0.020000,0.0,0.0,0.000000,-0.020000
2023-02-22,-0.001685,-0.001685,-0.001685,-0.001685,-0.001685,-0.001685,-0.001685,-0.001685,-0.001685,-0.001685,...,0.0,0.0,0.0,0.0,0.000000,-0.001685,0.0,0.0,0.000000,-0.001685


### Determine the minimum number of clusters to consider:

In [3]:
# Wrap the real strategy data in Signal (imported from marchenko_pastur).
model_real_data = Signal(data)

# findMaxEval is unpacked into three values; only nFacts0 is printed below.
# Per the notebook intro, nFacts0 is presumably the number of nonrandom
# eigenvalues -- TODO confirm against marchenko_pastur.Signal.
eMax, var, nFacts0 = model_real_data.findMaxEval(data)

# The printed value (41 in the recorded run) is the number hard-coded as
# max_scan in the ONC fit cell further down.
print(nFacts0)

41


### Run ONC with max_scan parameter:

In [4]:
from onc_fast import ONC

We add the max_scan input to incorporate the new information from the spectral decomposition.

In [5]:
import os
# NOTE(review): OMP_NUM_THREADS is set here, after numpy/pandas were already
# imported at the top of the notebook -- confirm it still takes effect for the
# KMeans calls made during the fit.
os.environ["OMP_NUM_THREADS"] = '16'
# Suppress all warnings due to running KMeans on Windows using all cores
import warnings
warnings.filterwarnings("ignore")
import time

# Fit ONC
# perf_counter is a monotonic, high-resolution clock, so the measured duration
# of this multi-hour fit cannot be distorted by system clock adjustments
# (which can affect time.time()).
start_time = time.perf_counter()
# set repeats as large as you can afford
# max_scan=41 is the nFacts0 value printed by the findMaxEval cell on the real data.
model = ONC(data.corr(), repeat=450, max_scan=41)
print('Fitting')
model.fit()
end_time = time.perf_counter()
print("Time taken: ", end_time - start_time)

# Load from saved
# model2 = ONC(data.corr(), repeat=1)
# model2.load_results(pre_fix='official3')

Fitting
Max_scan: 6
5
improving now
Max_scan: 6
3
Time taken:  24451.688906431198


### The silhouette scores

In [6]:
# Display the silhouette scores reported by the fitted ONC model
# (one value per strategy column in the recorded output).
model.get_silh_scores()

0       0.251217
1       0.282623
2       0.281864
3       0.281864
4       0.251217
          ...   
1035    0.345230
1036    0.215230
1037    0.257312
1038    0.288098
1039    0.341168
Length: 1040, dtype: float64

### Check the sorted block correlation matrix

In [7]:
# Display the sorted block correlation matrix reported by the fitted ONC model.
model.get_block_correlation()

Unnamed: 0,560,561,562,563,564,568,572,576,580,584,...,406,407,409,410,411,413,417,421,422,423
560,1.000000,0.642347,0.627133,0.627133,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.117602,0.115987,0.127690,0.116094,0.112681,0.117421,0.147476,0.134558,0.125899,0.125803
561,0.642347,1.000000,0.976286,0.976286,0.642347,0.642347,0.642347,0.642347,0.642347,0.642347,...,0.294141,0.290108,0.282731,0.285459,0.277075,0.259991,0.274889,0.284120,0.282589,0.282374
562,0.627133,0.976286,1.000000,1.000000,0.627133,0.627133,0.627133,0.627133,0.627133,0.627133,...,0.303473,0.299299,0.279607,0.293736,0.285091,0.257162,0.271633,0.285065,0.297832,0.297603
563,0.627133,0.976286,1.000000,1.000000,0.627133,0.627133,0.627133,0.627133,0.627133,0.627133,...,0.303473,0.299299,0.279607,0.293736,0.285091,0.257162,0.271633,0.285065,0.297832,0.297603
564,1.000000,0.642347,0.627133,0.627133,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.117602,0.115987,0.127690,0.116094,0.112681,0.117421,0.147476,0.134558,0.125899,0.125803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.117421,0.259991,0.257162,0.257162,0.117421,0.117421,0.117421,0.117421,0.117421,0.117421,...,0.809902,0.798909,0.919231,0.836249,0.811854,1.000000,0.976046,0.547205,0.512058,0.511681
417,0.147476,0.274889,0.271633,0.271633,0.147476,0.147476,0.147476,0.147476,0.147476,0.147476,...,0.790516,0.779789,0.897221,0.816236,0.792429,0.976046,1.000000,0.534110,0.499806,0.499439
421,0.134558,0.284120,0.285065,0.285065,0.134558,0.134558,0.134558,0.134558,0.134558,0.134558,...,0.585479,0.577498,0.595224,0.541360,0.525518,0.547205,0.534110,1.000000,0.935777,0.935089
422,0.125899,0.282589,0.297832,0.297832,0.125899,0.125899,0.125899,0.125899,0.125899,0.125899,...,0.582963,0.575010,0.556964,0.539020,0.523237,0.512058,0.499806,0.935777,1.000000,0.999264


### Check the cluster statistics

The independent trials should each be evaluated according to the Deflated Sharpe Ratio (DSR).

In [8]:
from sbumt import (weights_min_var, expected_max_sr, 
                   probabilistic_sharpe_ratio, deflated_sharpe_ratio, cluster_statistics)

In [9]:
# Get the clusters
clusters_dict = model.get_optimal_clusters()
print(clusters_dict)

sil_s = model.get_silh_scores()

# Create Table
table_statistics = cluster_statistics(clusters_dict, data)
table_statistics

{0: ['560', '561', '562', '563', '564', '568', '572', '576', '580', '584', '588', '592', '596', '600', '604', '608', '612', '616', '620', '624', '628', '632', '636', '640', '641', '642', '643', '644', '648', '652', '656', '660', '664', '668', '672', '676', '680', '684', '688', '692', '696', '700', '704', '708', '712', '716', '800', '801', '802', '803', '804', '808', '812', '816', '820', '824', '828', '832', '836', '840', '844', '848', '852', '856', '860', '864', '868', '872', '876', '880', '881', '882', '883', '884', '888', '892', '896', '900', '904', '908', '912', '916', '920', '924', '928', '932', '936', '940', '944', '948', '952', '956', '960', '961', '962', '963', '964', '968', '972', '976', '980', '984', '988', '992', '996', '1000', '1004', '1008', '1012', '1016', '1020', '1024', '1028', '1032', '1036'], 1: ['160', '161', '162', '163', '164', '168', '172', '176', '180', '184', '188', '192', '196', '200', '204', '208', '212', '216', '220', '224', '228', '232', '236', '240', '241', 

Unnamed: 0,Strat Count,aSR,SR,Skew,Kurt,T,sqrt(V[SR_k]),E[max SR_k],DFS
0,115.0,0.548064,0.034525,1.2592,70.389292,2412.0,0.019148,0.020146,0.762449
1,118.0,0.503583,0.031723,-3.233266,79.342056,2412.0,0.019148,0.020146,0.704218
2,702.0,0.883966,0.055685,-0.366252,7.770289,2412.0,0.019148,0.020146,0.95756
3,105.0,0.140971,0.00888,-3.493794,58.262329,2412.0,0.019148,0.020146,0.293052


## Run ONC without the max_scan parameter

Here we should see 5 independent trials found, but the runtime is much larger.

In [11]:
from onc import ONC_no_max_scan

In [None]:
import os
# NOTE(review): OMP_NUM_THREADS is set here, after numpy/pandas were already
# imported at the top of the notebook -- confirm it still takes effect for the
# KMeans calls made during the fit.
os.environ["OMP_NUM_THREADS"] = '16'
# Suppress all warnings due to running KMeans on Windows using all cores
import warnings
warnings.filterwarnings("ignore")
import time

# Fit ONC
# perf_counter is a monotonic, high-resolution clock, so the measured duration
# of this long-running fit cannot be distorted by system clock adjustments
# (which can affect time.time()). This is the baseline run without max_scan,
# to be compared against the max_scan run's "Time taken" output.
start_time = time.perf_counter()
model4 = ONC_no_max_scan(data.corr(), repeat=450)
print('Fitting')
model4.fit()
end_time = time.perf_counter()
print("Time taken: ", end_time - start_time)

# Load from saved
# model2 = ONC(data.corr(), repeat=1)
# model2.load_results(pre_fix='official3')

# 