In [46]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from helper import *
from calculate_delta import *
import sys
from sklearn.covariance import LedoitWolf
import os
from drmv_riskfree import *

#autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
sigma_real = np.load('real_data_sigma.npy')
df=pd.read_csv('df_date.csv')

In [151]:
def sim_mkt_data_highdim(T, num_paths, 
                         sigma, s0, dt=1/2520, seed=1):
    """
    使用联合离散分布，模拟高维市场数据。

    参数:
        T (float): 总模拟时间 (例如，1.0 代表一年)。
        joint_z_vectors (ndarray): 预定义的场景向量，形状为 (m, dim)。
        p_dist (ndarray): 每个场景向量对应的概率，形状为 (m,)。
        num_paths (int): 要模拟的路径数量。
        sigma (ndarray): **波动率矩阵 σ**，形状为 (dim, dim)。
        s0 (float): 初始价格。  
        dt (float): 时间步长。

    返回:
        S (ndarray): 模拟的股价路径，形状 (num_paths, N+1, dim)。
        t_list (ndarray): 时间点列表，形状 (N+1,)。
        b_vectors (ndarray): 为每条路径选择的漂移向量，形状 (num_paths, dim)。
        W (ndarray): 模拟的多维布朗运动，形状 (num_paths, N+1, dim)。
    """
    dim = sigma.shape[0]
    N = int(T / dt)  # 时间步数量
    t_list = np.linspace(0, T, N + 1)
    np.random.seed(seed)
    # --- Bt=B0*(1+np.cos(2*np.pi*rand_k*t)) /2---
    # 抽取 m 个场景的索引
    # num_scenarios = joint_z_vectors.shape[0]
    # scenario_indices = np.arange(num_scenarios)
    # chosen_indices = np.random.choice(scenario_indices, p=p_dist, size=num_paths, replace=True)


    B0=0.1
    rand_k = np.random.normal(10, 30, sigma.shape[0]) # TODO: make k larger so fluctuate weekly or bi-weekly; can change to fixed numbers rather than random
    # generate b_vectors, finally shape is (N, dim)
    b_vectors = np.zeros((N, dim))
    
    # Create meshgrid for proper broadcasting: t (N,) and rand_k (dim,)
    # We use t_list[:-1] to get N time steps (excluding the last one)
    t_mesh, rand_k_mesh = np.meshgrid(t_list[:-1], rand_k, indexing='ij')
    # Now t_mesh and rand_k_mesh both have shape (N, dim)
    b_vectors = B0*(1 + 2*np.cos(2*np.pi*rand_k_mesh*t_mesh))/2

    # --- 2. 模拟多维布朗运动 W ---
    # 生成标准正态分布的增量
    
    normal_increments = np.random.normal(loc=0.0, scale=np.sqrt(dt), size=(num_paths, N, dim))
    
    W = np.zeros((num_paths, N + 1, dim))
    # 通过对增量进行累积求和来构建布朗运动路径
    W[:, 1:, :] = np.cumsum(normal_increments, axis=1)

    # --- 3. 模拟股价路径 S ---
    S = np.zeros((num_paths, N + 1, dim))

    S[:, 0, :] = s0 * np.ones((num_paths, dim))

    for i in range(N):
        # 提取当前状态
        current_S = S[:, i, :]
        
        # 布朗运动的增量 dW
        dW = W[:, i + 1, :] - W[:, i, :]
        
        # --- 计算 SDE 的增量 dS ---
        # 漂移项: b*dt
        drift_term = b_vectors[i] * dt
        
        # 波动率项: σ * dW
        # 使用矩阵乘法 (@)，并对 sigma 进行转置以匹配批量操作的维度
        # (num_paths, dim) @ (dim, dim) -> (num_paths, dim)
        vol_term = dW @ sigma.T
        
        # 逐元素乘法计算 dS
        dS = current_S * (drift_term + vol_term)
        
        # 更新下一时间步的价格
        S[:, i + 1, :] = current_S + dS
    
    # for each path, if any negative items in S, remove this path 
    for i in range(num_paths):
        if np.any(S[i] < 0):
            S = np.delete(S, i, axis=0)
            W = np.delete(W, i, axis=0)
    
    # remove the first row of S
    S = S[:, 1:, :]
    W = W[:, 1:, :]
    b_vectors = b_vectors[1:, :]
    t_list = t_list[1:]
    
    return S, t_list, b_vectors, W

In [152]:
prices_gen, t_list_gen, b_vectors_gen, W_gen = sim_mkt_data_highdim(T=12, num_paths=100, s0=10, sigma=sigma_real/8, dt=1/252)
prices_gen.shape

(100, 3024, 20)

In [157]:
def sim_data_to_df_hour(prices, real_trade_dates, types=120):
    """
    Converts a 2D numpy array of intraday prices into two DataFrames: one for
    daily aggregated data and one for intraday data. Assumes 10 observations per day.
    Removes entire days for a stock if any intraday return is NaN to ensure consistency.

    Args:
        prices (np.ndarray): A 2D numpy array of shape (T, dim), where T is the
                             number of time periods (days * 10) and dim is the
                             number of stocks.
        real_trade_dates (list or array): A list of trade dates.
        types (int): The number of types to cycle through for the 'type' column.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames:
                                           - df_daily: Aggregated daily data.
                                           - df_intraday: Intraday data with an 'hour' column.
    """
    T, dim = prices.shape

    if T % 10 != 0:
        raise ValueError("The total number of time periods (T) must be a multiple of 10.")
    
    num_days = T // 10
    
    if len(real_trade_dates) < num_days:
        raise ValueError("Not enough real_trade_dates for the given price data.")
    
    daily_dates = real_trade_dates[-num_days:]
    
    permnos = range(1, dim + 1)

    # --- Create Intraday DataFrame ---
    df_prc_wide = pd.DataFrame(prices, columns=permnos)
    df_prc_wide['date'] = np.repeat(daily_dates, 10)
    df_prc_wide['hour'] = np.tile(range(1, 11), num_days)
    
    df_intraday = df_prc_wide.melt(id_vars=['date', 'hour'], value_name='prc', var_name='permno')
    
    df_intraday.sort_values(['permno', 'date', 'hour'], inplace=True)
    df_intraday.reset_index(drop=True, inplace=True)
    
    # Calculate returns. The first entry for each permno will be NaN.
    df_intraday['log_ret'] = df_intraday.groupby('permno')['prc'].transform(lambda x: np.log(x / x.shift(1)))
    df_intraday['ret'] = df_intraday.groupby('permno')['prc'].transform(pd.Series.pct_change)
    
    # --- Filter out entire days that contain any NaN returns ---
    # Identify the (permno, date) pairs that have at least one NaN value
    bad_days = df_intraday[df_intraday['log_ret'].isnull()][['permno', 'date']].drop_duplicates()
    
    if not bad_days.empty:
        # Use a merge with an indicator to perform an anti-join, keeping only rows
        # that are not in the 'bad_days' DataFrame.
        df_intraday = df_intraday.merge(bad_days, on=['permno', 'date'], how='left', indicator=True)
        df_intraday = df_intraday[df_intraday['_merge'] == 'left_only'].drop(columns=['_merge'])

    df_intraday.reset_index(drop=True, inplace=True)

    # Add remaining columns now that the data is clean
    df_intraday['type'] = (df_intraday.groupby(['date', 'hour']).ngroup() % types) + 1
    df_intraday['prc_adjusted'] = df_intraday['prc']

    # Reorder columns for the intraday dataframe
    df_intraday = df_intraday[['date', 'hour', 'permno', 'ret', 'prc', 'type', 'prc_adjusted', 'log_ret']]

    # --- Create Daily DataFrame from the cleaned intraday data ---
    daily_groups = df_intraday.groupby(['date', 'permno'])

    df_daily = daily_groups.agg(
        log_ret=('log_ret', 'sum'),
        prc=('prc', 'last')
    ).reset_index()

    # Calculate daily simple return from aggregated log return
    df_daily['ret'] = np.exp(df_daily['log_ret']) - 1
    
    # 'type' in daily data varies only with date
    df_daily['type'] = (df_daily.groupby('date').ngroup() % types) + 1
    df_daily['prc_adjusted'] = df_daily['prc']
    
    # Reorder columns for the daily dataframe
    df_daily = df_daily[['date', 'permno', 'ret', 'prc', 'type', 'prc_adjusted', 'log_ret']]

    return df_daily, df_intraday

In [158]:
def sim_data_to_df(prices, real_trade_dates, types=60):
    """
    Converts a 2D numpy array of prices into a long-format pandas DataFrame.

    Args:
        prices (np.ndarray): A 2D numpy array of shape (T, dim), where T is the
                             number of time periods and dim is the number of stocks.

    Returns:
        pd.DataFrame: A DataFrame with columns: 'date', 'permno', 'ret', and 'prc'.
                      'permno' is the stock identifier, from 1 to dim.
    """
    T, dim = prices.shape
    dates = real_trade_dates[-T:]
    permnos = range(1, dim + 1)

    # Create a wide DataFrame for prices
    df_prc = pd.DataFrame(prices, index=dates, columns=permnos)
    df_prc.index.name = 'date'
    df_prc.columns.name = 'permno'

    # Calculate returns
    df_ret = df_prc.pct_change()

    # Stack prices and returns to convert to long format
    # dropna=False is important to keep all price entries, even with NaN returns for the first day
    s_prc = df_prc.stack(dropna=False).rename('prc')
    s_ret = df_ret.stack(dropna=False).rename('ret')

    # Combine into a single DataFrame, aligning on the (date, permno) index
    df = pd.concat([s_ret, s_prc], axis=1)

    # Reset index to get 'date' and 'permno' as columns
    df = df.reset_index()
    
    # Reorder columns to the desired format
    df = df[['date', 'permno', 'ret', 'prc']]

    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['type'] = (df.groupby('date').ngroup() % types) + 1
    df['prc_adjusted'] = df['prc']
    df['log_ret'] = df.groupby('permno')['prc'].transform(lambda x: np.log(x / x.shift(1)))
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [159]:
sim_df = sim_data_to_df(prices_gen[0], df['date'].unique())

In [160]:
sim_df

Unnamed: 0,date,permno,ret,prc,type,prc_adjusted,log_ret
0,2012-12-28,1,-0.001855,9.961324,2,9.961324,-0.001856
1,2012-12-28,2,0.002454,10.020765,2,10.020765,0.002451
2,2012-12-28,3,0.000673,10.001678,2,10.001678,0.000672
3,2012-12-28,4,-0.000451,10.041058,2,10.041058,-0.000451
4,2012-12-28,5,0.000719,10.028686,2,10.028686,0.000719
...,...,...,...,...,...,...,...
60435,2024-12-31,16,0.001073,20.099963,23,20.099963,0.001073
60436,2024-12-31,17,-0.001238,19.517087,23,19.517087,-0.001239
60437,2024-12-31,18,-0.002310,16.583438,23,16.583438,-0.002313
60438,2024-12-31,19,-0.002434,19.503014,23,19.503014,-0.002437


In [None]:
df_day, df_intraday = sim_data_to_df_hour(prices_gen[0], df['date'].unique())

In [94]:
df_intraday

Unnamed: 0,date,hour,permno,ret,prc,type,prc_adjusted,log_ret
0,2012-12-27,1,1,0.009077,9.815373,1,9.815373,0.009036
1,2012-12-27,2,1,-0.002746,9.788424,2,9.788424,-0.002749
2,2012-12-27,3,1,0.000618,9.794475,3,9.794475,0.000618
3,2012-12-27,4,1,-0.004353,9.751844,4,9.751844,-0.004362
4,2012-12-27,5,1,0.011539,9.864367,5,9.864367,0.011473
...,...,...,...,...,...,...,...,...
604595,2024-12-31,6,20,-0.002554,14.026441,106,14.026441,-0.002557
604596,2024-12-31,7,20,-0.003372,13.979145,107,13.979145,-0.003378
604597,2024-12-31,8,20,0.009203,14.107792,108,14.107792,0.009161
604598,2024-12-31,9,20,-0.000900,14.095090,109,14.095090,-0.000901


In [None]:
def main_sim_new_hour(daily_df, intraday_df, r=0.02, seed=42, start_date='2024-01-01', end_date='2024-12-31',
                 beta=-3, num_stocks=20, plan_time=1/12, dt = 1/2520, sigma_real = sigma_real):
    # Step 1: Load data
    
    df = daily_df
    df_intra = intraday_df
    # Sort by permno and date to ensure proper ordering for log return calculation
    df = df.sort_values(['permno', 'date'])
    df['date'] = pd.to_datetime(df['date'])
    df_intra['date'] = pd.to_datetime(df_intra['date'])
    # Calculate log returns for each stock
    
    
    # Step 1: Find stocks with complete data from 2005-12-31 to 2015-01-01
    initial_start = pd.to_datetime(start_date) - relativedelta(years=10)    
    initial_end = pd.to_datetime(start_date) - pd.Timedelta(days=1)
    
    # Get stocks that have data in the initial period
    initial_period_data = df[(df['date'] >= initial_start) & (df['date'] <= initial_end)]
    
    # Count trading days in the initial period for validation
    total_trading_days = initial_period_data['date'].nunique()
    # print(f"Total trading days in initial period: {total_trading_days}")
    
    # Find stocks with sufficient data coverage (at least 80% of trading days)
    stock_coverage = initial_period_data.groupby('permno')['date'].nunique()
    min_required_days = int(total_trading_days)  # Require at least 80% coverage
    valid_stocks_initial = stock_coverage[stock_coverage >= min_required_days].index.tolist()
    
    # print(f"Stocks with sufficient data in initial period: {len(valid_stocks_initial)}")
    
    # Sample num_stocks stocks from those with complete initial data
    np.random.seed(seed)  # For reproducibility
    selected_stocks = np.sort(np.random.choice(valid_stocks_initial, num_stocks, replace=False))
    
    print(f"Initially selected stocks: {selected_stocks}")
    
    # Step 2: Process monthly data starting from 2015-01-01
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    month_starts = pd.date_range(start=start_date, end=end_date, freq='MS')
    
    current_stocks = selected_stocks.copy()
    kara_wealth_list = [1]
    drbc_wealth_list = [1]
    drmv_wealth_list = [1]
    for i, current_month in enumerate(tqdm(month_starts)):
        # print(f"\nProcessing month {i+1}/{len(month_starts)}: {current_month}")
        
        # Define time windows
        train_start = current_month - relativedelta(years=10)
        train_end = current_month - pd.Timedelta(days=1)
        test_start = current_month
        test_end = month_starts[i+1] - pd.Timedelta(days=1) if i+1 < len(month_starts) else pd.to_datetime(end_date)
        
        # print(f"Previous 10 years: {train_start.date()} to {train_end.date()}")
        # print(f"Next month: {test_start.date()} to {test_end.date()}")
        
        prev_to_next_dates = df[(df['date'] >= train_start) & (df['date'] <= test_end)]['date'].nunique()
        
        # Efficiently find all stocks with 100% coverage using vectorized operations
        # Get data for the entire period (train + test)
        full_period_start = train_start
        full_period_end = test_end
        full_period_data = df[(df['date'] >= full_period_start) & (df['date'] <= full_period_end)]
        full_period_stock_dates = full_period_data.groupby('permno')['date'].nunique()
        all_valid_stocks = full_period_stock_dates[full_period_stock_dates >= prev_to_next_dates].index.values
        
        # Check which current stocks are still valid
        valid_current_stocks = np.intersect1d(current_stocks, all_valid_stocks)
        
        # print(f"Current stocks with 100% coverage: {len(valid_current_stocks)} out of {len(current_stocks)}")
        # print(f"Total stocks available with 100% coverage: {len(all_valid_stocks)}")
        
        # If we need to replace stocks to maintain 20 stocks
        stocks_needed = num_stocks - len(valid_current_stocks)
        
        if stocks_needed > 0:
            # print(f"Need to find {stocks_needed} replacement stocks")
            
            # Find replacement candidates (exclude currently valid stocks)
            replacement_candidates = np.setdiff1d(all_valid_stocks, valid_current_stocks)
            
            # print(f"Available replacement candidates: {len(replacement_candidates)}")
            # add to 20 stocks
            stocks_to_add = np.random.choice(replacement_candidates, stocks_needed, replace=False)
            # Use all available replacements, even if less than needed
            current_stocks = np.sort(np.concatenate([valid_current_stocks, stocks_to_add]))

        else:
            current_stocks = valid_current_stocks
            # print("All current stocks are valid, no replacement needed")
        
        # print(f"Final stock selection for this month: {current_stocks}")
        # print(f"Number of stocks: {len(current_stocks)}")
        
        # Get training data for the selected stocks
        pretrain_data = df[(df['date'] >= train_start) & (df['date'] <= train_end) & 
                          (df['permno'].isin(current_stocks))]
        drmv_weights = run_single_backtest_select_stocks(
            training_data=pretrain_data,
            selected_perms=current_stocks,
            annual_target_return=0.105,
            r=r)
        length = len(pretrain_data) / len(current_stocks)
        prev_sigma_start_dt = (train_start - relativedelta(months=1))
        #prev_sigma_start_dt = (train_start - relativedelta(years=1))
        to_get_B_data_intraday = df_intra[(df_intra['date'] >= prev_sigma_start_dt) & (df_intra['date'] <= train_end) & 
                          (df_intra['permno'].isin(current_stocks))]
        matrix, n_to_average = compute_annualized_matrix_type(to_get_B_data_intraday, sigma_real, dt=dt)
        ret_matrix = pretrain_data.pivot(index='date', columns='permno', values='ret')
        ret_matrix = ret_matrix.fillna(0)
        
       
        # use real sigma matrix (already annualized)
        sigma_mat = sigma_real #np.linalg.cholesky(cov)
        curr_data = df[(df['date'] <= test_end) & (df['date'] >= current_month)&(df['permno'].isin(current_stocks))]
        curr_data_intra = df_intra[(df_intra['date'] <= test_end) & (df_intra['date'] >= current_month)&(df_intra['permno'].isin(current_stocks))]
        # dt = 1/length
        t_list = np.linspace(0, 1, int(1/dt))
        price_st = curr_data.pivot(index='date', columns='permno', values='prc_adjusted').fillna(method='ffill').values
        price_st_intra = curr_data_intra.pivot(index=['date', 'hour'], columns='permno', values='prc_adjusted').fillna(method='ffill').values
        curr_all_ret = price_st[-1] / price_st[0] - 1
        yt = St_to_Yt_vectorized(price_st_intra[np.newaxis, :, :], price_st_intra[0], sigma_mat, r, t_list[1:int(len(curr_data_intra)/num_stocks)+1]) # can be t_list[0:len(curr_data)]
        k= solve_k_with_EL(matrix, r=r, sigma=sigma_mat, T=plan_time, beta=beta, num_y=1000, seed=seed)

        # calculate radius small delta (using 1 year, represents by T=1)
        var = calculate_z_var(T=plan_time, r=r, sigma=sigma_mat, B_support=matrix, p_dist=np.ones(matrix.shape[0])/matrix.shape[0], beta=beta, k=k, seed=seed)
        np.random.seed(seed)
        small_delta_array = (np.random.normal(0, np.sqrt(var), size=100)**2)*(calculate_numerator(plan_time, r, sigma_mat, matrix, np.ones(matrix.shape[0])/matrix.shape[0], beta=beta, k=k, seed=seed)/calculate_denominator(plan_time, r, sigma_mat, matrix, np.ones(matrix.shape[0])/matrix.shape[0], beta=beta, k=k, seed=seed))
        small_delta = np.percentile(small_delta_array, 95)/n_to_average
        
        # calculate delta_B (using 1 year, represents by T=1)
        rng = np.random.default_rng(seed)
        delta_B = compute_big_delta_star(matrix, r, plan_time, beta, small_delta, sigma_mat, rng=rng)
        
        month_r = np.power(1+r, 1/12)-1 
        # add month_r to curr_all_ret for drmv
        curr_ret_for_drmv = np.append(curr_all_ret, month_r)
        daily_kara = 1
        daily_drbc = 1
        granular_r = np.power(1+r, dt)-1
        # last day not trade since no price for next day
        for j in range(1,int((curr_data['date'].nunique()/dt)/252)):
            kara_frac_daily = pi_fraction_exact(t=j*dt, Yt=yt[0][j-1], T=plan_time, alpha=beta, r=r, sigma=sigma_mat,
                        joint_z_vectors=matrix, p_dist=np.ones(matrix.shape[0])/matrix.shape[0],
                        num_expectation_samples=5000, seed=seed)
            drbc_frac_daily = pi_fraction_exact(t=j*dt, Yt=yt[0][j-1], T=plan_time, alpha=beta, r=r, sigma=sigma_mat, 
                        joint_z_vectors=matrix+delta_B, p_dist=np.ones(matrix.shape[0])/matrix.shape[0],
                        num_expectation_samples=5000, seed=seed)
            daily_kara *= (1-kara_frac_daily.sum())*granular_r+np.dot(kara_frac_daily, price_st_intra[j] / price_st_intra[j-1] - 1)+1
            daily_drbc *= (1-drbc_frac_daily.sum())*granular_r+np.dot(drbc_frac_daily, price_st_intra[j] / price_st_intra[j-1] - 1)+1
            
        # kara_wealth_list.append((kara_wealth_list[-1]*(1-kara_frac.sum())*month_r+np.dot(kara_frac, curr_all_ret)+1)*kara_wealth_list[-1])
        # drbc_wealth_list.append((drbc_wealth_list[-1]*(1-drbc_frac.sum())*month_r+np.dot(drbc_frac, curr_all_ret)+1)*drbc_wealth_list[-1])
        drmv_wealth_list.append(drmv_wealth_list[-1]*(1+np.dot(drmv_weights, curr_ret_for_drmv)))
        kara_wealth_list.append(daily_kara*kara_wealth_list[-1])
        drbc_wealth_list.append(daily_drbc*drbc_wealth_list[-1])

    return kara_wealth_list, drbc_wealth_list, drmv_wealth_list

In [None]:
a = main_sim_new_hour(daily_df=df_day, intraday_df=df_intraday)
for i in range(len(a[0])):
    print(a[0][i], a[1][i], a[2][i])

Initially selected stocks: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


  0%|          | 0/12 [00:00<?, ?it/s]

1 1.0149848763024376 1.0145779288733259
2 1.0134495410856965 1.0126736727503591
3 0.986527626002127 0.9855646463749171
4 0.9853557287737111 0.9835940781522544
5 0.981019066152363 0.9792693770556418
6 0.9732489857315383 0.9713476725128296
7 0.9837319846403157 0.9815945786919802
8 0.9913336652958817 0.9914096566805012
9 1.0237748425869286 1.021980046356594
10 1.02674035602294 1.0248978396619939
11 1.0299870465906147 1.0266408309379798
12 1.0438615814812158 1.039677462475838
13 1.0565373932292839 1.0514017763490642
14 1.1116151018203073 1.1057824284027424
15 1.0921539419249182 1.0866742507510452
16 1.0509189207787581 1.0492966597789315
17 1.0819030654162638 1.0806154178571195
18 0.973011825117508 0.9727078951860185
19 1.0235204335583885 1.0233423400057269
20 1.0174396905308574 1.017810749469932
21 0.9333543281422333 0.934866343513519
22 0.9317521577044978 0.9328175411442011
23 0.8993870714714415 0.9008683569698814
24 0.9579135130265823 0.9599829425478549
25 0.9270444772214522 0.9276133657

  0%|          | 0/12 [07:44<?, ?it/s]


KeyboardInterrupt: 

In [161]:
def main_sim_new(input_df, r=0.02, seed=42, start_date='2024-01-01', end_date='2024-12-31',
                 beta=-3, num_stocks=20, plan_time=1/12):
    # Step 1: Load data
    
    df = input_df
    
    # Sort by permno and date to ensure proper ordering for log return calculation
    df = df.sort_values(['permno', 'date'])
    df['date'] = pd.to_datetime(df['date'])
    # Calculate log returns for each stock
    
    
    # Step 1: Find stocks with complete data from 2005-12-31 to 2015-01-01
    initial_start = pd.to_datetime(start_date) - relativedelta(years=10)    
    initial_end = pd.to_datetime(start_date) - pd.Timedelta(days=1)
    
    # Get stocks that have data in the initial period
    initial_period_data = df[(df['date'] >= initial_start) & (df['date'] <= initial_end)]
    
    # Count trading days in the initial period for validation
    total_trading_days = initial_period_data['date'].nunique()
    # print(f"Total trading days in initial period: {total_trading_days}")
    
    # Find stocks with sufficient data coverage (at least 80% of trading days)
    stock_coverage = initial_period_data.groupby('permno')['date'].nunique()
    min_required_days = int(total_trading_days)  # Require at least 80% coverage
    valid_stocks_initial = stock_coverage[stock_coverage >= min_required_days].index.tolist()
    
    # print(f"Stocks with sufficient data in initial period: {len(valid_stocks_initial)}")
    
    # Sample num_stocks stocks from those with complete initial data
    np.random.seed(seed)  # For reproducibility
    selected_stocks = np.sort(np.random.choice(valid_stocks_initial, num_stocks, replace=False))
    
    print(f"Initially selected stocks: {selected_stocks}")
    
    # Step 2: Process monthly data starting from 2015-01-01
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    month_starts = pd.date_range(start=start_date, end=end_date, freq='MS')
    
    current_stocks = selected_stocks.copy()
    kara_wealth_list = [1]
    drbc_wealth_list = [1]
    drmv_wealth_list = [1]
    for i, current_month in enumerate(tqdm(month_starts)):
        # print(f"\nProcessing month {i+1}/{len(month_starts)}: {current_month}")
        
        # Define time windows
        train_start = current_month - relativedelta(years=10)
        train_end = current_month - pd.Timedelta(days=1)
        test_start = current_month
        test_end = month_starts[i+1] - pd.Timedelta(days=1) if i+1 < len(month_starts) else pd.to_datetime(end_date)
        
        # print(f"Previous 10 years: {train_start.date()} to {train_end.date()}")
        # print(f"Next month: {test_start.date()} to {test_end.date()}")
        
        prev_to_next_dates = df[(df['date'] >= train_start) & (df['date'] <= test_end)]['date'].nunique()
        
        # Efficiently find all stocks with 100% coverage using vectorized operations
        # Get data for the entire period (train + test)
        full_period_start = train_start
        full_period_end = test_end
        full_period_data = df[(df['date'] >= full_period_start) & (df['date'] <= full_period_end)]
        full_period_stock_dates = full_period_data.groupby('permno')['date'].nunique()
        all_valid_stocks = full_period_stock_dates[full_period_stock_dates >= prev_to_next_dates].index.values
        
        # Check which current stocks are still valid
        valid_current_stocks = np.intersect1d(current_stocks, all_valid_stocks)
        
        # print(f"Current stocks with 100% coverage: {len(valid_current_stocks)} out of {len(current_stocks)}")
        # print(f"Total stocks available with 100% coverage: {len(all_valid_stocks)}")
        
        # If we need to replace stocks to maintain 20 stocks
        stocks_needed = num_stocks - len(valid_current_stocks)
        
        if stocks_needed > 0:
            # print(f"Need to find {stocks_needed} replacement stocks")
            
            # Find replacement candidates (exclude currently valid stocks)
            replacement_candidates = np.setdiff1d(all_valid_stocks, valid_current_stocks)
            
            # print(f"Available replacement candidates: {len(replacement_candidates)}")
            # add to 20 stocks
            stocks_to_add = np.random.choice(replacement_candidates, stocks_needed, replace=False)
            # Use all available replacements, even if less than needed
            current_stocks = np.sort(np.concatenate([valid_current_stocks, stocks_to_add]))

        else:
            current_stocks = valid_current_stocks
            # print("All current stocks are valid, no replacement needed")
        
        # print(f"Final stock selection for this month: {current_stocks}")
        # print(f"Number of stocks: {len(current_stocks)}")
        
        # Get training data for the selected stocks
        pretrain_data = df[(df['date'] >= train_start) & (df['date'] <= train_end) & 
                          (df['permno'].isin(current_stocks))]
        drmv_weights = run_single_backtest_select_stocks(
            training_data=pretrain_data,
            selected_perms=current_stocks,
            annual_target_return=0.105,
            r=r)
        length = len(pretrain_data) / len(current_stocks)
        prev_sigma_start_dt = (train_start - relativedelta(months=1))
        #prev_sigma_start_dt = (train_start - relativedelta(years=1))
        to_get_B_data = df[(df['date'] >= prev_sigma_start_dt) & (df['date'] <= train_end) & 
                          (df['permno'].isin(current_stocks))]
        matrix, _ = compute_annualized_matrix_type(to_get_B_data, sigma_real)
        ret_matrix = pretrain_data.pivot(index='date', columns='permno', values='ret')
        ret_matrix = ret_matrix.fillna(0)
        
       
        # use real sigma matrix (already annualized)
        sigma_mat = sigma_real #np.linalg.cholesky(cov)
        curr_data = df[(df['date'] <= test_end) & (df['date'] >= current_month)&(df['permno'].isin(current_stocks))]
        dt = 1/length
        t_list = np.linspace(0, 1, 252)
        price_st = curr_data.pivot(index='date', columns='permno', values='prc_adjusted').fillna(method='ffill').values
        curr_all_ret = price_st[-1] / price_st[0] - 1
        yt = St_to_Yt_vectorized(price_st[np.newaxis, :, :], price_st[0], sigma_mat, r, t_list[1:int(len(curr_data)/20)+1]) # can be t_list[0:len(curr_data)]
        k= solve_k_with_EL(matrix, r=r, sigma=sigma_mat, T=plan_time, beta=beta, num_y=1000, seed=seed)

        # calculate radius small delta (using 1 year, represents by T=1)
        var = calculate_z_var(T=plan_time, r=r, sigma=sigma_mat, B_support=matrix, p_dist=np.ones(matrix.shape[0])/matrix.shape[0], beta=beta, k=k)
        small_delta_array = (np.random.normal(0, np.sqrt(var), size=100)**2)*(calculate_numerator(plan_time, r, sigma_mat, matrix, np.ones(matrix.shape[0])/matrix.shape[0], beta=beta, k=k)/calculate_denominator(plan_time, r, sigma_mat, matrix, np.ones(matrix.shape[0])/matrix.shape[0], beta=beta, k=k))
        small_delta = np.percentile(small_delta_array, 95)/40
        
        # calculate delta_B (using 1 year, represents by T=1)
        delta_B = compute_big_delta_star(matrix, r, plan_time, beta, small_delta, sigma_mat)
        
        month_r = np.power(1+r, 1/12)-1 
        # add month_r to curr_all_ret for drmv
        curr_ret_for_drmv = np.append(curr_all_ret, month_r)
        daily_kara = 1
        daily_drbc = 1
        daily_r = np.power(1+r, 1/252)-1
        # last day not trade since no price for next day
        for j in range(1,curr_data['date'].nunique()):
            kara_frac_daily = pi_fraction_exact(t=j/252, Yt=yt[0][j-1], T=plan_time, alpha=beta, r=r, sigma=sigma_mat,
                        joint_z_vectors=matrix, p_dist=np.ones(matrix.shape[0])/matrix.shape[0],
                        num_expectation_samples=5000, seed=seed)
            drbc_frac_daily = pi_fraction_exact(t=j/252, Yt=yt[0][j-1], T=plan_time, alpha=beta, r=r, sigma=sigma_mat, 
                        joint_z_vectors=matrix+delta_B, p_dist=np.ones(matrix.shape[0])/matrix.shape[0],
                        num_expectation_samples=5000, seed=seed)
            daily_kara *= (1-kara_frac_daily.sum())*daily_r+np.dot(kara_frac_daily, price_st[j] / price_st[j-1] - 1)+1
            daily_drbc *= (1-drbc_frac_daily.sum())*daily_r+np.dot(drbc_frac_daily, price_st[j] / price_st[j-1] - 1)+1
            
            
        # kara_wealth_list.append((kara_wealth_list[-1]*(1-kara_frac.sum())*month_r+np.dot(kara_frac, curr_all_ret)+1)*kara_wealth_list[-1])
        # drbc_wealth_list.append((drbc_wealth_list[-1]*(1-drbc_frac.sum())*month_r+np.dot(drbc_frac, curr_all_ret)+1)*drbc_wealth_list[-1])
        drmv_wealth_list.append(drmv_wealth_list[-1]*(1+np.dot(drmv_weights, curr_ret_for_drmv)))
        kara_wealth_list.append(daily_kara*kara_wealth_list[-1])
        drbc_wealth_list.append(daily_drbc*drbc_wealth_list[-1])

    return kara_wealth_list, drbc_wealth_list, drmv_wealth_list

In [162]:
b = main_sim_new(sim_df)

Initially selected stocks: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


100%|██████████| 12/12 [00:37<00:00,  3.09s/it]


In [163]:
for i in range(len(b[0])):
    print(b[0][i], b[1][i], b[2][i])

1 1 1
1.131022826203763 1.1050645848799752 1.012418018656068
1.2144529070425427 1.1601536438012992 1.0257958812660561
1.089108260659705 1.0749891426547524 1.0143646161930415
1.1629329160342259 1.1488161486473039 1.0187401561473046
1.0970245458996184 1.0928991032635385 1.018295694471291
1.1831434925593105 1.170406270711924 1.0303369712956925
1.2940546736530993 1.2671131232395831 1.0356019472961155
1.3611240772687132 1.303262263543781 1.045469085209132
1.5260745591506752 1.4348467258983542 1.0629519000104395
1.531003060006961 1.4401413646006256 1.0690385648874023
1.818748843280903 1.7054870553769472 1.084657178877144
1.545333001829128 1.519724143201071 1.0720840286712676
