In [3]:
import numpy as np
import pandas as pd
import warnings
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

In [8]:
df_final=pd.read_feather('./China.feather')
characteristics = df_final.columns.to_list()
characteristics.remove("ret-rf")
characteristics.remove("ret-rf-new")
characteristics.remove("permno")
characteristics.remove("date")
print(len(characteristics))
df_final[characteristics+['ret-rf']] = df_final[characteristics+['ret-rf']] .fillna(0) # 参考machine-learning-for-trading/20_autoencoders_for_conditional_risk_factors/06_conditional_autoencoder_for_asset_pricing_model.ipynb中的方式
firm_all=df_final['permno'].unique()
print(len(firm_all))
n_characteristics=len(characteristics)
print(df_final.head())
print(df_final.tail())

6
5123
   permno    date  Opndt  Mopnprc  Mclsprc  Mnshrtrd  Mnvaltrd  Msmvosd  \
0       2  199101    1.0     -1.0     -1.0       0.6      -1.0      1.0   
1       2  199102   -0.2     -1.0     -1.0       1.0       0.6      1.0   
2       2  199103   -1.0     -1.0     -1.0       1.0       0.2      1.0   
3       2  199104    1.0     -1.0     -1.0       0.2      -0.2      1.0   
4       2  199105   -1.0     -1.0     -1.0       1.0       0.6      1.0   

     ret-rf  ret-rf-new  
0  0.036103    0.039972  
1  0.039972   -0.126574  
2 -0.126574   -0.116085  
3 -0.116085   -0.030901  
4 -0.030901   -0.303809  
        permno    date  Opndt   Mopnprc   Mclsprc  Mnshrtrd  Mnvaltrd  \
667375  900957  202212   -1.0 -0.990130 -0.990541 -0.950236 -0.994242   
667376  900957  202301   -1.0 -0.990558 -0.990148 -0.933498 -0.990558   
667377  900957  202302   -1.0 -0.990398 -0.991011 -0.968539 -0.996731   
667378  900957  202303   -1.0 -0.991071 -0.991071 -0.954951 -0.995942   
667379  900957  20230

In [9]:
start_date_train = 199101
end_date_train = 202101
start_date_val = 202101
end_date_val = 202312
df_train = df_final[(df_final['date'] >= start_date_train) & (df_final['date'] < end_date_train)]
df_val = df_final[(df_final['date'] >= start_date_val) & (df_final['date'] < end_date_val)]

In [10]:
pca = PCA(n_components=6)
pca.fit(df_train[characteristics])
reg = LinearRegression()
reg.fit(pca.transform(df_train[characteristics]), df_train['ret-rf'])
output = reg.predict(pca.transform(df_val[characteristics]))
r2 = 1 - ((df_val['ret-rf'].to_numpy() - output) ** 2).sum() / (df_val['ret-rf'].to_numpy() ** 2).sum()
print(r2)

0.4222711774415895


In [16]:
start_date_train = 199101
end_date_train = 202101
start_date_val = 202101
end_date_val = 202312
for n_components in range(1, 7):
    r2s_train = []
    r2s_val = []
    lengths = []
    for _, each_df in df_final.groupby(by='permno'):
        each_df_train = each_df[(each_df['date'] >= start_date_train) & (each_df['date'] < end_date_train)]
        each_df_val = each_df[(each_df['date'] >= start_date_val) & (each_df['date'] < end_date_val)]
        if (len(each_df_train) >= 50 and len(each_df_val) >= 10):
            lengths.append(len(each_df))
            pca = PCA(n_components=n_components)
            pca.fit(each_df_train[characteristics])
            reg = LinearRegression()
            reg.fit(pca.transform(each_df_train[characteristics]), each_df_train['ret-rf'])
            output = reg.predict(pca.transform(each_df_train[characteristics]))
            r2 = 1 - ((each_df_train['ret-rf'].to_numpy() - output) ** 2).sum() / (each_df_train['ret-rf'].to_numpy() ** 2).sum()
            r2s_train.append(r2)
            output = reg.predict(pca.transform(each_df_val[characteristics]))
            r2 = 1 - ((each_df_val['ret-rf'].to_numpy() - output) ** 2).sum() / (each_df_val['ret-rf'].to_numpy() ** 2).sum()
            r2s_val.append(r2)
    print(len(lengths))
    # print(sum(1 for i in lengths if i >= 200))
    print(sum(r2s_train) / len(r2s_train))
    print(sum(r2s_val) / len(r2s_val))

2767
0.0287411153942282
-0.03863195120862436
2767
0.04940180731174667
-0.05827311754690058
2767
0.06662117454083473
-0.10598970143104862
2767
0.08807170420306633
-0.26109315110504683
2767
0.2656744431986707
-0.05762797608742177
2767
0.37645673502075117
0.18452272589328272


In [15]:
start_date_trains = [199101, 200101, 201101]
end_date_trains = [200001, 201001, 202201]
start_date_vals = [200001, 201001, 202201]
end_date_vals = [200101, 201101, 202312]
for n_components in range(1, 7):
    r2s_train = []
    r2s_val = []
    lengths = []
    for _, each_df in df_final.groupby(by='permno'):
        each_df_trains = []
        each_df_vals = []
        for i in range(len(start_date_trains)):
            each_df_trains.append(each_df[(each_df['date'] >= start_date_trains[i]) & (each_df['date'] < end_date_trains[i])])
            each_df_vals.append(each_df[(each_df['date'] >= start_date_vals[i]) & (each_df['date'] < end_date_vals[i])])
        # print(len(each_df_trains))
        # print(len(each_df_vals))
        each_df_train = pd.concat(each_df_trains)
        each_df_val = pd.concat(each_df_vals)
        if (all(len(each_df_train_decade) >= 10 for each_df_train_decade in each_df_trains) and 
            all(len(each_df_val_decade) >= 2 for each_df_val_decade in each_df_vals)):
            # print([len(each_df_train_decade) for each_df_train_decade in each_df_trains])
            # print([len(each_df_val_decade) for each_df_val_decade in each_df_vals])
            lengths.append(len(each_df))
            pca_first_step = PCA(n_components=6)
            pca_first_step.fit(each_df_train[characteristics])
            pcas_second_step = []
            regs = []
            r2_numerator_train = 0; r2_denominator_train = 0; r2_numerator_val = 0; r2_denominator_val = 0
            for each_df_train_decade, each_df_val_decade in zip(each_df_trains, each_df_vals):
                pca_second_step = PCA(n_components=n_components)
                pcas_second_step.append(pca_second_step)
                pca_second_step.fit(pca_first_step.transform(each_df_train_decade[characteristics]))
                reg = LinearRegression()
                regs.append(reg)
                reg.fit(pca_second_step.transform(pca_first_step.transform(each_df_train_decade[characteristics])), each_df_train_decade['ret-rf'])
                output = reg.predict(pca_second_step.transform(pca_first_step.transform(each_df_train_decade[characteristics])))
                r2_numerator_train += ((each_df_train_decade['ret-rf'].to_numpy() - output) ** 2).sum()
                r2_denominator_train += (each_df_train_decade['ret-rf'].to_numpy() ** 2).sum()
                output = reg.predict(pca_second_step.transform(pca_first_step.transform(each_df_val_decade[characteristics])))
                r2_numerator_val += ((each_df_val_decade['ret-rf'].to_numpy() - output) ** 2).sum()
                r2_denominator_val += (each_df_val_decade['ret-rf'].to_numpy() ** 2).sum()
            r2 = 1 - r2_numerator_train / r2_denominator_train
            r2s_train.append(r2)
            r2 = 1 - r2_numerator_val / r2_denominator_val
            r2s_val.append(r2)
    print(len(lengths))
    # print(sum(1 for i in lengths if i >= 200))
    print(sum(r2s_train) / len(r2s_train))
    print(abs(sum(r2s_val) / len(r2s_val)))

736
0.04532934649651339
0.006718646969672622
736
0.07196348773769772
0.09164112357010472
736
0.10761750536975892
0.1791660535084386
736
0.20097681107087378
0.39759548214345175
736
0.35578221142270255
0.266091086804168
736
0.42165592879886127
0.6561138886438577


In [23]:
start_date_trains = [199101, 200101, 201101]
end_date_trains = [200001, 201001, 202201]
start_date_vals = [200001, 201001, 202201]
end_date_vals = [200101, 201101, 202312]
df_alternate = df_final[(df_final['date'] >= start_date_trains[0]) & (df_final['date'] < end_date_vals[-1])]\
    .set_index(['permno', 'date'])[characteristics].stack().unstack(level=0)

In [24]:
df_alternate['X_factor'] = 0
X_factors = []
df_final.fillna(0)
for date, each_df in df_final[(df_final['date'] >= start_date_trains[0]) & (df_final['date'] < end_date_vals[-1])].groupby(by='date'):
    try:
        # X_factor = np.linalg.solve(each_df[characteristics].values.T @ each_df[characteristics].values, each_df[characteristics].values.T @ each_df['ret-rf'].values)
        X_factor = np.linalg.pinv(each_df[characteristics].values.T @ each_df[characteristics].values) @ each_df[characteristics].values.T @ each_df['ret-rf'].values
        # print(X_factor)
    except np.linalg.LinAlgError:
        continue
    s = pd.Series(characteristics, X_factor)
    s['date'] = date
    df_alternate.loc[:, 'X_factor'] = pd.Series(X_factor, pd.MultiIndex.from_tuples([(date, a) for a in characteristics]))
    X_factors.append(pd.Series(X_factor, pd.MultiIndex.from_tuples([(date, a) for a in characteristics])))
    # print(X_factors[-1])
df_alternate['X_factor'] = pd.concat(X_factors)
df_alternate = df_alternate.reset_index().fillna(0)

In [26]:
permnos = df_alternate.columns.to_list()
permnos.remove('X_factor')
permnos.remove('date')
permnos.remove('level_1')
for n_components in range(1, 7):
    r2s_train = []
    r2s_val = []
    lengths = []
    for _, each_df in df_alternate.groupby(by='level_1'):
        each_df_trains = []
        each_df_vals = []
        for i in range(len(start_date_trains)):
            each_df_trains.append(each_df[(each_df['date'] >= start_date_trains[i]) & (each_df['date'] < end_date_trains[i])])
            each_df_vals.append(each_df[(each_df['date'] >= start_date_vals[i]) & (each_df['date'] < end_date_vals[i])])
        # print(len(each_df_trains))
        # print(len(each_df_vals))
        # print(each_df_trains[0])
        each_df_train = pd.concat(each_df_trains)
        each_df_val = pd.concat(each_df_vals)
        # if (all(each_df_train_decade.shape[0] * each_df_train_decade.shape[1] >= 2 * (each_df_train_decade == 0).to_numpy().sum() for each_df_train_decade in each_df_trains) and 
        #     all(each_df_val_decade.shape[0] * each_df_val_decade.shape[1] >= 2 * (each_df_val_decade == 0).to_numpy().sum() for each_df_val_decade in each_df_vals)):
        if (all(len(each_df_train_decade) >= 10 for each_df_train_decade in each_df_trains) and 
            all(len(each_df_val_decade) >= 2 for each_df_val_decade in each_df_vals)):
            # print([len(each_df_train_decade) for each_df_train_decade in each_df_trains])
            # print([len(each_df_val_decade) for each_df_val_decade in each_df_vals])
            lengths.append(len(each_df))
            pca = PCA(n_components=n_components)
            pca.fit(each_df_train[permnos])
            reg = LinearRegression()
            reg.fit(pca.transform(each_df_train[permnos]), each_df_train['X_factor'])
            output = reg.predict(pca.transform(each_df_train[permnos]))
            r2 = 1 - ((each_df_train['X_factor'] - output) ** 2).sum() / (each_df_train['X_factor'] ** 2).sum()
            r2s_train.append(r2)
            output = reg.predict(pca.transform(each_df_val[permnos]))
            r2 = 1 - ((each_df_val['X_factor'] - output) ** 2).sum() / (each_df_val['X_factor'] ** 2).sum()
            r2s_val.append(r2)
    print(len(lengths))
    # print(sum(1 for i in lengths if i >= 200))
    print(sum(r2s_train) / len(r2s_train))
    print(abs(sum(r2s_val) / len(r2s_val)))

6
0.26703909670648346
0.3104276382975009
6
0.2701638374166199
0.2836415798675022
6
0.27930174104161326
0.3285581846111965
6
0.2814585267279462
0.308554133443639
6
0.28182611092055504
0.34259166870880686
6
0.28415591658476563
0.3854151738098688


In [25]:
permnos = df_alternate.columns.to_list()
permnos.remove('X_factor')
permnos.remove('date')
permnos.remove('level_1')
for n_components in range(1, 7):
    r2s_train = []
    r2s_val = []
    lengths = []
    for _, each_df in df_alternate.groupby(by='level_1'):
        each_df_trains = []
        each_df_vals = []
        for i in range(len(start_date_trains)):
            each_df_trains.append(each_df[(each_df['date'] >= start_date_trains[i]) & (each_df['date'] < end_date_trains[i])])
            each_df_vals.append(each_df[(each_df['date'] >= start_date_vals[i]) & (each_df['date'] < end_date_vals[i])])
        # print(len(each_df_trains))
        # print(len(each_df_vals))
        # print(each_df_trains[0])
        each_df_train = pd.concat(each_df_trains)
        each_df_val = pd.concat(each_df_vals)
        # if (all(each_df_train_decade.shape[0] * each_df_train_decade.shape[1] >= 2 * (each_df_train_decade == 0).to_numpy().sum() for each_df_train_decade in each_df_trains) and 
        #     all(each_df_val_decade.shape[0] * each_df_val_decade.shape[1] >= 2 * (each_df_val_decade == 0).to_numpy().sum() for each_df_val_decade in each_df_vals)):
        if (all(len(each_df_train_decade) >= 10 for each_df_train_decade in each_df_trains) and 
            all(len(each_df_val_decade) >= 2 for each_df_val_decade in each_df_vals)):
            # print([len(each_df_train_decade) for each_df_train_decade in each_df_trains])
            # print([len(each_df_val_decade) for each_df_val_decade in each_df_vals])
            lengths.append(len(each_df))
            pca_first_step = PCA(n_components=60)
            pca_first_step.fit(each_df_train[permnos])
            pcas_second_step = []
            regs = []
            r2_numerator_train = 0; r2_denominator_train = 0; r2_numerator_val = 0; r2_denominator_val = 0
            for each_df_train_decade, each_df_val_decade in zip(each_df_trains, each_df_vals):
                pca_second_step = PCA(n_components=n_components)
                pcas_second_step.append(pca_second_step)
                pca_second_step.fit(pca_first_step.transform(each_df_train_decade[permnos]))
                reg = LinearRegression()
                regs.append(reg)
                reg.fit(pca_second_step.transform(pca_first_step.transform(each_df_train_decade[permnos])), each_df_train_decade['X_factor'])
                output = reg.predict(pca_second_step.transform(pca_first_step.transform(each_df_train_decade[permnos])))
                # print(output)
                r2_numerator_train += ((each_df_train_decade['X_factor'].to_numpy() - output) ** 2).sum()
                r2_denominator_train += (each_df_train_decade['X_factor'].to_numpy() ** 2).sum()
                output = reg.predict(pca_second_step.transform(pca_first_step.transform(each_df_val_decade[permnos])))
                r2_numerator_val += ((each_df_val_decade['X_factor'].to_numpy() - output) ** 2).sum()
                r2_denominator_val += (each_df_val_decade['X_factor'].to_numpy() ** 2).sum()
            r2 = 1 - r2_numerator_train / r2_denominator_train
            r2s_train.append(r2)
            r2 = 1 - r2_numerator_val / r2_denominator_val
            r2s_val.append(r2)
    print(len(lengths))
    # print(sum(1 for i in lengths if i >= 200))
    print(sum(r2s_train) / len(r2s_train))
    print(abs(sum(r2s_val) / len(r2s_val)))

6
0.28311549915060463
0.34899589505949025
6
0.29352450364368327
0.2790693238983149
6
0.3017164721836972
0.2435292403013154
6
0.30568851220295423
0.22939874288338222
6
0.3089384620684558
0.34331253822803937
6
0.31066167349859547
0.20090801718779602
