In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from const import *     # 导入模拟常量

In [3]:
# Create the test-data directory tree: Test_Data/<day>/ for every simulated day.
# N_days (number of days) and L_days (zero-padded width of the day name) come
# from the star-import of const.
#
# os.makedirs(..., exist_ok=True) replaces the shell `mkdir` calls: it does not
# depend on the platform shell, raises on real failures instead of ignoring the
# exit code, and makes the cell safely re-runnable without re-listing the
# directory on every iteration.
os.makedirs('Test_Data', exist_ok=True)
for t in range(1, N_days + 1):
    s = f'{t:0>{L_days}}'                      # e.g. t=7, L_days=3 -> '007'
    os.makedirs(os.path.join('Test_Data', s), exist_ok=True)

### 股票收益率 R ###

In [5]:
# Stock codes: 'S' followed by the 1-based stock number, zero-padded to
# L_stocks digits (N_stocks codes in total).
stocks = np.array([f'S{i:0>{L_stocks}}' for i in range(1, N_stocks + 1)])

# Seeded generator so that the generated returns are reproducible.
rng = np.random.RandomState(Seed_r)

# os.listdir returns entries in arbitrary order; because the same generator is
# consumed day after day, the order of iteration decides which numbers land in
# which day. Sorting pins that mapping, and filtering on isdir skips stray
# files (e.g. OS metadata) that would make to_csv fail.
day_dirs = sorted(
    name for name in os.listdir('Test_Data/')
    if os.path.isdir(os.path.join('Test_Data', name))
)
for x in day_dirs:
    ret = 200 * (rng.rand(N_stocks,) - 0.5)    # uniform draw in [-100, 100)
    df_ret = pd.DataFrame({'Share_Code': stocks, 'Returns': ret})
    df_ret.to_csv(f'Test_Data/{x}/daily_return.csv', index=False)

### 股票日风险敞口矩阵 M ###

In [5]:
# Daily factor-exposure matrix.
# For every day directory: draw N_factors standard-normal columns, set a small
# random share (< 1%) of each column to NaN to mimic missing data, z-score the
# remaining values, cap everything beyond +/-3 at +/-3, and write factors.csv.
# (A real factor pipeline might drop such outliers, e.g. set them to NaN,
# instead of capping them.)

def zscore(x):
    """Standardize x to zero mean and unit standard deviation, ignoring NaNs."""
    return (x - np.nanmean(x)) / np.nanstd(x)

# Seeded generator so that the generated factors are reproducible.
rng = np.random.RandomState(Seed_f)

# Sort the day directories: os.listdir order is arbitrary, and the generator is
# consumed sequentially, so an unsorted walk would assign different numbers to
# the same day on different machines. Non-directories are skipped.
day_dirs = sorted(
    name for name in os.listdir('Test_Data/')
    if os.path.isdir(os.path.join('Test_Data', name))
)
for x in day_dirs:
    d = {}
    for factor in range(1, N_factors + 1):
        a = rng.randn(N_stocks,)                       # standard-normal factor values
        # Draw the NaN share from the seeded generator too; using the global
        # np.random here would make the output non-deterministic despite the seed.
        n_missing = int(a.size * 0.01 * rng.rand())
        indices = rng.choice(np.arange(a.size), replace=False, size=n_missing)
        a[indices] = np.nan                            # knock out a few values
        d[f'F{factor:0>{L_factors}}'] = zscore(a)      # re-standardize after the knock-out
    df_factors = pd.DataFrame(d)
    df_factors = df_factors.clip(lower=-3, upper=3)    # cap values beyond 3 standard deviations
    df_factors.insert(loc=0, column='Share_Code', value=stocks)   # stock code column first
    df_factors.to_csv(f'Test_Data/{x}/factors.csv', index=False)

In [16]:
# Scratch check of the factor recipe on a single, unseeded sample: build one
# factor frame in memory and look at its summary statistics.

def zscore(x):
    """Standardize x to zero mean and unit standard deviation, ignoring NaNs."""
    centered = x - np.nanmean(x)
    return centered / np.nanstd(x)

columns = {}
for factor in range(1, N_factors + 1):
    values = np.random.randn(N_stocks,)
    # Up to (but not including) 1% of the entries are blanked out.
    n_missing = int(values.size * 0.01 * np.random.random())
    nan_idx = np.random.choice(np.arange(values.size), size=n_missing, replace=False)
    values[nan_idx] = np.nan
    label = 'F' + f'{factor:0>{L_factors}}'
    columns[label] = zscore(values)

df_factors = pd.DataFrame(columns)
# Cap everything outside [-3, 3] at the nearer bound.
df_factors = df_factors.mask(df_factors.abs() > 3, np.sign(df_factors) * 3)
df_factors.insert(loc=0, column='Share_Code', value=stocks)
df_factors.describe()

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10
count,99.0,100.0,96.0,96.0,97.0,100.0,97.0,97.0,97.0,99.0
mean,0.007266,-1.44329e-17,2.312965e-18,-1.387779e-17,0.002003,1.7763570000000002e-17,1.8312960000000002e-17,2.975856e-17,2.1746640000000003e-17,-0.001803
std,0.980219,1.005038,1.005249,1.005249,0.998937,1.005038,1.005195,1.005195,1.005195,0.999475
min,-3.0,-2.063788,-2.263519,-2.128725,-3.0,-2.337863,-2.343948,-2.912731,-2.408971,-2.361425
25%,-0.669543,-0.741555,-0.6334672,-0.6650172,-0.60793,-0.7509929,-0.6171428,-0.8383238,-0.8271429,-0.72578
50%,0.048507,0.03130902,0.03366524,-0.04179805,-0.018446,0.03190516,-0.05843503,0.07419129,-0.01637455,0.000354
75%,0.708325,0.6738379,0.6784727,0.721731,0.722356,0.7378538,0.6495413,0.6004452,0.8418479,0.694133
max,2.148947,2.12617,2.517919,2.260407,2.237725,2.221778,2.586371,2.301895,2.801045,3.0


In [13]:
# Minimal demo of the +/-3 capping on a 100 x 10 frame of standard-normal draws.
raw_draws = np.random.randn(100, 10)
data = pd.DataFrame(raw_draws)
outside = data.abs() > 3                      # cells beyond three standard deviations
data = data.mask(outside, np.sign(data) * 3)  # replace them with -3 or +3
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,-0.107243,-0.051621,0.017062,-0.020395,0.016242,-0.089571,-0.103684,-0.039291,0.084339,0.199428
std,0.900557,0.927093,0.858511,1.014705,1.081345,0.856808,1.175901,0.931416,0.940906,1.121033
min,-2.241267,-3.0,-2.121161,-2.652733,-3.0,-2.402019,-2.913674,-2.141209,-2.163834,-2.483926
25%,-0.687782,-0.678431,-0.674186,-0.78728,-0.581578,-0.665072,-0.906676,-0.695864,-0.541487,-0.430614
50%,-0.049227,-0.118924,0.0214,0.002266,0.039861,-0.060622,0.003365,-0.136766,0.083034,0.116752
75%,0.476657,0.561093,0.667243,0.695678,0.633358,0.533903,0.490146,0.428519,0.624922,0.826822
max,2.012929,3.0,2.049237,2.110887,2.290504,2.117615,3.0,2.396189,2.20293,3.0


### 股票行业分类矩阵 N ###

In [3]:
# Seeded generator so the industry assignment is reproducible.
rng = np.random.RandomState(Seed_i)
# One industry code per stock, drawn uniformly from 0 .. N_industries - 1.
industries = rng.randint(low=0, high=N_industries, size=N_stocks)
industries

array([15,  4,  0, 11,  7, 11, 12,  1,  4, 10,  2,  8, 14,  7,  8,  0,  1,
        4,  4, 10,  8, 15, 14, 15, 11, 18, 10, 13, 10, 11, 13, 15,  1, 10,
       17, 13,  2, 18,  0, 16, 17,  9, 17,  5, 16,  3, 14,  8,  3, 10,  6,
       19, 19, 18,  1,  8, 17, 15,  3,  5, 15, 12, 16,  1, 16,  7, 17,  2,
       16,  1, 14, 16,  2,  4,  9, 19,  8, 13,  6, 10, 11, 15, 18,  1, 14,
        4,  1,  8,  4, 11,  9,  0,  6,  4,  0, 18,  8, 14,  5, 16,  4,  2,
       19, 11,  8,  6, 11,  0,  4,  4, 10, 16, 11,  4,  7,  7, 10, 13, 17,
        6, 17,  9, 17,  2,  7, 19, 17, 13,  9, 10, 13,  4,  5,  3,  7, 13,
       15, 11,  7,  7, 17,  3,  5, 12, 13,  5, 15,  2, 12, 19, 14, 19,  5,
        4, 10, 11,  3,  8,  2, 17, 12,  7, 10, 13, 10, 13, 12,  1,  9, 15,
       10,  3,  0,  1, 17,  0, 18,  3, 17, 12,  9, 15,  7,  6, 13,  5,  8,
        7,  9,  0, 10, 11, 13,  0,  3,  1, 11,  8, 15,  7, 13,  5,  6,  2,
       17,  1,  8, 11, 14, 12, 14,  3,  8, 17, 17,  0,  2, 17,  6, 17,  7,
       12, 17,  6, 12,  4

In [6]:
# One-hot industry membership table: a Share_Code column followed by one
# I<k> column per industry, holding 1.0 where the stock's industry code
# equals k - 1 and 0.0 elsewhere.
d = {'Share_Code': stocks}
for industry in range(1, N_industries + 1):
    column_name = 'I' + f'{industry:0>{L_industries}}'
    is_member = industries == industry - 1          # boolean mask over all stocks
    d[column_name] = is_member.astype(float)
df_industries = pd.DataFrame(d)
df_industries

Unnamed: 0,Share_Code,I01,I02,I03,I04,I05,I06,I07,I08,I09,...,I11,I12,I13,I14,I15,I16,I17,I18,I19,I20
0,S0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,S0002,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,S0003,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,S0004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,S0005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,S0996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,S0997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,S0998,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,S0999,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Write the same industry table into every entry found under Test_Data/.
for day_dir in os.listdir('Test_Data/'):
    target_csv = f'Test_Data/{day_dir}/industries.csv'
    df_industries.to_csv(target_csv, index=False)

### 日投资组合 ###

In [5]:
def scale(w):
    """Convert raw scores into zero-mean weights whose long and short legs each sum to 1.

    The scores are shifted so their mean is 0 and then rescaled so the sum of
    absolute weights is 2 (positive weights sum to +1, negative ones to -1).

    Parameters
    ----------
    w : array-like of numbers
        Raw scores. The input is not modified; integer input is accepted.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``w`` holding the scaled weights.

    Raises
    ------
    ValueError
        If ``w`` is empty or all scores are identical, since the weights
        cannot be rescaled (the previous behaviour was to return NaN/inf).
    """
    # Work on a float copy: the in-place arithmetic below would otherwise
    # overwrite the caller's array and fail outright on integer arrays.
    w = np.array(w, dtype=float)
    if w.size == 0:
        raise ValueError('scale() needs at least one weight')
    w -= w.mean()                    # shift so the weights average to 0
    gross = np.abs(w).sum()          # vectorized; builtin sum() loops in Python
    if gross == 0:
        raise ValueError('scale() cannot normalise identical weights')
    w *= 2 / gross                   # rescale so that long = short = 1
    return w

# Daily portfolio weights: one scaled random weight vector per day directory.
# Seeded generator so that the generated weights are reproducible.
rng = np.random.RandomState(Seed_w)

# os.listdir order is arbitrary and the generator is consumed sequentially, so
# the directories are sorted to pin which draw belongs to which day; entries
# that are not directories are skipped because to_csv would fail on them.
day_dirs = sorted(
    name for name in os.listdir('Test_Data/')
    if os.path.isdir(os.path.join('Test_Data', name))
)
for x in day_dirs:
    w = scale(rng.rand(N_stocks,))             # uniform [0, 1) scores -> scaled weights
    df_weight = pd.DataFrame({'Share_Code': stocks, 'Weight': w})
    df_weight.to_csv(f'Test_Data/{x}/daily_weight.csv', index=False)