In [None]:
##import packages

import numpy as np
import pandas as pd

In [None]:
#define a class including all parameters
#本模块将所有可能用到的参数都在程序开头用类的形式加以定义，类名为 Para。

class Para:
    """Container for every tunable parameter used by the notebook.

    All values are class attributes, so they can be read either from the
    class itself or from the shared instance ``para`` created below.
    """
    method = 'SVM'  # model selector; the notebook branches on 'SVM', 'LR' and 'SGD'
    month_in_sample = range(82, 153 + 1)  # months 82..153 (72 months): in-sample data
    month_test = range(154, 230 + 1)  # months 154..230 (77 months): out-of-sample data
    percent_select = [0.3, 0.3]  # fraction of stocks labelled positive (top) / negative (bottom)
    percent_cv = 0.1  # fraction of in-sample rows held out as the validation set
    path_data = '.\\csv_demo\\'  # folder holding one csv file per month
    path_results = '.\\results_demo\\'  # output folder
    seed = 43  # random seed
    svm_kernel = 'linear'  # SVM kernel; other sklearn options: 'poly', 'sigmoid', 'rbf'
    svm_c = 0.01  # SVM penalty coefficient C


# Every other cell reads the parameters through the lowercase name `para`.
# Binding the instance to `Para` instead would shadow the class and leave
# `para` undefined.
para = Para()


In [None]:
#label data
#输入全部样本，选择超额收益最高和最低的部分样本，分别标记为1和0
#再将未被标记的样本剔除， 返回标记完成的样本
def label_data(data, percent_select=None):
    """Label the best and worst performers and drop everything else.

    Rows are sorted by the ``'return'`` column in descending order. The top
    fraction receives ``return_bin = 1``, the bottom fraction receives
    ``return_bin = 0``, and every remaining row is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'return'`` column. The frame passed in is not modified.
    percent_select : sequence of two floats, optional
        ``[top_fraction, bottom_fraction]``. Defaults to ``para.percent_select``.

    Returns
    -------
    pandas.DataFrame
        Labelled rows only, sorted by ``'return'`` descending, with the
        original index labels preserved. Because ``dropna`` is applied to the
        whole frame, rows with a NaN in any other column are removed as well.
    """
    if percent_select is None:
        percent_select = para.percent_select

    # sort_values returns a new frame, so the caller's frame stays untouched
    data = data.sort_values(by='return', ascending=False)
    data['return_bin'] = np.nan
    label_col = data.columns.get_loc('return_bin')

    # the number of selected stocks has to be an integer
    counts = np.around(np.multiply(percent_select, data.shape[0])).astype(int)
    n_top, n_bottom = int(counts[0]), int(counts[1])

    if n_top > 0:
        data.iloc[:n_top, label_col] = 1
    # guard against n_bottom == 0: the slice [-0:] would select every row
    if n_bottom > 0:
        data.iloc[-n_bottom:, label_col] = 0

    # remove the stocks that received no label
    return data.dropna(axis=0)
    

In [None]:
# Generate the in-sample data set.
# Each month's csv is loaded in order, cleaned and labelled; all months are
# then stacked into one DataFrame. The frames are collected in a list and
# concatenated once, because DataFrame.append was removed in pandas 2.0 and
# repeated appending copies the data every iteration.
frames_in_sample = []
for i_month in para.month_in_sample:
    # load csv
    file_name = para.path_data + str(i_month) + '.csv'
    data_curr_month = pd.read_csv(file_name, header=0)
    # raw row count of the most recently loaded month; later cells use it
    # to size the out-of-sample result frames
    para.n_stock = data_curr_month.shape[0]
    # remove rows containing nan
    data_curr_month = data_curr_month.dropna(axis=0)
    # keep only the labelled best / worst performers
    data_curr_month = label_data(data_curr_month)
    frames_in_sample.append(data_curr_month)

# original row labels are kept, exactly as DataFrame.append did
data_in_sample = pd.concat(frames_in_sample)
        


In [None]:
# Data preprocessing.
# The in-sample set is split into a training set and a validation ("cv") set,
# reduced with PCA, and finally standardized. The results are four arrays:
# X_train / y_train (training features and labels) and X_cv / y_cv
# (validation features and labels).
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# features: every column from 'EP' through 'bias' (label-based slice, inclusive)
X_in_sample = data_in_sample.loc[:, 'EP':'bias']
# labels: the binary return_bin column
y_in_sample = data_in_sample.loc[:, 'return_bin']
# For a regression target, use the raw return column instead:
# y_in_sample = data_in_sample.loc[:, 'return']

X_train, X_cv, y_train, y_cv = train_test_split(
    X_in_sample, y_in_sample,
    test_size=para.percent_cv,
    random_state=para.seed,
)

# PCA: a float n_components in (0, 1) keeps as many components as needed to
# explain that share of the variance; a positive integer keeps exactly that
# many components. The model is fitted on the training set only.
pca = decomposition.PCA(n_components=0.95)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_cv = pca.transform(X_cv)

# Standardization, also fitted on the training set only.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_cv = scaler.transform(X_cv)


In [None]:
# Model setup: build the estimator selected by para.method.
if para.method == 'SVM':
    # support vector classifier (the sklearn class is svm.SVC)
    from sklearn import svm
    model = svm.SVC(kernel=para.svm_kernel, C=para.svm_c)
elif para.method == 'LR':
    # linear regression
    from sklearn import linear_model
    model = linear_model.LinearRegression(fit_intercept=True)
elif para.method == 'SGD':
    # linear classifier trained by SGD with hinge loss
    from sklearn import linear_model
    model = linear_model.SGDClassifier(loss='hinge', alpha=0.0001, penalty='l2',
                                       max_iter=5, random_state=para.seed)
else:
    # fail here with a clear message instead of a NameError on `model` later
    raise ValueError('unknown para.method: %r' % (para.method,))
        

In [None]:
# Model training.
# Classifiers ('SVM', 'SGD') expose the same interface: predict() returns the
# hard labels and decision_function() returns the signed score.
if para.method in ('SVM', 'SGD'):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_score_train = model.decision_function(X_train)
    y_pred_cv = model.predict(X_cv)
    y_score_cv = model.decision_function(X_cv)

# Regression model: predict() is used directly as the score.
# No hard labels (y_pred_*) are produced on this branch.
if para.method == 'LR':
    model.fit(X_train, y_train)
    y_score_train = model.predict(X_train)
    y_score_cv = model.predict(X_cv)
    


In [None]:
# Model prediction on the out-of-sample months.
# Three frames hold, per stock (row) and month (column i_month - 1), the true
# return, the predicted label and the decision-function score. Cells that are
# never written stay NaN.
n_months_total = para.month_test[-1]
y_true_test = pd.DataFrame(np.full((para.n_stock, n_months_total), np.nan))
y_pred_test = pd.DataFrame(np.full((para.n_stock, n_months_total), np.nan))
y_score_test = pd.DataFrame(np.full((para.n_stock, n_months_total), np.nan))

# loop over the test months
for i_month in para.month_test:
    file_name = para.path_data + str(i_month) + '.csv'
    data_curr_month = pd.read_csv(file_name, header=0)
    data_curr_month = data_curr_month.dropna(axis=0)  # remove nan
    if data_curr_month.empty:
        # nothing to predict this month; the column stays NaN
        continue

    # features must pass through the same PCA and scaler the model was trained with
    X_curr_month = data_curr_month.loc[:, 'EP':'bias']
    X_curr_month = pca.transform(X_curr_month)
    X_curr_month = scaler.transform(X_curr_month)

    # predict and get the decision function
    if para.method in ('SVM', 'SGD'):
        y_pred_curr_month = model.predict(X_curr_month)
        y_score_curr_month = model.decision_function(X_curr_month)
    elif para.method == 'LR':
        # regression: predict() is the score, there is no hard label
        y_pred_curr_month = None
        y_score_curr_month = model.predict(X_curr_month)
    else:
        raise ValueError('unknown para.method: %r' % (para.method,))

    # Row labels left after dropna are used as row positions.
    # NOTE(review): assumes read_csv produced the default 0..n-1 index - confirm.
    rows = data_curr_month.index.to_numpy()
    n_rows_needed = int(rows.max()) + 1
    if n_rows_needed > y_true_test.shape[0]:
        # this month has more rows than para.n_stock: grow the result frames
        grown_index = range(n_rows_needed)
        y_true_test = y_true_test.reindex(grown_index)
        y_pred_test = y_pred_test.reindex(grown_index)
        y_score_test = y_score_test.reindex(grown_index)

    # save true return, predicted label and score
    col = i_month - 1
    y_true_test.iloc[rows, col] = data_curr_month['return'].to_numpy()
    if y_pred_curr_month is not None:
        y_pred_test.iloc[rows, col] = y_pred_curr_month
    y_score_test.iloc[rows, col] = y_score_curr_month

In [None]:
# Model evaluation: accuracy and AUC on the training set, the validation
# ("cv") set and every month of the test set.
from sklearn import metrics

print('training set, accuracy = %.2f' % metrics.accuracy_score(y_train, y_pred_train))
print('training set, AUC = %.2f' % metrics.roc_auc_score(y_train, y_score_train))
# validation predictions must be scored against the validation labels
print('cv set, accuracy = %.2f' % metrics.accuracy_score(y_cv, y_pred_cv))
print('cv set, AUC = %.2f' % metrics.roc_auc_score(y_cv, y_score_cv))

# evaluate the testing set month by month
for i_month in para.month_test:
    y_true_curr_month = pd.DataFrame({'return': y_true_test.iloc[:, i_month - 1]})
    y_pred_curr_month = y_pred_test.iloc[:, i_month - 1]
    y_score_curr_month = y_score_test.iloc[:, i_month - 1]

    # remove nan
    y_true_curr_month = y_true_curr_month.dropna(axis=0)

    # label the data and keep only the best and worst performers
    y_curr_month = label_data(y_true_curr_month)['return_bin']
    # align predictions and scores with the labelled stocks by index label
    y_pred_curr_month = y_pred_curr_month[y_curr_month.index]
    y_score_curr_month = y_score_curr_month[y_curr_month.index]

    print('test set, month %d, accuracy = %.2f' % (
        i_month, metrics.accuracy_score(y_curr_month, y_pred_curr_month)))
    print('test set, month %d, AUC = %.2f' % (
        i_month, metrics.roc_auc_score(y_curr_month, y_score_curr_month)))
    

In [None]:
# Strategy construction.
# Each test month, the para.n_stock_select stocks with the highest predicted
# score are held with equal weight; the portfolio return is their mean true
# return. The net value is the compounded return.
para.n_stock_select = 100

# One row per month, at index i_month - 1. Float initial values keep both
# columns float-typed, so writing float returns later needs no dtype upcast.
strategy = pd.DataFrame({'return': [0.0] * para.month_test[-1],
                         'value': [1.0] * para.month_test[-1]})

# loop over the test months
for i_month in para.month_test:
    y_true_curr_month = y_true_test.iloc[:, i_month - 1]
    y_score_curr_month = y_score_test.iloc[:, i_month - 1]

    # rank the stocks that have a score, best first, and choose the top ones
    y_score_curr_month = y_score_curr_month.dropna().sort_values(ascending=False)
    index_select = y_score_curr_month.iloc[:para.n_stock_select].index

    # take the average return as the return of the portfolio
    strategy.loc[i_month - 1, 'return'] = np.mean(y_true_curr_month[index_select])

# compute the compound value of the strategy
strategy['value'] = (strategy['return'] + 1).cumprod()

In [None]:
# Strategy evaluation.
import matplotlib.pyplot as plt

# Month i_month is stored in row i_month - 1 of `strategy`, so rows must be
# selected by that offset, not by the month number itself.
rows_test = [i_month - 1 for i_month in para.month_test]

# plot the net value over the test months
plt.plot(list(para.month_test), strategy.loc[rows_test, 'value'], 'r-')
plt.xlabel('month')
plt.ylabel('value')
plt.show()

# evaluation: annualize the monthly figures (12 months per year)
ann_excess_return = np.mean(strategy.loc[rows_test, 'return']) * 12
ann_excess_vol = np.std(strategy.loc[rows_test, 'return']) * np.sqrt(12)
info_ratio = ann_excess_return / ann_excess_vol

# print out
print('annual excess return = %.2f' % ann_excess_return)
print('annual excess volatility = %.2f' % ann_excess_vol)
print('information ratio = %.2f' % info_ratio)

