import numpy as np # 数据处理最重要的模块
import pandas as pd # 数据处理最重要的模块
import matplotlib.pyplot as plt  # 画图模块
import scipy.stats as stats # 统计模块
import scipy
from datetime import datetime # 时间模块
from IPython.core.interactiveshell import InteractiveShell # jupyter运行输出的模块
import statsmodels.formula.api as smf  # OLS regression

# Render matplotlib figures inline in the notebook (IPython magic) and emit them as SVG vector graphics
%matplotlib inline 
%config InlineBackend.figure_format = 'svg'

# Echo the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# Lift the cap on the number of displayed rows (currently disabled)
#pd.set_option('display.max_rows',None)

# Lift the cap on the number of displayed columns
pd.set_option('display.max_columns', None)

In [None]:
# Load the daily quotes for index 000001 and index them by trading day.
# The path is a raw string: in a normal literal "\U" starts a Unicode escape
# (a SyntaxError) and "\000" would be read as an octal NUL character.
data = pd.read_csv(r'C:\Users\ignorance\Desktop\Python\数据\000001.csv')
data['Day'] = pd.to_datetime(data['Day'], format='%Y/%m/%d')
data.set_index('Day', inplace=True)
# Sort in place: the previous sort_values() call returned a sorted copy that was
# thrown away, leaving `data` in file order for the date-range slices that follow.
data.sort_index(inplace=True)
data

In [None]:
# Keep 1997-01 through 2022-07 and work on an independent copy.
daily_data = data.loc['1997-01':'2022-07'].copy()
# Coerce both price columns to numeric dtype.
for price_col in ('Close', 'Preclose'):
    daily_data[price_col] = pd.to_numeric(daily_data[price_col])
# Daily return of the 000001 index, in two flavours: simple and logarithmic.
close_px = daily_data['Close']
prev_close_px = daily_data['Preclose']
daily_data['Raw_return'] = close_px / prev_close_px - 1
daily_data['Log_return'] = np.log(close_px) - np.log(prev_close_px)
daily_data

In [None]:
# Monthly returns: daily log returns add up within a month, and the simple
# return is recovered from the summed log return.
monthly_log = daily_data.resample('m')['Log_return'].sum()
Month_data = monthly_log.to_frame()
Month_data['Raw_return'] = np.exp(Month_data['Log_return']) - 1
# Rename the date index from 'Day' to 'month' so it can serve as the merge key.
Month_data = (Month_data.reset_index()
              .rename(columns={'Day': 'month'})
              .set_index('month'))
Month_data

In [None]:
# Quarterly returns: sum the daily log returns per quarter, then convert to a simple return.
quarterly_log = daily_data.resample('Q')['Log_return'].sum()
Quarter_data = quarterly_log.to_frame()
Quarter_data['Raw_return'] = np.exp(Quarter_data['Log_return']) - 1
Quarter_data

In [None]:
# Annual returns: sum the daily log returns per calendar year, then convert to a simple return.
yearly_log = daily_data.resample('Y')['Log_return'].sum()
Year_data = yearly_log.to_frame()
Year_data['Raw_return'] = np.exp(Year_data['Log_return']) - 1
Year_data

In [None]:
# Load the inflation series and index it by month.
# The path is a raw string: in a normal literal "\U" starts a Unicode escape,
# which is a SyntaxError.
inflation = pd.read_csv(r'C:\Users\ignorance\Desktop\Python\inflation.csv')
inflation['month'] = pd.to_datetime(inflation['month'], format='%Y/%m/%d')
inflation.set_index('month', inplace=True)
# Sort in place: the previous sort_values() call returned a sorted copy that
# was thrown away, so `inflation` stayed in file order.
inflation.sort_index(inplace=True)
inflation

月度数据的预测
A simple linear regression of an asset return on one or a few lagged predictors of interest is the most popular econometric approach for testing for return predictability. For simplicity, consider a univariate predictive regression of the period-$(t+1)$ stock market return $r_{t+1}$ on a single predictor variable $x_t$:
$$r_{t+1} = \alpha + \beta x_t + \varepsilon_{t+1}$$
where $\varepsilon_{t+1}$ is a zero-mean, unpredictable disturbance term. When $x_t$ is the inflation rate, dividend yield, book-to-price ratio, or turnover, many researchers find that $\beta$ is significantly different from zero; that is, there is in-sample evidence of stock market return predictability.

$H_0: \beta = 0$
$H_1: \beta \neq 0$（我们需要通过理论分析，得出 $\beta$ 的符号）

波动率

In [None]:
# Monthly market variance (MV): the sum of squared daily simple returns within each month.
squared_return_sum = {'Raw_return': lambda x: sum(x**2)}
market_variance = daily_data.resample('M').apply(squared_return_sum)
# Rename the index to 'month' (the merge key) and the value column to 'MV'.
market_variance = (market_variance.reset_index()
                   .rename(columns={'Day': 'month', 'Raw_return': 'MV'})
                   .set_index('month'))
market_variance

# Equivalent R data.table expression, kept for reference:
# market_variance <- daily_data[,.(MV = sum(Raw_return^2)),by = 'month']

换手率
模型 价量模型
$$r_{t+1} = \alpha + \beta \cdot turnover_t + \varepsilon_{t+1}$$
两个假设：

换手率越高，意味着股票市场的交易越活跃，投资者们不停地在交易股票，也就是说一只股票有很多人在买也有很多人在卖，大家既有人愿意买，也有人愿意卖。这说明大家对同一只股票的看法不一样，市场整体的换手率增加了意味着投资者对于股票市场的判断分歧比较大，这种不确定性的增加有可能就是股票市场预期风险的增加，所以未来的股票的收益率上升。

H1： β>0
换手率越高，意味着股票市场的交易越活跃，这种更加活跃的交易带来的是投资者的热情高涨，越来越多的投资者愿意投身到股票交易中去，股票市场被更多的交易炒热了，股票价格在同一时间上升，这容易造成在同期的股票市场价格被高估。然而市场总是理性的，在未来投资者会发现之前的价格被高估了，从而还会降到正常的水平，所以未来的股票收益率会下降（未来的价格会降下来）。

H2： β<0
换手率的定义如下：
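$$TO_t = \sum_{d=1}^{D_t} \left( \frac{\sum_{i=1}^{N_d} NTS_{i,d,t}}{\sum_{i=1}^{N_d} NOS_{i,d,t}} \right)$$
其中，$NTS_{i,d,t}$ 是股票 $i$ 在 $t$ 月 $d$ 日的交易量，$NOS_{i,d,t}$ 是股票 $i$ 在 $t$ 月 $d$ 日的流通股数。$D_t$ 是 $t$ 月的交易天数，$N_d$ 是 $d$ 日的交易股票数量。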

举例：某一天的交易量为 A 100股、B 200股、C 300股；A 的流通股是1000，B 的流通股是1500，C 的流通股是5000。这一天的换手率是：
$$TO_d = \frac{100+200+300}{1000+1500+5000} = \frac{600}{7500} = 8\%$$
如果第二天是9%，第三天是10%，则这三天累计的月换手率为 (8+9+10)% = 27%。

In [None]:
# Load the stock-level cross-section used to build market turnover.
# The path is a raw string: in a normal literal "\U" starts a Unicode escape,
# which is a SyntaxError.
cross = pd.read_csv(r'C:\Users\ignorance\Desktop\Python\数据\cross_section.csv')
cross['month'] = pd.to_datetime(cross['month'], format='%Y-%m-%d')
cross['to_v'] = pd.to_numeric(cross['to_v'])
cross['floatingvalue'] = pd.to_numeric(cross['floatingvalue'])
# Rows missing either the value or its weight cannot enter the weighted average.
cross = cross.dropna(subset=['to_v', 'floatingvalue'])
cross

In [None]:
# Market turnover per month: the average of 'to_v' weighted by 'floatingvalue'.
# Each observation is keyed by the month-end of its own date (MonthEnd(0) keeps a
# date already on a month end and rolls any other date forward), so the labels come
# from the data. Previously a hard-coded 1992-2021 date_range was pasted over the
# group index, which raises on any length mismatch and silently mislabels every
# later month if one month is missing from the file.
month_end = cross['month'] + pd.offsets.MonthEnd(0)
turnover = pd.DataFrame(cross.groupby(month_end).apply(
    lambda x:
    np.average(x['to_v'], weights=x['floatingvalue'])
))
turnover = turnover.rename(columns={0: 'to'})
turnover.index.name = 'month'  # merge key shared with the other monthly tables
turnover

In [None]:
# Assemble the regression panel: join the monthly returns, market variance,
# inflation and turnover tables on their shared 'month' key.
reg_data = (Month_data
            .merge(market_variance, on='month')
            .merge(inflation, on='month')
            .merge(turnover, on='month'))
reg_data

描述性统计 Summary

In [None]:
# Summary statistics of monthly turnover, rounded to five decimals.
reg_data.loc[:, 'to'].describe().round(5)

In [None]:
# Shape of the turnover distribution: skewness first, then kurtosis.
to_series = reg_data['to']
to_series.skew()
to_series.kurt()

作图

In [None]:
%%time
fig = plt.figure(figsize=(10, 5)) # 图片比例
plt.plot(
    'to',  # 要画图的变量名
    '.-r',  # 线的类型
    linewidth = 1,  # 线的粗细
    data = reg_data['1995-01-01':'2021-12-31'])  # 画图的数据
plt.title("China's Stock Market Turnover") # 画图的标题
plt.xlabel('Month') # 画图的x轴名称
plt.ylabel('Turnover') # 画图的y轴名称

year_freq = pd.date_range(start='1995', end='2022', freq='y')
c = plt.xticks(year_freq, year_freq.year, rotation=90, fontsize=10)

fig.savefig('turnover.pdf', bbox_inches='tight')

In [None]:
%%time
fig = plt.figure(figsize=(10, 5))
ax1 = fig.add_subplot(1, 1, 1)  #(x, x, x)这里前两个表示几*几的网格，最后一个表示第几子图

ax1.plot(reg_data['MV'],
         color='blue',
         marker='.',
         linestyle='-',
         linewidth=1,
         markersize=6,
         alpha=0.4,
         label='Market Variance')
ax1.set_xlabel('month')  # 设置横坐标标签
ax1.set_ylabel('Variance')  # 设置左边纵坐标标签
ax1.legend(loc=2)  # 设置图例在左上方
ax1.set_title("Variance and Turnover: Monthly 1995-2021")  # 给整张图命名

ax2 = ax1.twinx()  #twinx()函数表示共享x轴
ax2.plot(reg_data['to'],
         color='red',
         marker='o',
         linestyle='-',
         linewidth=1,
         markersize=2,
         alpha=0.7,
         label='to')
ax2.set_ylabel('to')  # 设置右边纵坐标标签
ax2.legend(loc=1)  # 设置图例在右上方

year_freq = pd.date_range(start='1997', end='2022', freq='y')
ticks = ax1.set_xticks(year_freq)
labels = ax1.set_xticklabels(year_freq.year, rotation=90, fontsize='small')

fig = plt.gcf()
fig.savefig('mvto.pdf', bbox_inches='tight')

In [None]:
# Predictive regression of market variance on last month's turnover, 2001-2021,
# with HAC standard errors using 6 lags.
reg_data['lto'] = reg_data['to'].shift(1)  # turnover lagged one month
hac_options = {'cov_type': 'HAC', 'cov_kwds': {'maxlags': 6}}
estimation_sample = reg_data['2001-01':'2021-12']
model_fore_mv = smf.ols('MV ~ lto', data=estimation_sample).fit(**hac_options)
print(model_fore_mv.summary())

In [None]:
# Same forecast of market variance, now also controlling for last month's own
# variance; 2001-2021, HAC standard errors using 6 lags.
reg_data['lMV'] = reg_data['MV'].shift(1)  # market variance lagged one month
hac_options = {'cov_type': 'HAC', 'cov_kwds': {'maxlags': 6}}
estimation_sample = reg_data['2001-01':'2021-12']
model_fore_mv = smf.ols('MV ~ lto + lMV', data=estimation_sample).fit(**hac_options)
print(model_fore_mv.summary())

In [None]:
# Store the in-sample fitted variance from the latest model_fore_mv fit; the
# assignment aligns on the date index, so months the model did not use get NaN.
fitted_variance = model_fore_mv.fittedvalues
reg_data['fitted_mv'] = fitted_variance
reg_data

In [None]:
%%time
fig = plt.figure(figsize=(10, 5))

plt.plot(reg_data['MV'],
         color='blue',
         marker='.',
         linestyle='-',
         linewidth=1,
         markersize=6,
         alpha=0.4,
         label='Market Variance')
plt.xlabel('month')  # 设置横坐标标签
plt.ylabel('Variance')  # 设置左边纵坐标标签
#plt.legend(loc=2)  # 设置图例在左上方
plt.title("Variance and Turnover: Monthly 1995-2021")  # 给整张图命名

# ax2 = ax1.twinx()  #twinx()函数表示共享x轴
plt.plot(reg_data['fitted_mv'],
         color='red',
         marker='o',
         linestyle='-',
         linewidth=1,
         markersize=2,
         alpha=0.7,
         label='fitted_mv')
# ax2.set_ylabel('fitted_mv')  # 设置右边纵坐标标签
# ax2.legend(loc=1)  # 设置图例在右上方

year_freq = pd.date_range(start='1995', end='2022', freq='y')

c = plt.xticks(year_freq, year_freq.year, rotation=90, fontsize=10)

fig = plt.gcf()
fig.savefig('mvto2.pdf', bbox_inches='tight')

In [None]:
# Regress the monthly simple return on the fitted market variance, 2001-2021,
# with HAC standard errors using 6 lags.
hac_options = {'cov_type': 'HAC', 'cov_kwds': {'maxlags': 6}}
estimation_sample = reg_data['2001-01':'2021-12']
model_fore_ret = smf.ols('Raw_return ~ fitted_mv', data=estimation_sample).fit(**hac_options)
print(model_fore_ret.summary())

换手率直接预测

In [None]:
# Forecast the monthly simple return directly from last month's turnover,
# 2001-2021, with HAC standard errors using 6 lags.
reg_data['lto'] = reg_data['to'].shift(1)  # turnover lagged one month
hac_options = {'cov_type': 'HAC', 'cov_kwds': {'maxlags': 6}}
estimation_sample = reg_data['2001-01':'2021-12']
model_to = smf.ols('Raw_return ~ lto', data=estimation_sample).fit(**hac_options)
print(model_to.summary())

季度结果

In [None]:
# Display the monthly regression panel before it is aggregated to quarters.
reg_data

In [None]:
# Collapse the monthly panel to quarters: compound the simple returns through
# their logs, and add up the monthly 'to' and 'cpi' values.
quarterly_rules = {
    'Raw_return': lambda x: np.exp(sum(np.log(1 + x))) - 1,
    'to': lambda x: sum(x),
    'cpi': lambda x: sum(x),
}
Qreg_data = reg_data.resample('Q').apply(quarterly_rules)
Qreg_data

In [None]:
# Quarterly forecast of the simple return from last quarter's turnover and cpi,
# 2001-2021, with HAC standard errors using 2 lags.
for source_col, lag_col in (('to', 'lto'), ('cpi', 'lcpi')):
    Qreg_data[lag_col] = Qreg_data[source_col].shift(1)  # lag by one quarter
hac_options = {'cov_type': 'HAC', 'cov_kwds': {'maxlags': 2}}
quarter_sample = Qreg_data['2001-01':'2021-12']
model_to = smf.ols('Raw_return ~ lto + lcpi', data=quarter_sample).fit(**hac_options)
print(model_to.summary())

In [None]:
from statsmodels.iolib.summary2 import summary_col

# Extra row for the table: number of observations used by each model.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

# All three quarterly models share one sample and one HAC setting (2 lags),
# so both are defined once instead of being repeated per fit.
quarter_sample = Qreg_data['2001-01':'2021-12']
hac_options = {'cov_type': 'HAC', 'cov_kwds': {'maxlags': 2}}

model_to = smf.ols('Raw_return ~ lto', data=quarter_sample).fit(**hac_options)
model_cpi = smf.ols('Raw_return ~ lcpi', data=quarter_sample).fit(**hac_options)
model_cpito = smf.ols('Raw_return ~ lto + lcpi', data=quarter_sample).fit(**hac_options)

results_table = summary_col(results=[model_to, model_cpi, model_cpito],
                            float_format='%0.3f',  # three decimals instead of the default
                            stars=True,  # mark significance with stars
                            # The third model combines turnover and inflation; its
                            # label used to read "Quarter CPI & Inflation".
                            model_names=['Quarter Turnover', 'Quarter Inflation',
                                         'Quarter Turnover & Inflation'],
                            info_dict=info_dict,
                            regressor_order=['Intercept', 'lto', 'lcpi'])

results_table.add_title(
    'Table - OLS Regressions: Forecast Quarterly Stock Market Return')

print(results_table)