# Z 分数

## 安装软件包

In [None]:
import sys

In [None]:
# Install the packages listed in requirements.txt using the interpreter that runs this kernel.
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import cvxpy as cvx
import numpy as np
import pandas as pd
import time
import os
import quiz_helper
import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet and make (14, 8) the default figure size.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

### 数据包

In [None]:
import os
import quiz_helper
from zipline.data import bundles

In [None]:
# Point ZIPLINE_ROOT at the data directory two levels above the working directory.
os.environ['ZIPLINE_ROOT'] = os.path.join(
    os.getcwd(),
    '..',
    '..',
    'data',
    'module_4_quizzes_eod',
)

# Build a csvdir ingest function for the 'daily' data and register it
# under the bundle name exported by quiz_helper.
ingest_func = bundles.csvdir.csvdir_equities(
    ['daily'],
    quiz_helper.EOD_BUNDLE_NAME,
)
bundles.register(quiz_helper.EOD_BUNDLE_NAME, ingest_func)
print('Data Registered')

### 构建管道引擎

In [None]:
from zipline.pipeline import Pipeline
from zipline.pipeline.factors import AverageDollarVolume
from zipline.utils.calendars import get_calendar

# Load the registered bundle and build the pipeline engine on the NYSE calendar.
trading_calendar = get_calendar('NYSE')
bundle_data = bundles.load(quiz_helper.EOD_BUNDLE_NAME)
engine = quiz_helper.build_pipeline_engine(bundle_data, trading_calendar)

# Universe filter: top 500 assets by average dollar volume over a 120-bar window.
universe = AverageDollarVolume(window_length=120).top(500)

### 查看数据

构建管道引擎后，我们获取时段结束时股票池中的股票。我们将使用这些 ticker 生成风险模型的收益率数据。

In [None]:
# The pipeline is run for a single day: start and end are both this date.
universe_end_date = pd.Timestamp('2016-01-05', tz='UTC')

# Run a screen-only pipeline and keep the second index level of the result
# as a plain Python list.
universe_tickers = (
    engine
    .run_pipeline(Pipeline(screen=universe), universe_end_date, universe_end_date)
    .index
    .get_level_values(1)
    .values
    .tolist()
)

universe_tickers

# 获取收益率数据

In [None]:
from zipline.data.data_portal import DataPortal

# Wire a DataPortal to the loaded bundle. Only daily bars are supplied;
# the minute reader is explicitly left unset.
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    equity_minute_reader=None,
    adjustment_reader=bundle_data.adjustment_reader,
)

## 获取股价数据的辅助函数

In [None]:
def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):
    """Fetch a daily history window of ``field`` for ``assets`` ending on ``end_date``.

    Both dates are reduced to their calendar day (``%Y-%m-%d``) and rebuilt as
    midnight-UTC timestamps so they can be looked up in
    ``trading_calendar.closes.index``.

    Parameters
    ----------
    data_portal
        Object exposing ``get_history_window``.
    trading_calendar
        Object whose ``closes.index`` supports ``get_loc``.
    assets
        Passed through unchanged to ``get_history_window``.
    start_date, end_date
        Date-like objects providing ``strftime``.
    field : str, default 'close'
        Passed through unchanged to ``get_history_window``.

    Returns
    -------
    Whatever ``data_portal.get_history_window`` returns for the requested window.

    Notes
    -----
    The lookup via ``get_loc`` is expected to fail (``KeyError`` for a pandas
    index) when either date is not present in ``closes.index``.
    """
    # The original passed `offset='C'` to pd.Timestamp. That keyword was
    # deprecated in favour of `freq` and is rejected with a TypeError by
    # current pandas releases. It does not affect the timestamp value, which
    # is all that is used below, so it is dropped.
    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC')
    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC')

    closes_index = trading_calendar.closes.index
    end_loc = closes_index.get_loc(end_dt)
    start_loc = closes_index.get_loc(start_dt)

    # The window holds (end_loc - start_loc) bars ending at end_dt.
    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')

## 将股价数据放入 dataframe 中

In [None]:
# Pull prices from five years before the universe end date up to that date,
# convert them to period-over-period returns, drop the first row and
# replace any remaining NaN with 0.
returns_df = (
    get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        universe_end_date - pd.DateOffset(years=5),
        universe_end_date,
    )
    .pct_change()[1:]
    .fillna(0)
)

returns_df

## 行业数据辅助函数

我们将为你创建一个对象，它会为每只股票定义一个行业。行业由整数表示。该对象继承自 Classifier 类。参见 [Classifier 文档](https://www.quantopian.com/posts/pipeline-classifiers-are-here)以及 [Classifier 的源代码](https://github.com/quantopian/zipline/blob/master/zipline/pipeline/classifiers/classifier.py)。

In [None]:
from zipline.pipeline.classifiers import Classifier
from zipline.utils.numpy_utils import int64_dtype
class Sector(Classifier):
    """Classifier that labels each asset with an integer sector code.

    The codes are loaded from a NumPy file when the instance is created and
    are looked up by the ``assets`` index array on every compute call.
    """

    # Term configuration: integer output, no lookback window, no inputs,
    # and -1 as the label for masked-out entries.
    missing_value = -1
    inputs = ()
    window_length = 0
    dtype = int64_dtype

    def __init__(self):
        # Array of sector codes, indexed with `assets` in _compute.
        self.data = np.load('../../data/project_4_sector/data.npy')

    def _compute(self, arrays, dates, assets, mask):
        """Return the sector code where ``mask`` is true, ``missing_value`` elsewhere."""
        sector_codes = self.data[assets]
        return np.where(mask, sector_codes, self.missing_value)

In [None]:
# Instantiate the sector classifier; constructing it loads the sector data file.
sector = Sector()

## 我们将使用 2 年的数据计算因子

**注意：** 恰好 2 年前的那一天是休市日。管道引擎无法处理开始日期或结束日期为休市日的情况。为了解决这个问题，我们再往前多推 2 天，那一天市场正常开市。

In [None]:
# Start of the factor window: 2 years and 2 days before the universe end date.
factor_start_date = universe_end_date - pd.DateOffset(years=2, days=2)
factor_start_date

## 查看 zscore 函数

[这是 zscore 文档](https://www.zipline.io/appendix.html#zipline.pipeline.factors.Factor.zscore)，并且摘抄在下面：

In [None]:
Construct a Factor that Z-Scores each day’s results.

The Z-Score of a row is defined as:

(row - row.mean()) / row.stddev()
If mask is supplied, ignore values where mask returns False when computing row means and standard deviations, and output NaN anywhere the mask is False.

If groupby is supplied, compute by partitioning each row based on the values produced by groupby, z-scoring the partitioned arrays, and stitching the sub-results back together.

Parameters:	
mask (zipline.pipeline.Filter, optional) – A Filter defining values to ignore when Z-Scoring.
groupby (zipline.pipeline.Classifier, optional) – A classifier defining partitions over which to compute Z-Scores.
Returns:	
zscored – A Factor producing that z-scores the output of self.

Return type:	
zipline.pipeline.Factor

## 小测验 1
请创建一个一年收益率因子：先按行业去均值，再进行排名，最后转换为 z 分数。

## 答案 1：

In [None]:
from zipline.pipeline.factors import Returns
#TODO
# create a pipeline called p

# create a factor of one year returns, demean by sector, rank, then convert to a z-score

# add the factor to the pipeline


## 可视化管道

In [None]:
# Draw the pipeline's graph as a PNG; `p` must have been defined in the quiz answer cell.
p.show_graph(format='png')

## 运行管道并查看因子数据

In [None]:
# Run pipeline `p` from the factor start date through the universe end date.
df = engine.run_pipeline(p, factor_start_date, universe_end_date)

In [None]:
# Preview the first rows of the pipeline output.
df.head()

## 小测验 2
你注意到因子值有什么特征吗？

## 答案 2：

## 解答 notebook
[解答 notebook](zscore_solution.ipynb)