## 第二周：特征工程
__author__ = 'hehuihui@caicloud.io'

In [1]:
from __future__ import division

import numpy as np
import pandas as pd
import talib

# 设置屏幕最大可显示的pandas行数与列数，方便查看数据
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

#### 读取数据

In [2]:
df = pd.read_csv("../data/000002.csv")
df.drop('code', axis=1, inplace=True)

#### a)	前5、10、20、40个交易日内的平均成交量、上涨时的平均成交量、下跌时的平均成交量


In [3]:
# 计算前5、10、20、40个交易日内的平均成交量
for w in [5, 10, 20, 40]:
    column = 'volume-mean-%d' % (w)
    df[column] = df['volume'].rolling(window=w).mean()
# print df

In [4]:
# 计算当天的涨跌幅
pct_change = df['close'].pct_change().fillna(0)

# 计算当天是否为涨，若为涨则取成交量，否则设为nan
condition_up = pct_change.apply(lambda x: 1 if x > 0 else 0)
volume_up = df['volume'] * condition_up

# 计算当天是否为跌，若为跌则取成交量，否则设为nan
condition_down = pct_change.apply(lambda x: 1 if x <= 0 else 0)
volume_down = df['volume'] * condition_down

In [5]:
def get_volume_mean(volume, condition, window):
    ''' 统计前N日上涨或下跌时的平均成交量
    Args:
    -------------------------------------------
        volume: pd.Series, 上涨(或下跌)时的成交量，若当天不是上涨(或下跌)，则成交量为np.nan
        condition: pd.Series, 当天是否为上涨(或下跌)
        window: int, 时间窗口 N=(5, 10, 20, 40)
    Returns:
    -------------------------------------------
        volume_mean: pd.Series, 前N日上涨(或下跌)时的平均成交量
    '''
    # 平均成交量，最开始的N天设为nan
    volume_mean = [np.nan] * window
    # 遍历每一天，计算前N天的平均成交量
    for i in range(window, len(volume)):
        # 前N天内上涨或下跌的天数（注意：num可能为0）
        num = sum(condition[i-window:i])
        # 前N天内上涨或下跌的平均成交量
        value = np.sum(volume[i-window:i]) / max(1, num)
        volume_mean.append(value) 
    return volume_mean


# 计算前5、10、20、40个交易日内，上涨或下跌时的平均成交量
for w in [5, 10, 20, 40]:
    # 上涨时的平均成交量
    column = 'volume-up-mean-%d' % (w)
    df[column] = get_volume_mean(volume_up, condition_up, w)
    
    # 下跌时的平均成交量
    column = 'volume-down-mean-%d' % (w)
    df[column] = get_volume_mean(volume_down, condition_down, w)

# print df

#### b)	前5、10、20、40个交易日区间内的(收盘价-开盘价)/开盘价、(最高价-开盘价) /开盘价、(最低价-开盘价) /开盘价、(最高价-收盘价) /开盘价、(最低价-收盘价) /开盘价


In [6]:
for w in [5, 10, 20, 40]:
    df['close-open-%d' % w] = (df['close'] - df['open']) / df['open']
    df['high-open-%d' % w] = (df['high'] - df['open']) / df['open']
    df['low-open-%d' % w] = (df['low'] - df['open']) / df['open']
    df['high-close-%d' % w] = (df['high'] - df['close']) / df['open']
    df['low-close-%d' % w] = (df['low'] - df['close']) / df['open']

#### c) 前1、3、5、10个交易日内，[(最高价+最低价)-(开盘价+收盘价)] * 成交量


In [7]:
# 此指标衡量的是资金流入与流出之差（近似）
for w in [1, 3, 5, 10]:
    acc_amount = (df['high'] + df['low'] - df['open'] - df['close']) * df['volume']
    df['acc-amount-%d' % w] = acc_amount.rolling(window=w).sum()

#### d) 使用talib生成MACD、KDJ、RSI、EMV指标


In [8]:
# MACD
df['MACD'], df['MACD-signal'], df['MACD-hist'] = \
    talib.MACD(df['close'].values, fastperiod=12, slowperiod=26, signalperiod=9)

In [9]:
# KDJ
K, D = talib.STOCH(df['high'].values, df['low'].values, df['close'].values, 
                       fastk_period=9, slowk_period=3, slowd_period=3)
J = 3*K - 2*D

# KDJ
df['KDJ-K'] = K
df['KDJ-D'] = D
df['KDJ-J'] = J

In [10]:
# RSI
for period in [6, 12, 24]:
    df['RSI-%d' % (period)] = talib.RSI(df['close'].values, timeperiod=period)

In [11]:
# EMV

# 计算公式(https://www.joinquant.com/post/150)
# 1.A=（今日最高+今日最低）/2
#   B=（前日最高+前日最低）/2
#   C=今日最高-今日最低
# 2.EM=（A-B）*C/今日成交额
# 3.EMV=N日内EM的累和

diff = (df['high'] + df['low']) - (df['high'].shift(1) + df['low'].shift(1))
mv = 1e8 * diff * (df['high'] - df['low']) / df['volume']
df['EMV'] = mv.rolling(window=14).mean()
df['MAEMV'] = df['EMV'].rolling(window=9).mean()

## 特征选择

在机器学习的实际应用中，特征数量往往较多，其中可能存在不相关的特征，特征之间也可能存在相互依赖，容易导致：<br/>
- 特征个数越多，分析特征、训练模型所需的时间也就越长
- 特征个数越多，容易引起“维度灾难”，模型也会越复杂，其推广能力会下降。

特征选择能剔除不相关（irrelevant）或亢余（redundant）的特征，从而达到减少特征个数，提高模型精确度，减少运行时间的目的。<br/>
另一方面，选取出真正相关的特征简化了模型，是研究人员易于理解数据产生的过程

In [12]:
# 生成标签
# 注意此处的移位是-1，label是未来一天的收盘价➗当天的收盘价
df['label'] = (df['close'].shift(-1) / df['close']).apply(lambda x: 1 if x > 1 else 0)

In [13]:
# 过滤缺失值
new_df = df.dropna(how='any').reset_index(drop=True)

# 获取特征与标签
X = new_df.iloc[:, 1:-1]
y = new_df.iloc[:, -1]

In [14]:
# 特征归一化
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_transformed = scaler.fit_transform(X)

#### 卡方统计量

In [15]:
# 对于分类问题，可使用的特征选择方法: chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

X_selected = SelectKBest(chi2, k=30).fit_transform(X_transformed, y)
# print X_selected

  if np.rank(self.data) != 1 or np.rank(self.indices) != 1 or np.rank(self.indptr) != 1:
  if np.rank(self.data) != 1 or np.rank(self.row) != 1 or np.rank(self.col) != 1:


#### Pearson相关系数

In [16]:
pearson_list = list()
for column in X.columns:
    pearson = new_df['label'].corr(new_df[column])
    pearson_list.append([column, pearson])

pd.DataFrame(pearson_list).head()

Unnamed: 0,0,1
0,open,-0.03247
1,close,-0.031257
2,high,-0.03228
3,low,-0.031061
4,volume,-0.01578


#### Lasso L1正则化

In [17]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.001)
lasso.fit(X_transformed, y)

pd.DataFrame(zip(X.columns, lasso.coef_))

Unnamed: 0,0,1
0,open,-0.02140302
1,close,-0.0
2,high,-0.0
3,low,-0.0
4,volume,-0.0
5,volume-mean-5,-0.0
6,volume-mean-10,0.0
7,volume-mean-20,0.0
8,volume-mean-40,0.0
9,volume-up-mean-5,-0.0


#### 随机森林特征重要性

In [18]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_transformed, y)

pd.DataFrame(zip(X.columns, rfc.feature_importances_))

Unnamed: 0,0,1
0,open,0.025245
1,close,0.012611
2,high,0.013329
3,low,0.018233
4,volume,0.027845
5,volume-mean-5,0.019657
6,volume-mean-10,0.025552
7,volume-mean-20,0.01372
8,volume-mean-40,0.021555
9,volume-up-mean-5,0.019821
