## 三、数据预处理
__author__ = 'hehuihui@caicloud.io'

In [1]:
from __future__ import division

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

#### 读取数据

In [2]:
# Load the daily quotes for stock 000002 and discard the 'code' column
# right away, so `df` only carries the remaining columns.
df = pd.read_csv('../data/000002.csv').drop('code', axis=1)

#### a) 数据校验
####     - 检查数据中是否有缺失值，过滤缺失值；


In [17]:
# Check every column for missing values.
# any(): True for a column that contains at least one missing value.
# all(): True for a column that consists only of missing values.
# The parenthesised print form works on both Python 2 and Python 3.
print(df.isnull().any())
print('\n')
print(df.isnull().all())

date          False
open          False
close         False
high          False
low           False
volume        False
pct_change     True
dtype: bool


date          False
open          False
close         False
high          False
low           False
volume        False
pct_change    False
dtype: bool


In [25]:
# Introduce a missing value: the percentage change of the close price has
# no predecessor for the first row, so that row's pct_change is NaN.
df['pct_change'] = df['close'].pct_change() * 100
# Show each row that has at least one missing value exactly once.
# A row-wise mask is used instead of indexing with the 2-D array
# `df.isnull().values == True`, which repeats a row once per NaN it holds.
df[df.isnull().any(axis=1)]

Unnamed: 0,date,open,close,high,low,volume,pct_change
0,2012-10-08,7.011,6.835,7.095,6.777,473161.0,


In [26]:
# Filter out missing values: dropna() defaults to how='any', axis=0,
# i.e. it removes every row containing at least one NaN. `df` itself is
# left unchanged; only the first five remaining rows are displayed.
df.dropna().head(5)

Unnamed: 0,date,open,close,high,low,volume,pct_change
1,2012-10-09,6.86,7.003,7.028,6.852,487942.0,2.457937
2,2012-10-10,7.011,7.003,7.011,6.869,507343.0,0.0
3,2012-10-11,6.978,6.944,6.978,6.885,254369.0,-0.842496
4,2012-10-12,6.969,6.902,7.02,6.86,311356.0,-0.604839
5,2012-10-15,6.919,6.86,6.919,6.818,213705.0,-0.608519


In [27]:
# Fill missing values by backward fill: each NaN takes the next valid
# value that follows it in the same column.
# df.bfill() is equivalent to fillna(method='bfill'), whose `method`
# keyword is deprecated in recent pandas releases.
df.bfill().head()

Unnamed: 0,date,open,close,high,low,volume,pct_change
0,2012-10-08,7.011,6.835,7.095,6.777,473161.0,2.457937
1,2012-10-09,6.86,7.003,7.028,6.852,487942.0,2.457937
2,2012-10-10,7.011,7.003,7.011,6.869,507343.0,0.0
3,2012-10-11,6.978,6.944,6.978,6.885,254369.0,-0.842496
4,2012-10-12,6.969,6.902,7.02,6.86,311356.0,-0.604839


#### - 检查数据中是否有涨跌幅绝对值>10.5%的值（即涨幅>10.5%或跌幅>10.5%）

In [29]:
# Boolean mask of the rows whose percentage change lies outside the
# [-10.5, 10.5] band. The vectorised comparison replaces a per-element
# Python lambda; NaN entries compare as False, exactly as before.
index = df['pct_change'].abs() > 10.5
# Display the offending rows (an empty frame means there are none).
df.loc[index, :]

Unnamed: 0,date,open,close,high,low,volume,pct_change


#### b) 数据归一化：使用Scaler对数据进行归一化和还原（反归一化）

In [35]:
# http://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Take the columns from 'open' through 'volume' as a NumPy array.
# Label slicing with .loc is inclusive on both ends, so this covers
# open, close, high, low and volume.
X = df.loc[:, 'open':'volume'].values

# Create the scaler.
scaler = MinMaxScaler()
# Normalise: fit the scaler on X and transform X in one step.
X_transformed = scaler.fit_transform(X)
# De-normalise: map the scaled values back to the original scale.
X_inverse_transformed = scaler.inverse_transform(X_transformed)

# Uncomment to inspect the results:
#print X_transformed
#pd.DataFrame(X_inverse_transformed)

#### c) 交叉验证：使用Cross Validation对股票时间序列数据做交叉验证，生成训练集和测试集

In [45]:
# http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-of-time-series-data
# Cross-validating a time series has to respect chronological order:
# every test fold must come later in time than its training fold.

from sklearn.model_selection import TimeSeriesSplit

# Splitter producing three chronological train/test splits.
tscv = TimeSeriesSplit(n_splits=3)

# For each split, show the first and last ten row indices of the
# training fold, then of the test fold.
for train, test in tscv.split(X):
    train_ends = (train[:10], train[-10:])
    test_ends = (test[:10], test[-10:])
    print("%s %s" % train_ends)
    print("%s %s\n" % test_ends)

# `train` and `test` still hold the indices of the final split here, so
# these are the training set and test set of the last fold.
X_train, X_test = X[train], X[test]

[0 1 2 3 4 5 6 7 8 9] [258 259 260 261 262 263 264 265 266 267]
[268 269 270 271 272 273 274 275 276 277] [524 525 526 527 528 529 530 531 532 533]

[0 1 2 3 4 5 6 7 8 9] [524 525 526 527 528 529 530 531 532 533]
[534 535 536 537 538 539 540 541 542 543] [790 791 792 793 794 795 796 797 798 799]

[0 1 2 3 4 5 6 7 8 9] [790 791 792 793 794 795 796 797 798 799]
[800 801 802 803 804 805 806 807 808 809] [1056 1057 1058 1059 1060 1061 1062 1063 1064 1065]

