In [151]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os

# 数据集预处理

此次实验数据集来源于[UCI Machine Learning Database][1]，在原始数据集上的基础上，我们用hour.csv中的一部分数据作为训练集（第1-8619条数据，命名为train.csv），一部分数据作为测试集（第16876-17379条数据，命名为test.csv）。

其中，数据集特征命名的解释如下：

```
=========================================
Dataset characteristics
=========================================	

	- instant: record index
	- dteday : date
	- season : season (1:springer, 2:summer, 3:fall, 4:winter)
	- yr : year (0: 2011, 1:2012)
	- mnth : month ( 1 to 12)
	- hr : hour (0 to 23)
	- holiday : weather day is holiday or not 
    (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, 
        Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
	- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	- hum: Normalized humidity. The values are divided to 100 (max)
	- windspeed: Normalized wind speed. The values are divided to 67 (max)
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered
	


```
[1]:https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset

## 读取训练集和测试集

In [152]:
trainSet_origin = pd.read_csv('data\\train.csv')
testSet_origin = pd.read_csv('data\\test.csv')
trainSet_origin.shape, testSet_origin.shape

((8619, 15), (504, 15))

## 数据集预览

这里先输出训练集的前5个数据进行数据预览。可以看到，TA给的数据集去除了casual和registered这个两个特征。

In [153]:
trainSet_origin.head(5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,2011/1/1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,2,2011/1/1,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,3,2011/1/1,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,4,2011/1/1,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,5,2011/1/1,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


## 特征预处理以及特征归一化/标准化

结合上面数据集特征命名的解释，我们可总结如下结论：

- **season, mnth, weekday, hr, weathersit, yr, holiday这7个特征都是类别特征(categorical feature)**，这类特征的各个取值并**不存在着取值上的“大小”关系（更严格来说，不存在“序”的关系）**，比如season有4种取值1、2、3、4，但是从该特征的含义上来讲这4种取值之间并没有“序”(order)的关系，因此要使用**独热编码**的方式来讲season特征**替代划分**为4个特征，比如名为season_1、season_2、season_3、season_4，这些引入的变量也称为**哑变量(Dummy variables)**，这样才可以**避免因强行给类别数据增加“序”而带来的许多问题**。

    - 由于yr, holiday只有两个值，依旧用0或者1表示即可。
    
    - season, mnth, weekday, hr, weathersit这5个特征就用**独热编码**的方式**替代划分**为多个特征


- 以下是几个**对训练神经网络没有意义或者重复的特征，这类特征应直接删除掉**。

    - instant特征只是记录的序号，这并不能给我们提供什么信息。
    
    - dteday特征只是记录的日期，时间信息对我们训练神经网络也没有什么意义。
    
    - workingday特征与holiday、weekday这两个特征有重复的地方，不能提供额外的信息。
    
    - atemp特征是受temp特征影响的，其不能提供temp特征所不能提供的信息。


- 除去重复的特征，以下对几个**连续型特征的归一化/标准化**问题的讨论。
    
    - 原始数据集中已经归一化的特征：
    
        - temp特征：代表的是摄氏温度，其通过除以最大值41进行归一化。由于温度的实际最大值应是不会与41度差别太大，这样归一化方式还是合理的。具体考证方法还是要看当地的气候，这个还是不要太较真。
        
        - hum特征：代表湿度，其通过除以最大值100进行归一化，这是完全正确的。
        
        - windspeed特征：代表风速，其通过除以最大值67进行归一化。由于风速的实际最大值应是不会与67差别太大，这样归一化方式推测是合理的。具体考证方法还是要看当地的气候，这个还是不要太较真。
        
    - 原始数据集中未归一化的特征：
    
        - cnt特征：目标预测值，代表预测的一个小时的单车租赁量。这个是没有进行归一化/标准化的特征，所以还是要先进行归一化/标准化比较妥当，而又**由于一个小时的单车租赁量的最大值是不明确的，因此z-score标准差标准化对该特征进行处理，其适用于属性A的最大值和最小值未知的情况，或有超出取值范围的离群数据的情况**。对应转化函数为：$$X^{*}=\frac{X-mean(X)}{std(X)}$$
    
### 特征预处理函数实现

In [154]:
def featurePreprocessing(dataSet):
    '''特征预处理：替代类别特征为哑变量、去除无关特征'''
    categorical_features = ['season', 'mnth', 'weekday', 'hr', 'weathersit']
    useless_features = ['instant', 'dteday', 'workingday', 'atemp']
    features2drop = categorical_features + useless_features 

    for feature in categorical_features:
        #创键 哑变量
        dummies = pd.get_dummies(dataSet[feature], prefix=feature, drop_first=False)
        #合并数据集
        dataSet = pd.concat([dummies, dataSet], axis=1)
        
    #去除类别特征以及无关特征
    return dataSet.drop(features2drop, axis=1)
    
trainSet = featurePreprocessing(trainSet_origin.copy())
testSet = featurePreprocessing(testSet_origin.copy())

### 预处理效果预览

这里先输出预处理后的前5个数据，可见数据的特征数目上涨到了60个。

In [155]:
trainSet.head()

Unnamed: 0,weathersit_1,weathersit_2,weathersit_3,weathersit_4,hr_0,hr_1,hr_2,hr_3,hr_4,hr_5,...,season_1,season_2,season_3,season_4,yr,holiday,temp,hum,windspeed,cnt
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0,0,0.24,0.81,0.0,16
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0,0,0.22,0.8,0.0,40
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0,0,0.22,0.8,0.0,32
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0,0,0.24,0.75,0.0,13
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0,0,0.24,0.75,0.0,1


输出所有的特征名称，可见类别变量均被哑变量替换了，无关变量也被删除了。

In [156]:
trainSet.columns

Index(['weathersit_1', 'weathersit_2', 'weathersit_3', 'weathersit_4', 'hr_0',
       'hr_1', 'hr_2', 'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9',
       'hr_10', 'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17',
       'hr_18', 'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23', 'weekday_0',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'mnth_1', 'mnth_2', 'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6',
       'mnth_7', 'mnth_8', 'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12',
       'season_1', 'season_2', 'season_3', 'season_4', 'yr', 'holiday', 'temp',
       'hum', 'windspeed', 'cnt'],
      dtype='object')

### 特征归一化/标准化

这里对训练集的特征cnt需要进行标准化，并保存标准化时该特征数据的均值和方差，以供后续预测时还原数据使用。

In [157]:
mean_of_cnt, std_of_cnt = trainSet['cnt'].mean(), trainSet['cnt'].std()
trainSet['cnt'] = (trainSet['cnt'] - mean_of_cnt)/std_of_cnt

In [158]:
trainSet['cnt'].describe()

count    8.619000e+03
mean     2.658017e-17
std      1.000000e+00
min     -1.067540e+00
25%     -8.434575e-01
50%     -2.608435e-01
75%      5.010363e-01
max      3.787577e+00
Name: cnt, dtype: float64

### 保存预处理后的数据集

In [159]:
dirPath = "proprocessed data"
if not os.path.exists(dirPath):
    os.mkdir(dirPath)
    
trainSet.to_csv(dirPath + '\\train.csv')
testSet.to_csv(dirPath + '\\test.csv')

## 从训练集中划分验证集

典型的从训练集中划分验证集的方法是：划分训练集中的30%为验证集，划分过程采用随机选取的方式。但是这里**由于数据时有时间序列特性的，因此便采取划分训练集的后30%为验证集**。

In [170]:
trainSet_backup = trainSet.copy()

In [174]:
splitRate = 0.3 #划分比例
splitNum = int(trainSet_backup.shape[0]*splitRate) #划分的数目
#划分 数据集 和 标签
temp = trainSet_backup.drop(['cnt'], axis=1), trainSet_backup['cnt']
#得到 训练集 和 标签
trainSet, trainSet_label = temp[0][:-splitNum], temp[1][:-splitNum] 
#得到 验证集 和 标签
validateSet, validateSet_label = temp[0][-splitNum:0], temp[1][-splitNum:0] 
print('划分后的训练集和验证集的维度：', trainSet.shape,  validateSet.shape)

划分后的训练集和验证集的维度： (6034, 56) (0, 56)


# 参考资料

- 1.[用python参加Kaggle的些许经验总结][用python参加Kaggle的些许经验总结]
- 2.[Pandas 处理 dummy variable][Pandas 处理 dummy variable]
- 3.[数据标准化/归一化normalization][数据标准化/归一化normalization]

[用python参加Kaggle的些许经验总结]:http://www.jianshu.com/p/32def2294ae6
[Pandas 处理 dummy variable]:http://blog.csdn.net/weiwei9363/article/details/78255210
[数据标准化/归一化normalization]:http://blog.csdn.net/pipisorry/article/details/52247379