# 特征工程小案例
Kaggle上有这样一个比赛：[城市自行车共享系统使用状况](https://www.kaggle.com/c/bike-sharing-demand)。

提供的数据为2年内按小时做的自行车租赁数据，其中训练集由每个月的前19天组成，测试集由20号之后的时间组成。

In [24]:
# Load the training data.
import pandas as pd
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are skipped and a
# warning is emitted for each, matching the old default (warn_bad_lines=True).
# NOTE: requires pandas >= 1.3.
data = pd.read_csv('kaggle_bike_competition_train.csv', header=0, on_bad_lines='warn')

In [25]:
# Take a quick look at the first few rows of the data
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011/1/1 0:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011/1/1 1:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011/1/1 2:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011/1/1 3:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011/1/1 4:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


### 把datetime域切成 日期 和 时间 两部分。

In [5]:
# Split the raw datetime column into a calendar-date column and a
# clock-time column.
temp = pd.DatetimeIndex(data['datetime'])
data['date'], data['time'] = temp.date, temp.time
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time
0,2011/1/1 0:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,00:00:00
1,2011/1/1 1:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,01:00:00
2,2011/1/1 2:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,02:00:00
3,2011/1/1 3:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,03:00:00
4,2011/1/1 4:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,04:00:00


### 时间那部分，好像最细的粒度也只到小时，所以我们干脆把小时字段拿出来作为更简洁的特征。

In [6]:
# Build the hour-of-day feature.
# Read the hour straight from the original timestamp column instead of
# formatting the datetime.time objects through pd.to_datetime (which pins them
# to a dummy date and relies on implicit conversion of non-string values).
data['hour'] = pd.DatetimeIndex(data['datetime']).hour
data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time,hour
0,2011/1/1 0:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,2011-01-01,00:00:00,0
1,2011/1/1 1:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2011-01-01,01:00:00,1
2,2011/1/1 2:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2011-01-01,02:00:00,2
3,2011/1/1 3:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,2011-01-01,03:00:00,3
4,2011/1/1 4:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,2011-01-01,04:00:00,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012/12/19 19:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2012-12-19,19:00:00,19
10882,2012/12/19 20:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2012-12-19,20:00:00,20
10883,2012/12/19 21:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,2012-12-19,21:00:00,21
10884,2012/12/19 22:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2012-12-19,22:00:00,22


### 仔细想想，数据只告诉了我们具体是哪一天。按照一般逻辑，周末和工作日出行的人数应该不同。我们设定一个新的字段dayofweek表示是一周中的第几天(0表示星期一，6表示星期日)。再设定一个字段dateDays表示距离数据中第一天已经过去了多少天(猜测在欧美国家，这种绿色环保的出行方式会迅速普及吧)

In [7]:
# Work on a real datetime64 view of the date column so that the arithmetic
# below is vectorized and does not depend on object-dtype behaviour.
date_ts = pd.to_datetime(data['date'])

# Categorical day-of-week feature (pandas convention: Monday=0 ... Sunday=6).
data['dayofweek'] = date_ts.dt.dayofweek

# Elapsed-time feature: whole days since the first row's date.
# `.dt.days` replaces `.astype('timedelta64[D]')`, which pandas >= 2.0 rejects,
# and `.iloc[0]` selects the first row by position rather than by the label 0.
# The cast to float keeps the dtype the old code produced (0.0, 1.0, ...).
data['dateDays'] = (date_ts - date_ts.iloc[0]).dt.days.astype(float)

data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time,hour,dayofweek,dateDays
0,2011/1/1 0:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,2011-01-01,00:00:00,0,5,0.0
1,2011/1/1 1:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2011-01-01,01:00:00,1,5,0.0
2,2011/1/1 2:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2011-01-01,02:00:00,2,5,0.0
3,2011/1/1 3:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,2011-01-01,03:00:00,3,5,0.0
4,2011/1/1 4:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,2011-01-01,04:00:00,4,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012/12/19 19:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2012-12-19,19:00:00,19,2,718.0
10882,2012/12/19 20:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2012-12-19,20:00:00,20,2,718.0
10883,2012/12/19 21:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,2012-12-19,21:00:00,21,2,718.0
10884,2012/12/19 22:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2012-12-19,22:00:00,22,2,718.0


### 其实我们刚才一直都在猜测，并不知道真实的日期相关的数据分布对吧，所以我们要做一个小小的统计来看看真实的数据分布，我们统计一下一周各天的自行车租赁情况(分注册的人和没注册的人)

In [8]:
# Group the rows by day of week; the grouping is reused by the
# registered-user summary in the next cell.
byday = data.groupby(by='dayofweek')
# Total rentals by casual (unregistered) users for each day of the week
byday['casual'].agg('sum').reset_index()

Unnamed: 0,dayofweek,casual
0,0,46288
1,1,35365
2,2,34931
3,3,37283
4,4,47402
5,5,100782
6,6,90084


In [9]:
# Total rentals by registered users for each day of the week
byday['registered'].agg('sum').reset_index()

Unnamed: 0,dayofweek,registered
0,0,249008
1,1,256620
2,2,257295
3,3,269118
4,4,255102
5,5,210736
6,6,195462


### 周末既然有不同，就单独拿一列出来给星期六，再单独拿一列出来给星期日

In [10]:
# Add one 0/1 indicator column per weekend day.
# dayofweek follows the pandas convention Monday=0, so 5 is Saturday and 6 is
# Sunday. Assigning the whole column at once avoids the chained assignment
# `data.Saturday[mask] = 1`, which raises SettingWithCopyWarning and does not
# modify `data` when pandas copy-on-write is enabled.
data['Saturday'] = (data['dayofweek'] == 5).astype(int)
data['Sunday'] = (data['dayofweek'] == 6).astype(int)

data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time,hour,dayofweek,dateDays,Saturday,Sunday
0,2011/1/1 0:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,2011-01-01,00:00:00,0,5,0.0,1,0
1,2011/1/1 1:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2011-01-01,01:00:00,1,5,0.0,1,0
2,2011/1/1 2:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2011-01-01,02:00:00,2,5,0.0,1,0
3,2011/1/1 3:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,2011-01-01,03:00:00,3,5,0.0,1,0
4,2011/1/1 4:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,2011-01-01,04:00:00,4,5,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012/12/19 19:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2012-12-19,19:00:00,19,2,718.0,0,0
10882,2012/12/19 20:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2012-12-19,20:00:00,20,2,718.0,0,0
10883,2012/12/19 21:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,2012-12-19,21:00:00,21,2,718.0,0,0
10884,2012/12/19 22:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2012-12-19,22:00:00,22,2,718.0,0,0


### 从数据中，把原始的时间字段等踢掉

In [11]:
# Drop the raw timestamp, the total `count`, and the intermediate
# date/time/dayofweek columns; the remaining columns form dataRel.
dataRel = data.drop(columns=['datetime', 'count', 'date', 'time', 'dayofweek'])
dataRel.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,hour,dateDays,Saturday,Sunday
0,1,0,0,1,9.84,14.395,81,0.0,3,13,0,0.0,1,0
1,1,0,0,1,9.02,13.635,80,0.0,8,32,1,0.0,1,0
2,1,0,0,1,9.02,13.635,80,0.0,5,27,2,0.0,1,0
3,1,0,0,1,9.84,14.395,75,0.0,3,10,3,0.0,1,0
4,1,0,0,1,9.84,14.395,75,0.0,0,1,4,0.0,1,0


### 特征向量化
我们这里打算用scikit-learn来建模。对于pandas的dataframe我们有方法/函数可以直接转成python中的dict。
另外，在这里我们要对离散值和连续值特征区分一下了，以便之后分开做不同的特征处理。

In [12]:
from sklearn.feature_extraction import DictVectorizer

# Continuous features: one {column: value} dict per row.
featureConCols = ['temp','atemp','humidity','windspeed','dateDays','hour']
dataFeatureCon = dataRel[featureConCols]
# NOTE(review): a string placeholder in a numeric column would be one-hot
# encoded by DictVectorizer as "<col>=NA"; confirm there are no missing values.
dataFeatureCon = dataFeatureCon.fillna( 'NA' ) #in case I missed any
# to_dict('records') builds the per-row dicts directly. The previous
# `.T.to_dict().values()` transposed the whole frame and keyed rows by index
# label, which silently drops rows whenever index labels repeat.
X_dictCon = dataFeatureCon.to_dict('records')

# Categorical (discrete) features: one dict per row as well.
featureCatCols = ['season','holiday','workingday','weather','Saturday', 'Sunday']
dataFeatureCat = dataRel[featureCatCols]
dataFeatureCat = dataFeatureCat.fillna( 'NA' ) #in case I missed any
X_dictCat = dataFeatureCat.to_dict('records')

# Vectorize each feature group with its own DictVectorizer so that both
# fitted mappings (feature names / column order) stay available afterwards.
# Re-fitting a single vectorizer discarded the categorical mapping.
vec_cat = DictVectorizer(sparse = False)
X_vec_cat = vec_cat.fit_transform(X_dictCat)
vec_con = DictVectorizer(sparse = False)
X_vec_con = vec_con.fit_transform(X_dictCon)

# Backward compatibility: `vec` used to end up fitted on the continuous features.
vec = vec_con

In [14]:
# Preview the continuous-feature frame before vectorization
dataFeatureCon.head()

Unnamed: 0,temp,atemp,humidity,windspeed,dateDays,hour
0,9.84,14.395,81,0.0,0.0,0
1,9.02,13.635,80,0.0,0.0,1
2,9.02,13.635,80,0.0,0.0,2
3,9.84,14.395,75,0.0,0.0,3
4,9.84,14.395,75,0.0,0.0,4


In [15]:
# Inspect the vectorized continuous features. The columns come out sorted by
# feature name, not in featureConCols order (the first value below is atemp).
X_vec_con

array([[ 14.395 ,   0.    ,   0.    ,  81.    ,   9.84  ,   0.    ],
       [ 13.635 ,   0.    ,   1.    ,  80.    ,   9.02  ,   0.    ],
       [ 13.635 ,   0.    ,   2.    ,  80.    ,   9.02  ,   0.    ],
       ...,
       [ 15.91  , 718.    ,  21.    ,  61.    ,  13.94  ,  15.0013],
       [ 17.425 , 718.    ,  22.    ,  61.    ,  13.94  ,   6.0032],
       [ 16.665 , 718.    ,  23.    ,  66.    ,  13.12  ,   8.9981]])

In [16]:
# Preview the categorical-feature frame before vectorization
dataFeatureCat.head()

Unnamed: 0,season,holiday,workingday,weather,Saturday,Sunday
0,1,0,0,1,1,0
1,1,0,0,1,1,0
2,1,0,0,1,1,0
3,1,0,0,1,1,0
4,1,0,0,1,1,0


In [17]:
# Inspect the vectorized categorical features (still raw codes, not yet one-hot)
X_vec_cat

array([[1., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 1., 0.],
       ...,
       [0., 0., 0., 4., 1., 1.],
       [0., 0., 0., 4., 1., 1.],
       [0., 0., 0., 4., 1., 1.]])

### 标准化连续值特征
我们要对连续值属性做一些处理，最基本的当然是标准化，让连续值属性处理过后均值为0，方差为1。
这样的数据放到模型里，对模型训练的收敛和模型的准确性都有好处

In [18]:
from sklearn import preprocessing
# Standardize every continuous column to zero mean and unit variance.
# The fitted scaler is kept in `scaler`.
scaler = preprocessing.StandardScaler()
X_vec_con = scaler.fit_transform(X_vec_con)
X_vec_con

array([[-1.09273697, -1.70912256, -1.66894356,  0.99321305, -1.33366069,
        -1.56775367],
       [-1.18242083, -1.70912256, -1.52434128,  0.94124921, -1.43890721,
        -1.56775367],
       [-1.18242083, -1.70912256, -1.379739  ,  0.94124921, -1.43890721,
        -1.56775367],
       ...,
       [-0.91395927,  1.70183906,  1.36770431, -0.04606385, -0.80742813,
         0.26970368],
       [-0.73518157,  1.70183906,  1.51230659, -0.04606385, -0.80742813,
        -0.83244247],
       [-0.82486544,  1.70183906,  1.65690887,  0.21375537, -0.91267464,
        -0.46560752]])

### 类别特征编码
最常用的当然是one-hot编码咯，比如颜色 红、蓝、黄 会被编码为[1, 0, 0]，[0, 1, 0]，[0, 0, 1]

In [19]:
from sklearn import preprocessing
# One-hot encode the categorical codes and densify the encoder's sparse
# output into a regular array. The fitted encoder is kept in `enc`.
enc = preprocessing.OneHotEncoder().fit(X_vec_cat)
encoded_sparse = enc.transform(X_vec_cat)
X_vec_cat = encoded_sparse.toarray()
X_vec_cat

array([[0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]])

### 把特征拼一起
把离散和连续的特征都组合在一起

In [20]:
import numpy as np
# Stack the standardized continuous columns and the one-hot categorical
# columns side by side (continuous columns first).
X_vec = np.hstack([X_vec_con, X_vec_cat])
X_vec

array([[-1.09273697, -1.70912256, -1.66894356, ...,  0.        ,
         1.        ,  0.        ],
       [-1.18242083, -1.70912256, -1.52434128, ...,  0.        ,
         1.        ,  0.        ],
       [-1.18242083, -1.70912256, -1.379739  , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.91395927,  1.70183906,  1.36770431, ...,  0.        ,
         0.        ,  1.        ],
       [-0.73518157,  1.70183906,  1.51230659, ...,  0.        ,
         0.        ,  1.        ],
       [-0.82486544,  1.70183906,  1.65690887, ...,  0.        ,
         0.        ,  1.        ]])

最后的特征矩阵中，前6列是标准化后的连续值特征（DictVectorizer按特征名排序，依次为atemp、dateDays、hour、humidity、temp、windspeed），其余各列是one-hot编码后的离散值特征。

### 对结果值也处理一下
拿到结果的浮点数值

In [21]:
# Extract the two regression targets as float arrays:
# registered-user rentals and casual-user rentals.
Y_vec_reg = dataRel['registered'].astype(float).values
Y_vec_cas = dataRel['casual'].astype(float).values

In [22]:
# Inspect the registered-user target vector
Y_vec_reg

array([ 13.,  32.,  27., ..., 164., 117.,  84.])

In [23]:
# Inspect the casual-user target vector
Y_vec_cas

array([ 3.,  8.,  5., ...,  4., 12.,  4.])