In [1]:
# Display the value of every top-level expression in a cell, not only the last
# one; the cells below rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os
from tqdm import tqdm, tnrange, tqdm_notebook

# Short alias for the DataFrame class.
DF = pd.DataFrame

# 读取训练集和测试集

In [2]:
# Build the paths with os.path.join so the notebook also runs where "\" is not
# the path separator; on Windows the resulting paths are unchanged.
trainSet_origin = pd.read_csv(os.path.join('data', 'train.csv'))
testSet_origin = pd.read_csv(os.path.join('data', 'test.csv'))

# (rows, columns) of the raw training and test sets.
trainSet_origin.shape, testSet_origin.shape

((16659, 9), (742, 9))

In [3]:
# Preview the first rows of the raw training set.
trainSet_origin.head()

Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,1,2011/1/1,0,1,9.84,14.395,81,0,16
1,2,2011/1/1,1,1,9.02,13.635,80,0,40
2,3,2011/1/1,2,1,9.02,13.635,80,0,32
3,4,2011/1/1,3,1,9.84,14.395,75,0,13
4,5,2011/1/1,4,1,9.84,14.395,75,0,1


In [4]:
# Preview the first rows of the raw test set.
testSet_origin.head()

Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,16638,2012/12/1,0,1,10.66,15.15,81,0.0,?
1,16639,2012/12/1,1,1,10.66,15.15,81,0.0,?
2,16640,2012/12/1,2,2,10.66,15.15,81,0.0,?
3,16641,2012/12/1,3,2,10.66,13.635,81,8.9981,?
4,16642,2012/12/1,4,1,10.66,14.395,81,6.0032,?


# 去除异常数据

## 去除xxxx/xx/00类数据

用 Excel 打开训练集，发现有 22 条异常数据。这类数据有一个共同特点：dteday 的取值是 xxxx/xx/00 这样的形式（日期的“日”为 00）。测试集中则没有这种数据。

In [5]:
# Select the rows whose date string contains "/00", then show them and their count.
zeroDayMask = trainSet_origin["dteday"].str.contains('/00')
t = trainSet_origin.loc[zeroDayMask]
t
t.shape

Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
688,0,2011/1/00,0,0,0,0.0,0,0,38189
1338,0,2011/2/00,0,0,0,0.0,0,0,48215
2069,0,2011/3/00,0,0,0,0.0,0,0,64045
2789,0,2011/4/00,0,0,0,0.0,0,0,94870
3534,0,2011/5/00,0,0,0,0.0,0,0,135821
4255,0,2011/6/00,0,0,0,0.0,0,0,143512
5000,0,2011/7/00,0,0,0,0.0,0,0,143512
5732,0,2011/8/00,0,0,0,0.0,0,0,136691
6450,0,2011/9/00,0,0,0,0.0,0,0,127418
7194,0,2011/10/00,0,0,0,0.0,0,0,123511


(22, 9)

In [6]:
# Keep only the rows whose date does not contain "/00"; the shape is displayed
# before and after so the number of removed rows is visible.
trainSet_origin.shape
hasZeroDay = trainSet_origin["dteday"].str.contains('/00')
trainSet_origin = trainSet_origin.loc[~hasZeroDay]
trainSet_origin.shape

(16659, 9)

(16637, 9)

## 去除带有缺失值的数据

训练集中有些数据是用？来代替缺失值的，用excel查看后发现有22处缺失值，由于整个数据集比较大，所以把这些数据直接删除掉也无所谓。

In [7]:
# List the column names that will be checked for the "?" placeholder below.
trainSet_origin.columns

Index(['instant', 'dteday', 'hr', 'weathersit', 'temp', 'atemp', 'hum',
       'windspeed', 'cnt'],
      dtype='object')

In [8]:
# One lookup per feature column: display the rows in which that column holds
# the "?" placeholder.
trainSet_origin.loc[trainSet_origin["dteday"] == "?"]
trainSet_origin.loc[trainSet_origin["hr"] == "?"]
trainSet_origin.loc[trainSet_origin["weathersit"] == "?"]
trainSet_origin.loc[trainSet_origin["temp"] == "?"]
trainSet_origin.loc[trainSet_origin["atemp"] == "?"]
trainSet_origin.loc[trainSet_origin["hum"] == "?"]
trainSet_origin.loc[trainSet_origin["windspeed"] == "?"]

Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt


Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
10751,10737,2012/3/28,?,1,20.5,24.24,42,26.0027,222
14416,14398,2012/8/28,?,1,27.88,31.82,83,16.9979,15


Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
304,305,2011/1/14,3,?,4.1,6.82,54,7.0015,1
3669,3665,2011/6/6,14,?,31.98,34.09,31,8.9981,145
15813,15793,2012/10/25,5,?,21.32,25.0,88,11.0014,55


Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
218,219,2011/1/10,9,2,?,6.06,50,16.9979,94
9783,9770,2012/2/17,0,2,?,16.665,87,0.0,34
13954,13936,2012/8/8,20,1,?,34.85,66,12.998,500
16143,16122,2012/11/9,11,1,?,20.455,47,19.9995,251


Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
479,480,2011/1/22,0,1,1.64,?,45,16.9979,13
6097,6090,2011/9/16,7,1,16.4,?,71,19.0012,299
8817,8805,2012/1/7,16,2,22.96,?,37,19.0012,401
9038,9026,2012/1/16,22,2,12.3,?,49,23.9994,43
10596,10582,2012/3/22,0,1,21.32,?,83,6.0032,41
11992,11977,2012/5/19,5,1,18.86,?,67,0.0,12
13726,13709,2012/7/30,9,2,30.34,?,70,0.0,293


Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
5255,5249,2011/8/11,14,1,32.8,34.85,?,0.0,142
14101,14083,2012/8/14,23,1,29.52,34.09,?,11.0014,167
14672,14653,2012/9/7,17,1,31.98,37.12,?,22.0028,772


Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
355,356,2011/1/16,7,2,9.84,10.605,56,?,3
12581,12565,2012/6/12,17,2,27.06,29.545,89,?,681
15460,15440,2012/10/10,12,1,22.14,25.76,56,?,390


In [9]:
# Turn the "?" placeholders into real missing values and drop every row that
# contains one. np.nan is used instead of the np.NAN alias, which was removed
# in NumPy 2.0.
trainSet_origin.shape
trainSet_origin = trainSet_origin.replace("?", np.nan)
trainSet_origin = trainSet_origin.dropna()
trainSet_origin.shape

(16637, 9)

(16615, 9)

# 去除无关特征

In [10]:
# Preview both frames before any column is removed.
trainSet_origin.head()
testSet_origin.head()

Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,1,2011/1/1,0,1,9.84,14.395,81,0,16
1,2,2011/1/1,1,1,9.02,13.635,80,0,40
2,3,2011/1/1,2,1,9.02,13.635,80,0,32
3,4,2011/1/1,3,1,9.84,14.395,75,0,13
4,5,2011/1/1,4,1,9.84,14.395,75,0,1


Unnamed: 0,instant,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,16638,2012/12/1,0,1,10.66,15.15,81,0.0,?
1,16639,2012/12/1,1,1,10.66,15.15,81,0.0,?
2,16640,2012/12/1,2,2,10.66,15.15,81,0.0,?
3,16641,2012/12/1,3,2,10.66,13.635,81,8.9981,?
4,16642,2012/12/1,4,1,10.66,14.395,81,6.0032,?


instant 只是记录的序号，对预测没有用，因此从训练集和测试集中一并删掉。测试集的最后一列（cnt）也没有用，会在后面还原训练集和测试集之后再删除。

In [11]:
# Columns treated as irrelevant; they are removed in place from both frames.
useless_features = ['instant'] #, 'atemp'
for frame in (trainSet_origin, testSet_origin):
    frame.drop(useless_features, axis=1, inplace=True)

In [12]:
# Preview both frames after the column removal.
trainSet_origin.head()
testSet_origin.head()

Unnamed: 0,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,2011/1/1,0,1,9.84,14.395,81,0,16
1,2011/1/1,1,1,9.02,13.635,80,0,40
2,2011/1/1,2,1,9.02,13.635,80,0,32
3,2011/1/1,3,1,9.84,14.395,75,0,13
4,2011/1/1,4,1,9.84,14.395,75,0,1


Unnamed: 0,dteday,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,2012/12/1,0,1,10.66,15.15,81,0.0,?
1,2012/12/1,1,1,10.66,15.15,81,0.0,?
2,2012/12/1,2,2,10.66,15.15,81,0.0,?
3,2012/12/1,3,2,10.66,13.635,81,8.9981,?
4,2012/12/1,4,1,10.66,14.395,81,6.0032,?


# 去除重复数据

In [13]:
# Drop exact duplicate rows in place, then show the resulting shape.
trainSet_origin.drop_duplicates(inplace=True)
trainSet_origin.shape

(16615, 8)

# 划分dteday特征

dteday特征包含年月日信息，可能对算法有用，因此打算将该特征划分为三个特征。

首先看一下训练集和测试集在dteday这个特征的数据分布。

In [14]:
def test_dteday(df):
    date = list(df.dteday.str.split("/"))

    year = [int(i[0]) for i in date]
    month = [int(i[1]) for i in date]
    day = [int(i[2]) for i in date]

    print(list(Counter(year).keys()))
    print(list(Counter(month).keys()))
    print(list(Counter(day).keys()))
    
    
# Distinct year / month / day values of the training set, a separator string,
# then the same for the test set.
test_dteday(trainSet_origin)
'---------'
test_dteday(testSet_origin)

[2011, 2012]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]


'---------'

[2012]
[12]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]


从这里也可以看到，测试集中的数据只包含 2012 年 12 月的数据，也就是说，年份这个特征是肯定没有用的。

但是月份这个特征还待考量。虽然测试集上只有12月的数据，但是在使用验证集测试参数时可能会使用到月份数据，因此还是保留月份。

至于日这个特征，这里将其删除。

In [15]:
# Split each "year/month/day" string and keep only the month component.
date = trainSet_origin.dteday.str.split("/").tolist()
month = list(map(lambda parts: int(parts[1]), date))

# Put the new column at position 1, then discard the original date string.
trainSet_origin.insert(loc=1, column='month', value=month)
trainSet_origin.drop(['dteday'], axis=1, inplace=True)

trainSet_origin.shape
trainSet_origin.head()

(16615, 8)

Unnamed: 0,month,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,9.84,14.395,81,0,16
1,1,1,1,9.02,13.635,80,0,40
2,1,2,1,9.02,13.635,80,0,32
3,1,3,1,9.84,14.395,75,0,13
4,1,4,1,9.84,14.395,75,0,1


In [16]:
# Same month extraction as for the training set. The day component is not
# parsed any more: the notebook discards it, so the former `day` list was
# computed but never used.
date = list(testSet_origin.dteday.str.split("/"))

month = [int(i[1]) for i in date]

# Put the new column at position 1, then discard the original date string.
testSet_origin.insert(1, 'month', month)
testSet_origin.drop(['dteday'], axis=1, inplace=True)

testSet_origin.shape
testSet_origin.head()

(742, 8)

Unnamed: 0,month,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,12,0,1,10.66,15.15,81,0.0,?
1,12,1,1,10.66,15.15,81,0.0,?
2,12,2,2,10.66,15.15,81,0.0,?
3,12,3,2,10.66,13.635,81,8.9981,?
4,12,4,1,10.66,14.395,81,6.0032,?


# 合并训练集和测试集

由于后续需要对训练集和测试集都进行同样的数据处理，在处理过程中可能会涉及到数据集维度的变化，因此这里事先把两个数据集合并起来，以免后续因为维度不统一而出现问题。

In [17]:
# Stack the training rows on top of the test rows. The row labels of both
# frames are kept as they are (ignore_index is not set).
allDataSet_origin = pd.concat([trainSet_origin, testSet_origin], axis=0)
allDataSet_origin.shape
allDataSet_origin.head()

(17357, 8)

Unnamed: 0,month,hr,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,9.84,14.395,81,0,16
1,1,1,1,9.02,13.635,80,0,40
2,1,2,1,9.02,13.635,80,0,32
3,1,3,1,9.84,14.395,75,0,13
4,1,4,1,9.84,14.395,75,0,1


In [18]:
# Cast the three categorical columns to int64 one at a time and display how
# often each value occurs.
allDataSet_origin["weathersit"] = allDataSet_origin["weathersit"].astype("int64")
Counter(allDataSet_origin["weathersit"])

allDataSet_origin["month"] = allDataSet_origin["month"].astype("int64")
Counter(allDataSet_origin["month"])

allDataSet_origin["hr"] = allDataSet_origin["hr"].astype("int64")
Counter(allDataSet_origin["hr"])

Counter({1: 11399, 2: 4536, 3: 1419, 4: 3})

Counter({1: 1423,
         2: 1340,
         3: 1471,
         4: 1437,
         5: 1487,
         6: 1438,
         7: 1487,
         8: 1471,
         9: 1435,
         10: 1449,
         11: 1436,
         12: 1483})

Counter({0: 723,
         1: 724,
         2: 714,
         3: 696,
         4: 697,
         5: 715,
         6: 725,
         7: 725,
         8: 727,
         9: 725,
         10: 727,
         11: 725,
         12: 727,
         13: 729,
         14: 727,
         15: 729,
         16: 729,
         17: 728,
         18: 728,
         19: 728,
         20: 727,
         21: 728,
         22: 727,
         23: 727})

# 对离散变量进行编码

In [19]:
def encodeCategoricalFeatures(dataSet, categorical_features=None):
    """One-hot encode categorical columns and drop the original columns.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    categorical_features : sequence of str, optional
        Names of the columns to encode. Defaults to
        ``['month', 'hr', 'weathersit']``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each encoded column is replaced by indicator
        columns named ``<column>_<value>``. The indicator columns of each
        feature are prepended, so the feature encoded last comes first,
        followed by the remaining original columns in their original order.
    """
    if categorical_features is None:
        categorical_features = ['month', 'hr', 'weathersit']
    else:
        categorical_features = list(categorical_features)

    for feature in categorical_features:
        # Build the dummy (indicator) columns for this feature.
        dummies = pd.get_dummies(dataSet[feature], prefix=feature, drop_first=False)
        # Prepend them; rebinding the name leaves the caller's frame untouched.
        dataSet = pd.concat([dummies, dataSet], axis=1)
    return dataSet.drop(categorical_features, axis=1)

# One-hot encode the combined frame, then inspect the resulting shape, column
# order and first rows.
allDataSet = encodeCategoricalFeatures(allDataSet_origin)

allDataSet.shape
allDataSet.columns
allDataSet.head()

(17357, 45)

Index(['weathersit_1', 'weathersit_2', 'weathersit_3', 'weathersit_4', 'hr_0',
       'hr_1', 'hr_2', 'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9',
       'hr_10', 'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17',
       'hr_18', 'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23', 'month_1',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'temp',
       'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

Unnamed: 0,weathersit_1,weathersit_2,weathersit_3,weathersit_4,hr_0,hr_1,hr_2,hr_3,hr_4,hr_5,...,month_8,month_9,month_10,month_11,month_12,temp,atemp,hum,windspeed,cnt
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.84,14.395,81,0,16
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.02,13.635,80,0,40
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.02,13.635,80,0,32
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.84,14.395,75,0,13
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.84,14.395,75,0,1


# 还原训练集和测试集
对两个数据集要一并处理的步骤已经完成，这里先把两个数据集分开回去。

In [20]:
# The last `divideLine` rows of the combined frame are the test set.
divideLine = testSet_origin.shape[0]
trainSet_origin2 = allDataSet[:-divideLine].astype('float64')

# The test rows still carry "?" in `cnt`, so a whole-frame float cast cannot
# succeed. The old `raise_on_error=False` hid that failure and left every test
# column unconverted, and the keyword no longer exists in current pandas.
# Cast the feature columns explicitly and leave `cnt` as it is; it is dropped
# in a later cell.
testRows = allDataSet[-divideLine:]
testSet_origin2 = testRows.astype(
    {column: 'float64' for column in testRows.columns.drop('cnt')}
)

trainSet_origin2.shape, testSet_origin2.shape

((16615, 45), (742, 45))

In [21]:
# Splitting the combined frame must give back the original row counts.
assert len(trainSet_origin2) == len(trainSet_origin)
assert len(testSet_origin2) == len(testSet_origin)

trainSet_origin2.head()
testSet_origin2.head()

Unnamed: 0,weathersit_1,weathersit_2,weathersit_3,weathersit_4,hr_0,hr_1,hr_2,hr_3,hr_4,hr_5,...,month_8,month_9,month_10,month_11,month_12,temp,atemp,hum,windspeed,cnt
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.84,14.395,81.0,0.0,16.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.02,13.635,80.0,0.0,40.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.02,13.635,80.0,0.0,32.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.84,14.395,75.0,0.0,13.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.84,14.395,75.0,0.0,1.0


Unnamed: 0,weathersit_1,weathersit_2,weathersit_3,weathersit_4,hr_0,hr_1,hr_2,hr_3,hr_4,hr_5,...,month_8,month_9,month_10,month_11,month_12,temp,atemp,hum,windspeed,cnt
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10.66,15.15,81,0.0,?
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10.66,15.15,81,0.0,?
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10.66,15.15,81,0.0,?
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10.66,13.635,81,8.9981,?
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10.66,14.395,81,6.0032,?


# 去除测试集的无关数据

测试集最后一列（也即是cnt特征那一列）全部都是问号，这里便将其最后一列删除，以适应后续算法的数据要求。

In [22]:
# Remove the `cnt` column from the test set in place, then show the new shape.
testSet_origin2.drop(['cnt'], axis=1, inplace=True)
testSet_origin2.shape

(742, 44)

# 从训练集中划分验证集

典型的从训练集中划分验证集的方法是：划分训练集中的splitRate%为验证集，划分过程采用随机选取的方式。但是这里**由于数据是有时间序列特性的，因此便采取划分训练集的后splitRate%为验证集**。

In [23]:
#划分比例
# Fraction of the training rows (taken from the end) held out for validation.
splitRate = 0.18
# Number of validation rows.
splitNum = int(trainSet_origin2.shape[0]*splitRate)
# Slice at an explicit row position instead of a negative offset: with
# splitNum == 0, "[:-0]" would select no training rows and "[-0:]" would put
# every row into the validation set.
splitPoint = trainSet_origin2.shape[0] - splitNum
trainSet = trainSet_origin2.iloc[:splitPoint].reset_index(drop=True)
validateSet = trainSet_origin2.iloc[splitPoint:].reset_index(drop=True)

trainSet.shape, validateSet.shape
trainSet.head(3)
validateSet.head(3)

((13625, 45), (2990, 45))

Unnamed: 0,weathersit_1,weathersit_2,weathersit_3,weathersit_4,hr_0,hr_1,hr_2,hr_3,hr_4,hr_5,...,month_8,month_9,month_10,month_11,month_12,temp,atemp,hum,windspeed,cnt
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.84,14.395,81.0,0.0,16.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.02,13.635,80.0,0.0,40.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.02,13.635,80.0,0.0,32.0


Unnamed: 0,weathersit_1,weathersit_2,weathersit_3,weathersit_4,hr_0,hr_1,hr_2,hr_3,hr_4,hr_5,...,month_8,month_9,month_10,month_11,month_12,temp,atemp,hum,windspeed,cnt
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,34.44,39.395,49.0,23.9994,299.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,34.44,38.635,47.0,19.0012,314.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,35.26,40.15,47.0,19.9995,326.0


# 保存未归一化的数据

In [24]:
# Take a copy rather than an alias, so that changing testSet later (the
# standardisation step assigns into its columns) does not also rewrite
# testSet_origin2.
testSet = testSet_origin2.copy()

In [25]:
trainSet.shape, validateSet.shape, testSet.shape

# NOTE: the "\\" separators make these paths Windows-specific. They are kept
# unchanged because the read-back cell below builds its path from dirPath in
# the same way.
dirPath = "data preprocessed\\unnormalized\\remove-feature-[day]"
# exist_ok=True replaces the separate os.path.exists() check and also copes
# with the directory appearing between the check and the creation.
os.makedirs(dirPath, exist_ok=True)

# The files are written without index and without header row.
trainSet.to_csv(dirPath + '\\train.csv', index=False, header=False)
validateSet.to_csv(dirPath + '\\validate.csv', index=False, header=False)
testSet.to_csv(dirPath + '\\test.csv', index=False, header=False)

((13625, 45), (2990, 45), (742, 44))

# 测试数据读取

In [26]:
# Read the saved test file back with NumPy and show its shape, dtype and
# first row.
t = np.loadtxt(dirPath + '\\test.csv', delimiter=",")
t.shape, t.dtype
t[:1]

((742, 44), dtype('float64'))

array([[  1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   1.  ,
         10.66,  15.15,  81.  ,   0.  ]])

# 标准化连续特征

In [27]:
def normalizeFeature(train, validate, test, continousFeature):
    """Standardise the given columns using statistics of the training frame.

    For every column name in ``continousFeature`` the mean and standard
    deviation (``Series.std()`` with its default settings) are computed on
    ``train`` only and used to rescale that column in all three frames as
    ``(x - mean) / std``.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Frames that each contain every column named in ``continousFeature``.
        They are not modified; standardised copies are returned.
    continousFeature : iterable of str
        Names of the columns to standardise.

    Returns
    -------
    tuple
        ``(train, validate, test, means_, stds_)``. ``means_`` and ``stds_``
        are lists with one single-entry dict per feature, in the order of
        ``continousFeature``, mapping the column's position in ``train`` to
        its mean / standard deviation.
    """
    # Work on copies so the caller's frames are not rewritten in place.
    train, validate, test = train.copy(), validate.copy(), test.copy()
    # Column positions are taken from the `train` argument. This used to read
    # the notebook-level `trainSet` variable, which tied the function to one
    # particular global frame.
    trainColumns = list(train.columns)

    means_, stds_ = [], []
    for feature in continousFeature:
        mean_, std_ = train[feature].mean(), train[feature].std()
        featureIndex = trainColumns.index(feature)
        means_.append({featureIndex:mean_})
        stds_.append({featureIndex:std_})

        train[feature] = (train[feature] - mean_)/std_
        validate[feature] = (validate[feature] - mean_)/std_
        # The explicit cast gives a float64 column even if the test column was
        # stored with object dtype.
        test[feature] = ((test[feature] - mean_)/std_).astype("float64")

    return train, validate, test, means_, stds_
        
    
# Columns to standardise; the statistics come from the training set only.
continousFeatures = ["temp", "hum", "windspeed", "atemp"]
continousFeatures
trainSet, validateSet, testSet, means, stds = normalizeFeature(trainSet, validateSet, testSet, continousFeatures)

# Preview the standardised columns of each frame.
trainSet[continousFeatures].head(3)
validateSet[continousFeatures].head(3)
testSet[continousFeatures].head(3)

['temp', 'hum', 'windspeed', 'atemp']

Unnamed: 0,temp,hum,windspeed,atemp
0,-1.296485,0.94185,-1.57558,-1.060552
1,-1.398401,0.89161,-1.57558,-1.147282
2,-1.398401,0.89161,-1.57558,-1.147282


Unnamed: 0,temp,hum,windspeed,atemp
0,1.760997,-0.665811,1.327879,1.792408
1,1.760997,-0.76629,0.723194,1.705678
2,1.862913,-0.76629,0.843969,1.878567


Unnamed: 0,temp,hum,windspeed,atemp
0,-1.194568,0.94185,-1.57558,-0.974393
1,-1.194568,0.94185,-1.57558,-0.974393
2,-1.194568,0.94185,-1.57558,-0.974393


In [28]:
# Per-feature training mean and standard deviation, keyed by column position.
means,stds

([{40: 20.27130275229354},
  {42: 62.25277064220184},
  {43: 13.023422627523388},
  {41: 23.688439633027492}],
 [{40: 8.045836616515906},
  {42: 19.9046896960437},
  {43: 8.265797922320854},
  {41: 8.762828750866511}])

In [29]:
# Summary statistics of the standardised columns in each of the three frames.
trainSet[continousFeatures].describe()
validateSet[continousFeatures].describe()
testSet[continousFeatures].describe()

Unnamed: 0,temp,hum,windspeed,atemp
count,13625.0,13625.0,13625.0,13625.0
mean,-1.56779e-15,-3.824871e-16,-5.322604e-14,2.226834e-15
std,1.0,1.0,1.0,1.0
min,-2.417561,-3.127543,-1.57558,-2.703287
25%,-0.7869042,-0.8165297,-0.7285349,-0.8015037
50%,0.0284243,-0.01269905,-0.003075641,0.06294319
75%,0.8437528,0.841371,0.7231942,0.8412307
max,2.576326,1.896399,5.319931,3.002633


Unnamed: 0,temp,hum,windspeed,atemp
count,2990.0,2990.0,2990.0,2990.0
mean,0.292622,0.06303,-0.183782,0.280813
std,0.842672,0.823191,0.9179,0.836878
min,-1.500317,-2.323712,-1.57558,-1.579221
25%,-0.37924,-0.615572,-0.728535,-0.368995
50%,0.436089,0.138019,-0.244625,0.322563
75%,0.945669,0.740892,0.480834,0.927961
max,2.066746,1.896399,3.626653,2.138186


Unnamed: 0,temp,hum,windspeed,atemp
count,742.0,742.0,742.0,742.0
mean,-0.873299,0.302211,-0.073226,-0.862759
std,0.429266,0.891361,1.065969,0.486872
min,-1.806065,-1.570121,-1.57558,-1.83884
25%,-1.194568,-0.464854,-0.728535,-1.233442
50%,-0.88882,0.238498,-0.244625,-0.887663
75%,-0.583072,1.243286,0.480834,-0.455725
max,0.538005,1.896399,3.747427,0.841231


# 保存归一化的数据

In [30]:
trainSet.shape, validateSet.shape, testSet.shape

# NOTE: the "\\" separators make these paths Windows-specific. They are kept
# unchanged because the read-back cell below builds its path from dirPath in
# the same way.
dirPath = "data preprocessed\\normalized\\remove-feature-[day]"
# exist_ok=True replaces the separate os.path.exists() check and also copes
# with the directory appearing between the check and the creation.
os.makedirs(dirPath, exist_ok=True)

# The files are written without index and without header row.
trainSet.to_csv(dirPath + '\\train.csv', index=False, header=False)
validateSet.to_csv(dirPath + '\\validate.csv', index=False, header=False)
testSet.to_csv(dirPath + '\\test.csv', index=False, header=False)

((13625, 45), (2990, 45), (742, 44))

# 测试数据读取

In [31]:
# Read the saved standardised test file back with NumPy and show its shape,
# dtype and first row.
t = np.loadtxt(dirPath + '\\test.csv', delimiter=",")
t.shape, t.dtype
t[:1]

((742, 44), dtype('float64'))

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        -1.19456847, -0.97439307,  0.94184987, -1.57557961]])