In [4]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# The later cells rely on this to show several shapes/heads from a single cell.
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os
from tqdm import tqdm, tnrange, tqdm_notebook

# Short alias for the DataFrame class.
DF = pd.DataFrame

# 读取训练集和测试集

In [5]:
# Column labels C0 .. C13; they are passed via names=, so the first CSV row is read as data.
columnsName = ["C{}".format(idx) for idx in range(14)]
trainSet_origin = pd.read_csv('data\\train.csv', names=columnsName)
# Quick look at the size and the first rows of the raw training data.
trainSet_origin.shape
trainSet_origin.head(5)

(48000, 14)

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,3003,type_C,15,867,204,2961,219,207,120.0,740.0,1.0,3.0,2.665508,1
1,3138,type_D,10,43,2,5583,196,243,187.0,2401.0,1.0,3.0,2.358066,1
2,2789,type_C,13,574,4,1650,234,213,110.0,1021.0,1.0,3.0,2.06706,1
3,2824,type_C,33,336,237,159,29,2260,1.0,0.0,0.0,0.0,2.698755,1
4,3264,type_D,11,515,39,1200,239,226,117.0,2206.0,1.0,3.0,2.513698,1


In [6]:
# The test CSV is read with all column labels except the last one (C13).
testSet_origin = pd.read_csv('data\\test.csv',  names = columnsName[:-1])
# Quick look at the size and the first rows of the raw test data.
testSet_origin.shape
testSet_origin.head(5)

(12000, 13)

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12
0,2883,type_C,14,475,71,1718,244,223,106.0,2851.0,1.0,3.0,2.288722
1,2546,type_B,9,31,2,1457,218,223,142.0,1290.0,1.0,3.0,2.053396
2,3178,type_D,15,286,35,2999,230,208,110.0,4503.0,1.0,3.0,2.648367
3,3282,type_D,23,451,116,1679,248,228,96.0,1404.0,1.0,3.0,2.985833
4,2954,type_C,23,525,304,1828,160,246,221.0,1623.0,1.0,3.0,2.00947


# 对离散变量进行0-1编码

数据集共有三个离散特征。

In [7]:
# Distinct values of the three discrete columns in the training set.
trainSet_origin["C1"].unique()  # needs to be encoded
trainSet_origin["C10"].unique() # no change needed
trainSet_origin["C11"].unique() # replace 3 with 1

# Same inspection for the test set.
testSet_origin["C1"].unique()  # needs to be encoded
testSet_origin["C10"].unique() # no change needed
testSet_origin["C11"].unique() # replace 3 with 1

array(['type_C', 'type_D', 'type_A', 'type_B', 'type_E'], dtype=object)

array([ 1.,  0.])

array([ 3.,  0.])

array(['type_C', 'type_B', 'type_D', 'type_A', 'type_E'], dtype=object)

array([ 1.,  0.])

array([ 3.,  0.])

In [8]:
# Recode C11 so that the value 3 becomes 1, then show the remaining distinct values.
trainSet_origin["C11"] = trainSet_origin["C11"].replace({3: 1})
trainSet_origin["C11"].unique()

# Apply the identical recoding to the test set.
testSet_origin["C11"] = testSet_origin["C11"].replace({3: 1})
testSet_origin["C11"].unique()

array([ 1.,  0.])

array([ 1.,  0.])

In [9]:
################ NOTE: this cell must only be run once ########################
def reduceDummies(dataSet):
    """Replace the categorical column "C1" with one indicator column per category.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Frame containing a column named "C1". It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first columns are the "C1_<category>" indicators
        (float 0.0 / 1.0), followed by the original columns without "C1".

    Raises
    ------
    KeyError
        If dataSet has no "C1" column (e.g. when it was already encoded).
    """
    # Build the dummy (indicator) variables. The dtype is pinned to float because
    # the default differs between pandas versions (bool in pandas >= 2.0); bool
    # columns would be written to CSV as True/False and could not be parsed back
    # as numbers.
    dummies = pd.get_dummies(dataSet["C1"], prefix="C1", drop_first=False).astype(float)
    # Put the indicator columns in front of the original ones.
    encoded = pd.concat([dummies, dataSet], axis=1)
    # The raw categorical column is no longer needed.
    return encoded.drop(["C1"], axis=1)

# Encode C1 in the training set. The name is rebound to the encoded frame, which no
# longer has a "C1" column, so running this cell a second time fails.
trainSet_origin = reduceDummies(trainSet_origin)
trainSet_origin.head(5)

# Same encoding for the test set.
testSet_origin = reduceDummies(testSet_origin)
testSet_origin.head(5)

Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,0.0,0.0,1.0,0.0,0.0,3003,15,867,204,2961,219,207,120.0,740.0,1.0,1.0,2.665508,1
1,0.0,0.0,0.0,1.0,0.0,3138,10,43,2,5583,196,243,187.0,2401.0,1.0,1.0,2.358066,1
2,0.0,0.0,1.0,0.0,0.0,2789,13,574,4,1650,234,213,110.0,1021.0,1.0,1.0,2.06706,1
3,0.0,0.0,1.0,0.0,0.0,2824,33,336,237,159,29,2260,1.0,0.0,0.0,0.0,2.698755,1
4,0.0,0.0,0.0,1.0,0.0,3264,11,515,39,1200,239,226,117.0,2206.0,1.0,1.0,2.513698,1


Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12
0,0.0,0.0,1.0,0.0,0.0,2883,14,475,71,1718,244,223,106.0,2851.0,1.0,1.0,2.288722
1,0.0,1.0,0.0,0.0,0.0,2546,9,31,2,1457,218,223,142.0,1290.0,1.0,1.0,2.053396
2,0.0,0.0,0.0,1.0,0.0,3178,15,286,35,2999,230,208,110.0,4503.0,1.0,1.0,2.648367
3,0.0,0.0,0.0,1.0,0.0,3282,23,451,116,1679,248,228,96.0,1404.0,1.0,1.0,2.985833
4,0.0,0.0,1.0,0.0,0.0,2954,23,525,304,1828,160,246,221.0,1623.0,1.0,1.0,2.00947


# 从训练集中划分验证集

**需要注意：划分验证集需在标准化连续特征之前完成，这样均值和标准差只由训练集计算，避免验证集的信息泄漏到训练过程中。**

突发奇想：若是后来利用验证集调好了参数，在判断增加样本个数可提高算法performance的前提下，可使用整个原始训练集来进行训练。不过这个时候就没有验证集了，到时候权衡一下。

In [10]:
#首先打乱原始训练集
trainSet_origin = trainSet_origin.sample(frac=1).reset_index(drop=True)
trainSet_origin.shape

(48000, 18)

In [11]:
#划分比例
splitRate = 0.3
#划分的数目
splitNum = int(trainSet_origin.shape[0]*splitRate) 
#得到 训练集 和验证集
trainSet = trainSet_origin[:-splitNum].reset_index(drop=True)
validateSet = trainSet_origin[-splitNum:].reset_index(drop=True)

trainSet.shape, validateSet.shape
trainSet.head(3)
validateSet.head(3)

((33600, 18), (14400, 18))

Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,0.0,0.0,0.0,1.0,0.0,3193,14,284,52,3689,184,231,187.0,1910.0,1.0,1.0,2.342163,0
1,0.0,0.0,1.0,0.0,0.0,2928,21,163,67,2820,173,251,213.0,2209.0,1.0,1.0,2.243512,1
2,0.0,0.0,0.0,1.0,0.0,3140,26,151,48,3587,229,242,129.0,725.0,1.0,1.0,2.800173,1


Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,0.0,0.0,1.0,0.0,0.0,3064,19,928,248,220,94,1711,1.0,0.0,0.0,0.0,2.256256,1
1,0.0,0.0,1.0,0.0,0.0,2887,11,213,0,1761,230,246,148.0,1513.0,1.0,1.0,2.743451,1
2,0.0,0.0,0.0,1.0,0.0,3157,20,286,50,209,162,231,209.0,798.0,1.0,1.0,2.95134,0


# 保存未归一化的数据

In [15]:
trainSet.shape, validateSet.shape, testSet_origin.shape

# Output folder for the encoded but not yet normalized data.
dirPath = "data preprocessed\\unnormalized"
# makedirs (not mkdir) so that the missing parent folder "data preprocessed" is
# created as well; mkdir raises FileNotFoundError on a fresh checkout.
if not os.path.exists(dirPath):
    os.makedirs(dirPath)

# Written without index and without header row, i.e. as plain numeric tables.
trainSet.to_csv(dirPath + '\\train.csv', index=False, header=False)
validateSet.to_csv(dirPath + '\\validate.csv', index=False, header=False)
testSet_origin.to_csv(dirPath + '\\test.csv', index=False, header=False)

((33600, 18), (14400, 18), (12000, 17))

# 标准化连续特征

In [181]:
def normalizeFeature(train, validate, test, features=None):
    """Standardize the continuous columns using statistics of the training set.

    For every feature, the mean and (sample) standard deviation are computed on
    ``train`` only and then applied to ``train``, ``validate`` and ``test`` as
    ``(x - mean) / std``.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Frames that each contain all columns named in ``features``.
        They are modified in place and also returned.
    features : list of str, optional
        Column names to standardize. Defaults to the notebook-level
        ``continousFeatures`` list.

    Returns
    -------
    tuple
        ``(train, validate, test, means_, stds_)`` where ``means_`` and ``stds_``
        are lists of one-entry dicts ``{column position in train: statistic}``,
        in the order of ``features``.
    """
    if features is None:
        # The original body iterated over the undefined name `continousFeature`;
        # the list defined in the notebook is `continousFeatures`.
        features = continousFeatures

    # Column positions are taken from the `train` argument rather than from the
    # global `trainSet`, so the function works for any frame passed in.
    columns = list(train.columns)

    means_, stds_ = [], []
    for feature in features:
        mean_, std_ = train[feature].mean(), train[feature].std()
        featureIndex = columns.index(feature)
        means_.append({featureIndex: mean_})
        stds_.append({featureIndex: std_})

        # The same training-set statistics are applied to all three sets.
        train[feature] = (train[feature] - mean_) / std_
        validate[feature] = (validate[feature] - mean_) / std_
        test[feature] = (test[feature] - mean_) / std_

    return train, validate, test, means_, stds_
        
    
# Continuous columns: every C0..C12 except C1, C10 and C11.
continousFeatures = ["C{}".format(idx) for idx in range(13) if idx not in (1, 10, 11)]
continousFeatures
# Normalize the three sets; the test input is testSet_origin, the result is bound to testSet.
trainSet, validateSet, testSet, means, stds = normalizeFeature(
    trainSet, validateSet, testSet_origin
)
trainSet.head(3)
validateSet.head(3)

['C0', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C12']

Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,1.0,0.0,0.0,0.0,0.0,-2.971567,-0.038644,-0.123757,-0.263881,-1.065514,-0.338276,-0.173539,-0.08466,-0.830726,1.0,1.0,-1.320141,0
1,0.0,0.0,0.0,1.0,0.0,1.391374,-0.083997,0.171473,-0.105076,0.590215,-0.291408,-0.10909,-0.023955,0.016509,1.0,1.0,-1.221648,0
2,0.0,0.0,1.0,0.0,0.0,-0.40132,-0.140689,-0.127401,-0.164389,0.793418,0.317878,-0.126869,-0.145366,3.300181,1.0,1.0,1.15962,1


Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,0.0,1.0,0.0,0.0,0.0,-1.182473,-0.12935,-0.333333,-0.269621,-0.748166,0.317878,-0.124647,-0.143118,-0.819837,1.0,1.0,-1.526157,0
1,0.0,0.0,1.0,0.0,0.0,0.235843,0.120092,0.765577,-0.37868,1.179755,-0.728844,-0.093534,0.021012,-0.421992,1.0,1.0,-1.546969,1
2,0.0,0.0,1.0,0.0,0.0,-0.678503,-0.049982,-0.420809,-0.240922,-0.378763,0.44286,-0.175762,-0.22181,0.139202,1.0,1.0,-1.268598,1


验证标准化的结果：可看到训练集标准化后各连续特征的均值接近 0，标准差为 1；验证集和测试集使用的是训练集的均值与标准差，因此只是近似满足。

In [183]:
# Statistics returned by normalizeFeature: two lists of {column position: value} dicts.
means,stds

([{5: 2961.484255952381},
  {6: 18.40827380952381},
  {7: 365.9084226190476},
  {8: 133.91848214285713},
  {9: 2181.923392857143},
  {10: 212.65282738095237},
  {11: 294.08735119047617},
  {12: 201.65443452380953},
  {13: 1828.2603273809523},
  {16: 2.5002336610946587}],
 [{5: 277.7942586850557},
  {6: 88.19662556049181},
  {7: 548.7254071066931},
  {8: 522.6533148333067},
  {9: 1594.4644302132433},
  {10: 64.00933768067485},
  {11: 449.96946950492224},
  {12: 444.7704396312076},
  {13: 1377.4212770402787},
  {16: 0.28841626123617103}])

In [184]:
# Summary statistics of the continuous columns after normalization, one table per set.
trainSet[continousFeatures].describe()
validateSet[continousFeatures].describe()
testSet[continousFeatures].describe()

Unnamed: 0,C0,C2,C3,C4,C5,C6,C7,C8,C9,C12
count,33600.0,33600.0,33600.0,33600.0,33600.0,33600.0,33600.0,33600.0,33600.0,33600.0
mean,3.272911e-16,2.2116900000000003e-17,-1.998344e-16,4.711179e-17,-7.994927e-17,1.102689e-16,9.717590000000001e-17,4.630349e-17,4.740586e-17,-4.391147e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.864314,-0.1973803,-0.7889345,-0.571925,-1.350249,-3.275348,-0.649127,-0.4533899,-1.327307,-1.734235
25%,-0.5345116,-0.1066739,-0.4390327,-0.2351817,-0.8036701,-0.2445397,-0.1824287,-0.1948296,-0.7109374,-0.8676936
50%,0.1314489,-0.06132064,-0.2239889,-0.1854355,-0.223538,0.0991601,-0.1490931,-0.1341241,-0.1700717,-0.003234596
75%,0.7227138,0.00670917,0.1003992,-0.0897698,0.6222005,0.3178782,-0.1202023,-0.06892192,0.4629954,0.8699742
max,3.220066,61.64172,11.61618,13.05661,3.091368,79.35322,15.04305,14.90509,3.833787,1.732755


Unnamed: 0,C0,C2,C3,C4,C5,C6,C7,C8,C9,C12
count,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0
mean,-0.00877,-0.008639,0.007068,0.006372,0.008602,-0.001839,0.006296,7e-05,-0.010498,0.003518
std,1.011991,0.918315,1.015094,1.025485,1.008987,1.033431,1.017774,0.97565,0.987218,1.002248
min,-3.885913,-0.19738,-0.726973,-0.547052,-1.350249,-2.54108,-0.649127,-0.451142,-1.327307,-1.734325
25%,-0.541711,-0.106674,-0.439033,-0.233268,-0.799593,-0.24454,-0.182429,-0.19483,-0.705129,-0.860737
50%,0.12065,-0.061321,-0.223989,-0.183522,-0.210054,0.083537,-0.149093,-0.136372,-0.17225,0.002643
75%,0.721814,0.006709,0.107689,-0.08977,0.628472,0.317878,-0.120202,-0.068922,0.449746,0.873799
max,3.202067,58.024802,12.03533,13.039392,2.986004,64.730355,13.978532,14.280053,3.855567,1.732723


Unnamed: 0,C0,C2,C3,C4,C5,C6,C7,C8,C9,C12
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,-0.020928,0.000127,-0.009228,0.003292,0.00315,0.019763,-0.002414,-0.000976,-0.005448,-0.018701
std,1.00667,0.845019,0.956645,1.011605,1.00545,1.659503,0.999367,0.976953,0.989357,0.997275
min,-3.893112,-0.231395,-0.688702,-0.552792,-1.350249,-3.275348,-0.649127,-0.45339,-1.327307,-1.734397
25%,-0.570509,-0.106674,-0.439033,-0.235182,-0.799593,-0.24454,-0.182429,-0.19483,-0.714567,-0.881846
50%,0.11345,-0.061321,-0.223989,-0.185436,-0.24204,0.09916,-0.149093,-0.136372,-0.170072,-0.031567
75%,0.719114,0.006709,0.100399,-0.08977,0.63788,0.333501,-0.120202,-0.066674,0.464447,0.835479
max,3.169669,44.645605,11.765614,13.136971,3.03994,93.92922,15.085274,15.033251,3.765543,1.73278


# 保存归一化的数据

In [185]:
trainSet.shape, validateSet.shape, testSet.shape

# Output folder for the normalized data; created (including parents) when absent.
dirPath = "data preprocessed\\normalized"
if not os.path.exists(dirPath):
    os.makedirs(dirPath)

# Write each set without index and without header row.
for frame, fileName in (
    (trainSet, '\\train.csv'),
    (validateSet, '\\validate.csv'),
    (testSet, '\\test.csv'),
):
    frame.to_csv(dirPath + fileName, index=False, header=False)

((33600, 18), (14400, 18), (12000, 17))

# 测试数据读取

In [186]:
# Read the saved normalized training CSV back with NumPy to check that it parses.
t = np.loadtxt(dirPath + '\\train.csv', delimiter=",")
# Shape and dtype of the loaded array, then its first three rows.
t.shape, t.dtype
t[:3]

((33600, 18), dtype('float64'))

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -2.97156701, -0.03864404, -0.12375666, -0.26388139, -1.06551351,
        -0.33827607, -0.17353922, -0.08466038, -0.83072648,  1.        ,
         1.        , -1.32014105,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.39137413, -0.08399725,  0.17147297, -0.10507631,  0.59021486,
        -0.29140791, -0.1090904 , -0.02395491,  0.01650887,  1.        ,
         1.        , -1.22164759,  0.        ],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        -0.40131951, -0.14068876, -0.12740147, -0.16438905,  0.79341789,
         0.31787819, -0.12686939, -0.14536585,  3.30018111,  1.        ,
         1.        ,  1.15962009,  1.        ]])

# 参考资料

1. [Shuffle DataFrame rows][1]

[1]:https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows