In [1]:
# Make every top-level expression in a cell display its value, not only the
# last one (the cells below rely on this to show several results at once).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os
from tqdm import tqdm, tnrange, tqdm_notebook

# Short alias for the pandas DataFrame class.
DF = pd.DataFrame

# 读取训练集和测试集

In [2]:
# Column labels C0..C13 applied to the raw CSV columns.
columnsName = ["C{}".format(idx) for idx in range(14)]
trainSet_origin = pd.read_csv(
    'data\\train.csv',
    names=columnsName,
)
trainSet_origin.shape
trainSet_origin.head()

(48000, 14)

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,3003,type_C,15,867,204,2961,219,207,120.0,740.0,1.0,3.0,2.665508,1
1,3138,type_D,10,43,2,5583,196,243,187.0,2401.0,1.0,3.0,2.358066,1
2,2789,type_C,13,574,4,1650,234,213,110.0,1021.0,1.0,3.0,2.06706,1
3,2824,type_C,33,336,237,159,29,2260,1.0,0.0,0.0,0.0,2.698755,1
4,3264,type_D,11,515,39,1200,239,226,117.0,2206.0,1.0,3.0,2.513698,1


In [3]:
# The test file is read with every column label except the last one (C13).
testSet_origin = pd.read_csv(
    'data\\test.csv',
    names=columnsName[:-1],
)
testSet_origin.shape
testSet_origin.head()

(12000, 13)

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12
0,2883,type_C,14,475,71,1718,244,223,106.0,2851.0,1.0,3.0,2.288722
1,2546,type_B,9,31,2,1457,218,223,142.0,1290.0,1.0,3.0,2.053396
2,3178,type_D,15,286,35,2999,230,208,110.0,4503.0,1.0,3.0,2.648367
3,3282,type_D,23,451,116,1679,248,228,96.0,1404.0,1.0,3.0,2.985833
4,2954,type_C,23,525,304,1828,160,246,221.0,1623.0,1.0,3.0,2.00947


# 对离散变量进行0-1编码

数据集共有三个离散特征。

In [4]:
# Distinct values of the three discrete features in the training set.
pd.unique(trainSet_origin["C1"])   # needs one-hot encoding
pd.unique(trainSet_origin["C10"])  # already 0/1, left unchanged
pd.unique(trainSet_origin["C11"])  # 3 will be replaced by 1

# Same inspection for the test set.
pd.unique(testSet_origin["C1"])   # needs one-hot encoding
pd.unique(testSet_origin["C10"])  # already 0/1, left unchanged
pd.unique(testSet_origin["C11"])  # 3 will be replaced by 1

array(['type_C', 'type_D', 'type_A', 'type_B', 'type_E'], dtype=object)

array([ 1.,  0.])

array([ 3.,  0.])

array(['type_C', 'type_B', 'type_D', 'type_A', 'type_E'], dtype=object)

array([ 1.,  0.])

array([ 3.,  0.])

In [5]:
# Recode C11 so that the value 3 becomes 1, then show the remaining distinct values.
trainSet_origin["C11"] = trainSet_origin["C11"].replace(to_replace=3, value=1)
trainSet_origin["C11"].unique()

# Apply the identical recoding to the test set.
testSet_origin["C11"] = testSet_origin["C11"].replace(to_replace=3, value=1)
testSet_origin["C11"].unique()

array([ 1.,  0.])

array([ 1.,  0.])

In [6]:
################ NOTE: this cell reassigns trainSet_origin / testSet_origin ########################
def reduceDummies(dataSet):
    """One-hot encode the "C1" column and drop the original column.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Frame containing a categorical column named "C1".

    Returns
    -------
    pandas.DataFrame
        A new frame whose leading columns are the "C1_<value>" indicator
        columns (float 0.0 / 1.0), followed by the remaining columns of
        ``dataSet`` in their original order, without "C1". The input frame
        is not modified.

        If "C1" is absent but "C1_*" columns are already present (i.e. the
        frame was encoded by a previous run of the cell), ``dataSet`` is
        returned unchanged so that re-running the cell is harmless.

    Raises
    ------
    KeyError
        If the frame has neither a "C1" column nor any "C1_*" column.
    """
    columnNames = [str(col) for col in dataSet.columns]
    alreadyEncoded = "C1" not in columnNames and any(
        name.startswith("C1_") for name in columnNames
    )
    if alreadyEncoded:
        return dataSet

    # Build the indicator columns. dtype=float keeps them numeric (0.0 / 1.0);
    # the pandas default dtype (bool in recent versions) would be written to
    # CSV as True/False text, which a numeric reader cannot parse.
    dummies = pd.get_dummies(dataSet["C1"], prefix="C1", drop_first=False, dtype=float)
    # Put the indicator columns in front of the original columns.
    encoded = pd.concat([dummies, dataSet], axis=1)
    return encoded.drop(["C1"], axis=1)

# Replace C1 with its indicator columns in the training set and preview the result.
trainSet_origin = reduceDummies(dataSet=trainSet_origin)
trainSet_origin.head()

# Same encoding for the test set.
testSet_origin = reduceDummies(dataSet=testSet_origin)
testSet_origin.head()

Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,0.0,0.0,1.0,0.0,0.0,3003,15,867,204,2961,219,207,120.0,740.0,1.0,1.0,2.665508,1
1,0.0,0.0,0.0,1.0,0.0,3138,10,43,2,5583,196,243,187.0,2401.0,1.0,1.0,2.358066,1
2,0.0,0.0,1.0,0.0,0.0,2789,13,574,4,1650,234,213,110.0,1021.0,1.0,1.0,2.06706,1
3,0.0,0.0,1.0,0.0,0.0,2824,33,336,237,159,29,2260,1.0,0.0,0.0,0.0,2.698755,1
4,0.0,0.0,0.0,1.0,0.0,3264,11,515,39,1200,239,226,117.0,2206.0,1.0,1.0,2.513698,1


Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12
0,0.0,0.0,1.0,0.0,0.0,2883,14,475,71,1718,244,223,106.0,2851.0,1.0,1.0,2.288722
1,0.0,1.0,0.0,0.0,0.0,2546,9,31,2,1457,218,223,142.0,1290.0,1.0,1.0,2.053396
2,0.0,0.0,0.0,1.0,0.0,3178,15,286,35,2999,230,208,110.0,4503.0,1.0,1.0,2.648367
3,0.0,0.0,0.0,1.0,0.0,3282,23,451,116,1679,248,228,96.0,1404.0,1.0,1.0,2.985833
4,0.0,0.0,1.0,0.0,0.0,2954,23,525,304,1828,160,246,221.0,1623.0,1.0,1.0,2.00947


# 从训练集中划分验证集

**需要注意，划分验证集需在归一化连续特征之前完成。**

突发奇想：若是后来利用验证集调好了参数，在判断增加样本个数可提高算法performance的前提下，可使用整个原始训练集来进行训练。不过这个时候就没有验证集了，到时候权衡一下。

In [7]:
# Shuffle the original training set before splitting off the validation set.
# A fixed random_state makes the permutation (and therefore the train/validation
# split written to disk later) identical on every Restart & Run All.
trainSet_origin = trainSet_origin.sample(frac=1, random_state=42).reset_index(drop=True)
trainSet_origin.shape

(48000, 18)

In [8]:
# Fraction of the shuffled training data held out for validation.
splitRate = 0.3
# Number of held-out rows (fractional part truncated by int()).
splitNum = int(splitRate * len(trainSet_origin))
# The last splitNum rows form the validation set; all rows before them
# form the training set. Both get a fresh 0-based index.
trainSet = trainSet_origin.iloc[:-splitNum].reset_index(drop=True)
validateSet = trainSet_origin.iloc[-splitNum:].reset_index(drop=True)

trainSet.shape, validateSet.shape
trainSet.head(n=3)
validateSet.head(n=3)

((33600, 18), (14400, 18))

Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,1.0,0.0,0.0,0.0,0.0,2039,25,43,20,107,144,218,212.0,637.0,1.0,1.0,2.039711,0
1,0.0,0.0,1.0,0.0,0.0,2728,20,121,34,1553,240,239,120.0,2417.0,1.0,1.0,2.672326,1
2,0.0,0.0,1.0,0.0,0.0,2858,15,451,87,2476,197,213,154.0,1610.0,1.0,1.0,2.069699,1


Unnamed: 0,C1_type_A,C1_type_B,C1_type_C,C1_type_D,C1_type_E,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13
0,0.0,0.0,0.0,1.0,0.0,3085,10,1349,286,3068,223,219,131.0,1220.0,1.0,1.0,2.915876,1
1,0.0,0.0,0.0,1.0,0.0,3208,15,367,112,3158,205,254,184.0,844.0,1.0,1.0,2.732953,0
2,0.0,0.0,1.0,0.0,0.0,2736,11,31,-10,1424,238,222,114.0,2376.0,1.0,1.0,2.791938,0


# 保存未归一化的数据

In [9]:
trainSet.shape, validateSet.shape, testSet_origin.shape

# Output directory for the data sets before any normalization.
dirPath = "data preprocessed\\unnormalized"
# os.makedirs creates the directory together with any missing parents;
# exist_ok=True makes the cell safe to re-run when it already exists.
os.makedirs(dirPath, exist_ok=True)

# Write plain numeric CSVs: no index column and no header row.
trainSet.to_csv(dirPath + '\\train.csv', index=False, header=False)
validateSet.to_csv(dirPath + '\\validate.csv', index=False, header=False)
testSet_origin.to_csv(dirPath + '\\test.csv', index=False, header=False)

((33600, 18), (14400, 18), (12000, 17))

# 对连续特征进行PCA降维

In [10]:
# Names of the continuous feature columns: C0, C2..C9 and C12.
continousFeatures = ["C0"] + ["C{}".format(k) for k in range(2, 10)] + ["C12"]
continousFeatures

['C0', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C12']

In [14]:
# Every column of the training frame, in order.
allFeatures = trainSet.columns.tolist()
# Whatever is not a continuous feature is treated as discrete.
discreteFeatures = [name for name in allFeatures if name not in continousFeatures]
discreteFeatures  # note: C13 is the label

['C1_type_A',
 'C1_type_B',
 'C1_type_C',
 'C1_type_D',
 'C1_type_E',
 'C10',
 'C11',
 'C13']

In [15]:
# Keep only the discrete columns (all rows) of the training set.
discreteData = trainSet.loc[:, discreteFeatures]
discreteData.shape

(33600, 8)

# 测试数据读取

In [186]:
# Read the saved training CSV back as a numeric array to check that it loads.
# NOTE(review): the recorded output shows standardized values although dirPath
# is set to the unnormalized directory in this file; presumably dirPath pointed
# at a different directory when this cell was executed - confirm.
t = np.loadtxt(fname=dirPath + '\\train.csv', delimiter=",")
t.shape, t.dtype
t[0:3]

((33600, 18), dtype('float64'))

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -2.97156701, -0.03864404, -0.12375666, -0.26388139, -1.06551351,
        -0.33827607, -0.17353922, -0.08466038, -0.83072648,  1.        ,
         1.        , -1.32014105,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.39137413, -0.08399725,  0.17147297, -0.10507631,  0.59021486,
        -0.29140791, -0.1090904 , -0.02395491,  0.01650887,  1.        ,
         1.        , -1.22164759,  0.        ],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        -0.40131951, -0.14068876, -0.12740147, -0.16438905,  0.79341789,
         0.31787819, -0.12686939, -0.14536585,  3.30018111,  1.        ,
         1.        ,  1.15962009,  1.        ]])

# 参考资料

1. [Shuffle DataFrame rows][1]

[1]:https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows