# 数据集统计信息
- 1. 查看数据是否存在缺失值
- 2. 查看都存在哪些特征, 特征的类型是什么
- 3. 对label进行统计, 观察样本的不平衡度或者是否存在长尾效应
- 4. 观察训练集与测试集各个特征上的分布差异

In [1]:
import pickle
import pandas as pd
# Display up to 200 columns when a DataFrame is rendered.
# The fully qualified option key is required: on pandas >= 1.4 the bare
# 'max_columns' pattern also matches 'styler.render.max_columns', and
# set_option raises OptionError when a pattern matches more than one key.
pd.set_option('display.max_columns', 200)
from utils import submit, evaluation

def _load_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The loaded objects are presumably pandas DataFrames (later cells call
# .shape / .columns / .head on them) -- confirm against the script that wrote them.
df_train = _load_pickle(r'./data/train_T.pkl')
df_test = _load_pickle(r'./data/test_T.pkl')

In [2]:
# Sum of the 'label' column divided by the number of rows.
# This is the positive-sample rate assuming labels are encoded as 0/1 -- TODO confirm.
n_samples = len(df_train)
pos_rate = sum(df_train['label']) / n_samples
print('The positive rate is %.5f, the negative rate is %.5f' % (pos_rate, 1 - pos_rate))

The positive rate is 0.06893, the negative rate is 0.93107


## 训练集与测试集在各个特征上分布的差异
使用比率, 画图来进行比较

- 类别特征: 使用柱状图进行比较. 横轴为类别, 纵轴为比率.
- 连续特征: 使用分布图进行比较. 横轴为数值大小, 纵轴为比率. 用两种不同的颜色进行表示.

In [3]:
# Columns treated as categorical features.
cate_feat = ['XINGBIE', 'HYZK', 'ZHIYE', 'ZHICHEN', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT']

# Every feature column: all columns except the row identifier and the target.
all_feat = [col for col in df_train.columns if col not in ('id', 'label')]

# Remaining (non-categorical) features, in their original column order.
# Built as a separate list: the previous `cont_feat = all_feat` bound both names
# to the same list object, so removing the categorical columns from cont_feat
# silently removed them from all_feat as well.
cont_feat = [col for col in all_feat if col not in cate_feat]
cont_feat

['ZHIWU',
 'GRJCJS',
 'GRZHYE',
 'GRZHSNJZYE',
 'GRZHDNGJYE',
 'GRYJCE',
 'DWYJCE',
 'DKFFE',
 'DKYE',
 'DKLL',
 'age']

In [4]:
# Count how many rows share each distinct combination of the categorical
# feature values (a joint frequency table, not per-column counts).
cate_combo_counts = df_train.loc[:, cate_feat].value_counts()
cate_combo_counts

XINGBIE  HYZK  ZHIYE  ZHICHEN  XUELI  DWJJLX  DWSSHY  GRZHZT
1        3     6      3        1      1       18      0         2007
                                      9       7       0         1494
                                              2       0         1464
2        3     6      3        1      1       18      0         1233
1        3     6      3        1      9       14      0         1011
                                                                ... 
                                      11      5       4            1
2        3     6      3        1      1       16      4            1
                                      13      6       0            1
                                              7       0            1
0        3     6      3        1      1       18      0            1
Length: 635, dtype: int64

In [5]:
# Preview the first 10 rows of the training data.
df_train.iloc[:10]

Unnamed: 0,id,XINGBIE,HYZK,ZHIYE,ZHICHEN,ZHIWU,XUELI,DWJJLX,DWSSHY,GRZHZT,GRJCJS,GRZHYE,GRZHSNJZYE,GRZHDNGJYE,GRYJCE,DWYJCE,DKFFE,DKYE,DKLL,age,label
0,train_0,1,3,6,3,0,1,9,12,0,1737.0,3223.515,801.31,837.0,312.0,312.0,175237,154112.935,2.708,18,0.0
1,train_1,2,3,6,3,0,1,1,0,0,4894.0,18055.195,53213.22,1065.2,795.84,795.84,300237,298252.945,2.979,34,0.0
2,train_2,1,3,6,3,0,1,9,9,0,10297.0,27426.6,13963.14,7230.02,1444.2,1444.2,150237,147339.13,2.708,27,0.0
3,train_3,1,3,6,3,0,1,9,7,0,10071.5,111871.13,99701.265,2271.295,1417.14,1417.14,350237,300653.78,2.708,37,0.0
4,train_4,2,3,6,3,0,1,27,14,0,2007.0,237.0,11028.875,35.78,325.5,325.5,150237,145185.01,2.708,33,0.0
5,train_5,1,3,6,3,0,1,9,14,0,1192.0,9648.315,7388.55,771.8,303.85,303.85,150237,149743.21,2.708,35,0.0
6,train_6,1,3,6,3,0,1,11,2,0,7297.0,22846.975,12179.565,5885.0,801.8,801.8,150237,146723.725,2.708,33,0.0
7,train_7,1,3,6,3,0,1,9,7,0,1399.0,1808.135,6405.055,1050.4,318.34,318.34,125237,64938.955,2.708,16,0.0
8,train_8,2,3,6,3,0,1,9,9,0,6408.0,79304.635,52365.82,6161.16,977.52,977.52,300237,270579.955,2.979,31,0.0
9,train_9,2,3,6,3,0,1,9,14,0,3573.5,2009.0,61126.175,536.75,637.38,637.38,230237,142296.64,2.708,36,0.0
