In [1]:
# Constants shared by the cells below
PREFIX = './数据/'
DATA_EXPORT_TO = f'{PREFIX}预处理结果/'
RANDOM_STATE = 233

# Columns treated as continuous by the binning and standardizing builders
CONTINUOUS_COLUMNS = [
    'age',
    'cashTotalAmt',
    'cashTotalCnt',
    'monthCardLargeAmt',
    'onlineTransAmt',
    'onlineTransCnt',
    'publicPayAmt',
    'publicPayCnt',
    'transTotalAmt',
    'transTotalCnt',
    'transCnt_non_null_months',
    'transAmt_mean',
    'transAmt_non_null_months',
    'cashCnt_mean',
    'cashCnt_non_null_months',
    'cashAmt_mean',
    'cashAmt_non_null_months',
    'card_age',
    'trans_total',
    'total_withdraw',
    'avg_per_withdraw',
    'avg_per_online_spend',
    'avg_per_public_spend',
    'noTransWeekPre',
    'transCnt_mean',
]

In [2]:
# import 必要模块

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.preprocessing import PowerTransformer, minmax_scale
from sklearn.preprocessing import scale as standard_scale
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Font family and default font size for figures
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'font.size': 11,
})

# import 自己写的工具函数
import cookdata_util as cutil

In [3]:
# Two independent, initially empty module-level dictionaries
my_globals, export_data = {}, {}

In [4]:
# Load the intermediate dataset produced by part 2, without its 'bad_record' column
data = pd.read_csv(PREFIX + 'data-2.csv').drop(columns=['bad_record'])

In [5]:
def data_distribution(raw_data, columns=None, graph='kde'):
    """Draw one distribution plot per column on a grid that is four panels wide.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame holding the columns to plot.
    columns : iterable of column labels, optional
        Columns to plot; defaults to every column of ``raw_data``.
    graph : {'kde', 'box'}
        'kde' draws a filled kernel density estimate, 'box' draws a box plot.

    Raises
    ------
    ValueError
        If ``graph`` is neither 'kde' nor 'box'; the offending value is the
        exception argument.
    """
    # Validate up front so a bad argument never leaves a half-built figure behind.
    if graph not in ('kde', 'box'):
        raise ValueError(graph)

    columns = list(raw_data.columns if columns is None else columns)
    if not columns:
        # Nothing to draw, and plt.subplots cannot build a grid with zero rows.
        return

    how_many_row = (len(columns) + 3) // 4

    # squeeze=False keeps the axes array two-dimensional whatever the row count is.
    _, axs = plt.subplots(how_many_row, 4, figsize=(25, 5 * how_many_row), squeeze=False)
    axs = axs.flatten()

    for ax, col in zip(axs, columns):
        if graph == 'kde':
            # `fill` is the replacement for the `shade` argument deprecated in seaborn 0.11.
            sns.kdeplot(raw_data[col], ax=ax, fill=True)
        else:
            sns.boxplot(y=raw_data[col], ax=ax)

        ax.set_title(f'{col} 数据')

    # Hide the panels of the last row that received no column.
    for ax in axs[len(columns):]:
        ax.set_visible(False)

In [6]:
# Build a version of the data with outliers removed
def build_lof_data(raw_data):
    """Return the rows of ``raw_data`` that LocalOutlierFactor labels as inliers.

    The detector is fitted on every column except 'Default'; rows for which
    ``fit_predict`` returns 1 are kept, with their original index labels.
    ``raw_data`` itself is not modified.
    """
    snapshot = raw_data.copy()
    features = snapshot.drop(columns=['Default'])
    detector = LocalOutlierFactor(n_neighbors=20, contamination='auto', n_jobs=-1)
    keep_mask = detector.fit_predict(features) == 1

    return snapshot[keep_mask]

In [7]:
# Build binned, one-hot encoded data
def _bin_and_encode(raw_data, binner, columns, n_bins):
    # Shared worker: bin each listed column with `binner` (pd.qcut or pd.cut),
    # then one-hot encode the resulting frame.
    my_data = raw_data.copy()

    for col in (CONTINUOUS_COLUMNS if columns is None else columns):
        # labels=False yields integer bin codes; duplicates='drop' tolerates
        # bin edges that coincide instead of raising.
        my_data[col] = binner(my_data[col], n_bins, duplicates='drop', labels=False).astype('category')

    # get_dummies expands the categorical columns created above
    return pd.get_dummies(my_data)


def build_qcut_data(raw_data, columns=None, n_bins=5):
    """Return a one-hot encoded copy of ``raw_data`` after quantile binning.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of column labels, optional
        Columns to discretize; defaults to ``CONTINUOUS_COLUMNS``.
    n_bins : int, default 5
        Number of quantile bins requested from ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.get_dummies`` on the binned frame.
    """
    return _bin_and_encode(raw_data, pd.qcut, columns, n_bins)


def build_cut_data(raw_data, columns=None, n_bins=5):
    """Return a one-hot encoded copy of ``raw_data`` after equal-width binning.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of column labels, optional
        Columns to discretize; defaults to ``CONTINUOUS_COLUMNS``.
    n_bins : int, default 5
        Number of equal-width bins requested from ``pd.cut``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.get_dummies`` on the binned frame.
    """
    return _bin_and_encode(raw_data, pd.cut, columns, n_bins)

In [8]:
# Build standardized data
def build_std_data(raw_data):
    """Return a copy of ``raw_data`` with each ``CONTINUOUS_COLUMNS`` column
    replaced by the output of ``standard_scale``; other columns are unchanged.
    """
    scaled_columns = {col: standard_scale(raw_data[col]) for col in CONTINUOUS_COLUMNS}

    # assign() works on a copy, so the caller's frame is left as it was
    return raw_data.assign(**scaled_columns)

# Build data squeezed into [0, 1]
def build_minmax_data(raw_data):
    """Return a min-max scaled copy of ``raw_data``.

    Every column is passed through ``minmax_scale`` (the 'Default' label
    column included, exactly as before). The result keeps the index and
    column labels of ``raw_data``; ``raw_data`` itself is not modified.

    The scaled values are placed in a new DataFrame instead of being written
    back with ``.loc[:] = ...``: that in-place write pushes floats into
    integer columns, which newer pandas versions deprecate and which leaves
    the resulting dtypes dependent on the pandas version.
    """
    scaled_values = minmax_scale(raw_data)

    return pd.DataFrame(scaled_values, index=raw_data.index, columns=raw_data.columns)

In [9]:
def build_pca_data(raw_data, n_components):
    """Project the feature columns of ``raw_data`` onto ``n_components`` principal components.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame with a 'Default' label column; every other column is a feature.
    n_components : int
        Number of components to keep; must be smaller than the feature count.

    Returns
    -------
    pandas.DataFrame
        Columns ``0 .. n_components - 1`` hold the components and 'Default'
        holds the labels. The frame has a fresh 0-based index.

    Raises
    ------
    AssertionError
        If ``raw_data`` does not have more feature columns than ``n_components``.
    """
    assert raw_data.shape[1] > n_components + 1, \
        'n_components must be smaller than the number of feature columns'

    # Seeded so that a randomized SVD solver, if PCA selects one, is reproducible.
    pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
    data_X = raw_data.drop(['Default'], axis=1).values
    pca_data = pd.DataFrame(pca.fit_transform(data_X))

    # Attach the labels by position. Assigning the Series itself would align on
    # index labels, and pca_data has a new 0-based index, so any input whose index
    # is not 0..n-1 (for example rows filtered by build_lof_data) would end up
    # with NaN or mismatched labels.
    pca_data['Default'] = raw_data['Default'].values
    return pca_data

In [10]:
def build_test_set(raw_data):
    """Split ``raw_data`` into stratified train and test arrays.

    The 'Default' column is the target and every other column is a feature.
    20% of the rows go to the test set; the split is stratified on the target
    and seeded with ``RANDOM_STATE``.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``sklearn.model_selection.train_test_split``.
    """
    # Imported here because the notebook's import cell never brings this name in;
    # without it every call failed with NameError.
    from sklearn.model_selection import train_test_split

    y = raw_data['Default'].values
    x = raw_data.drop(['Default'], axis=1).values

    return train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

In [11]:
# Reset export_data before rebuilding it
export_data = {}

def anonymous_func():
    """Fill ``export_data`` with every preprocessed variant of ``data``.

    Each key names the recipe that produced its frame. The PCA entries are
    derived from frames stored earlier in the same pass, so the order of the
    recipes below matters.
    """
    recipes = (
        ('raw_data', lambda: data.copy()),
        ('std(data)', lambda: build_std_data(data)),
        ('minmax(data)', lambda: build_minmax_data(data)),
        ('qcut(data)', lambda: build_qcut_data(data)),
        ('cut(data)', lambda: build_cut_data(data)),
        # 52 principal components are kept from the standardized frame
        ('pca(std(data))', lambda: build_pca_data(export_data['std(data)'], 52)),
        ('minmax(pca(std(data)))', lambda: build_minmax_data(export_data['pca(std(data))'])),
    )

    for key, build in recipes:
        export_data[key] = build()

anonymous_func()

In [12]:
# Save the processed datasets
def anonymous_func():
    """Write every frame in ``export_data`` to ``DATA_EXPORT_TO`` as '<name>.csv'.

    CSV files already present in that directory are deleted first, so files
    from datasets that are no longer produced do not linger. Files with other
    extensions are left alone.
    """
    import os

    # On a fresh checkout the export directory may not exist yet, and
    # os.listdir would raise FileNotFoundError.
    os.makedirs(DATA_EXPORT_TO, exist_ok=True)

    for file_name in os.listdir(DATA_EXPORT_TO):
        if file_name.endswith('.csv'):
            os.remove(os.path.join(DATA_EXPORT_TO, file_name))

    # Named `frame` so the loop does not reuse the name of the module-level `data`
    for name, frame in export_data.items():
        frame.to_csv(os.path.join(DATA_EXPORT_TO, f'{name}.csv'), index=False)

anonymous_func()

In [13]:
def anonymous_func():
    """Tabulate the row and column count of every frame in ``export_data``."""
    shapes = [frame.shape for frame in export_data.values()]

    return pd.DataFrame({
        '数据集': list(export_data),
        'row': [shape[0] for shape in shapes],
        'col': [shape[1] for shape in shapes],
    })

anonymous_func()

Unnamed: 0,数据集,row,col
0,raw_data,47332,56
1,std(data),47332,56
2,minmax(data),47332,56
3,qcut(data),47332,137
4,cut(data),47332,153
5,pca(std(data)),47332,53
6,minmax(pca(std(data))),47332,53


In [14]:
# Summary statistics of the equal-width-binned, one-hot encoded dataset
export_data['cut(data)'].describe()

Unnamed: 0,Han,inCourt,isBlackList,isCrime,isDue,Default,maritalStatus_0,maritalStatus_1,maritalStatus_2,education_0,education_1,education_2,education_3,education_4,idVerify_0,idVerify_1,idVerify_2,threeVerify_0,threeVerify_1,threeVerify_2,netLength_0,netLength_1,netLength_2,netLength_3,netLength_4,sex_0,sex_1,sex_2,CityId_1,CityId_2,CityId_3,age_0,age_1,age_2,age_3,age_4,card_age_0,card_age_1,card_age_2,card_age_3,card_age_4,cashAmt_mean_0,cashAmt_mean_1,cashAmt_mean_2,cashAmt_mean_3,cashAmt_mean_4,cashAmt_non_null_months_0,cashAmt_non_null_months_1,cashAmt_non_null_months_2,cashAmt_non_null_months_3,cashAmt_non_null_months_4,cashCnt_mean_0,cashCnt_mean_1,cashCnt_mean_2,cashCnt_mean_3,cashCnt_mean_4,cashCnt_non_null_months_0,cashCnt_non_null_months_1,cashCnt_non_null_months_2,cashCnt_non_null_months_3,cashCnt_non_null_months_4,cashTotalAmt_0,cashTotalAmt_1,cashTotalAmt_2,cashTotalAmt_3,cashTotalAmt_4,cashTotalCnt_0,cashTotalCnt_1,cashTotalCnt_2,cashTotalCnt_3,cashTotalCnt_4,monthCardLargeAmt_0,monthCardLargeAmt_1,monthCardLargeAmt_2,monthCardLargeAmt_3,monthCardLargeAmt_4,noTransWeekPre_0,noTransWeekPre_1,noTransWeekPre_2,noTransWeekPre_4,onlineTransAmt_0,onlineTransAmt_1,onlineTransAmt_2,onlineTransAmt_3,onlineTransAmt_4,onlineTransCnt_0,onlineTransCnt_1,onlineTransCnt_2,onlineTransCnt_3,onlineTransCnt_4,publicPayAmt_0,publicPayAmt_1,publicPayAmt_2,publicPayAmt_3,publicPayAmt_4,publicPayCnt_0,publicPayCnt_1,publicPayCnt_2,publicPayCnt_3,publicPayCnt_4,transAmt_mean_0,transAmt_mean_1,transAmt_mean_2,transAmt_mean_3,transAmt_mean_4,transAmt_non_null_months_0,transAmt_non_null_months_1,transAmt_non_null_months_2,transAmt_non_null_months_3,transAmt_non_null_months_4,transCnt_mean_0,transCnt_mean_1,transCnt_mean_3,transCnt_mean_4,transCnt_non_null_months_0,transCnt_non_null_months_1,transCnt_non_null_months_2,transCnt_non_null_months_3,transCnt_non_null_months_4,transTotalAmt_0,transTotalAmt_1,transTotalAmt_2,transTotalAmt_3,transTotalAmt_4,transTotalCnt_0,transTotalCn
t_1,transTotalCnt_3,transTotalCnt_4,trans_total_0,trans_total_1,trans_total_2,trans_total_3,trans_total_4,total_withdraw_0,total_withdraw_1,total_withdraw_2,total_withdraw_3,total_withdraw_4,avg_per_withdraw_0,avg_per_withdraw_1,avg_per_withdraw_2,avg_per_withdraw_3,avg_per_withdraw_4,avg_per_online_spend_0,avg_per_online_spend_1,avg_per_online_spend_2,avg_per_online_spend_3,avg_per_online_spend_4,avg_per_public_spend_0,avg_per_public_spend_1,avg_per_public_spend_2,avg_per_public_spend_3,avg_per_public_spend_4
count,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0,47332.0
mean,0.038431,0.037691,0.037395,0.016522,0.073523,0.089939,0.158286,0.523282,0.318432,0.063678,0.397638,0.291093,0.178801,0.068791,0.5446,0.168639,0.286762,0.073819,0.729042,0.197139,0.102806,0.072044,0.079354,0.146159,0.599637,0.003444,0.560466,0.43609,0.384264,0.408223,0.207513,0.177385,0.405645,0.260289,0.132152,0.024529,0.384074,0.384729,0.142905,0.060699,0.027592,0.991845,0.006951,0.000845,0.000254,0.000106,0.677808,0.152962,0.113792,0.035304,0.020134,0.990197,0.009106,0.000549,0.000127,2.1e-05,0.676392,0.152011,0.115693,0.035832,0.020071,0.99626,0.002979,0.000465,0.000169,0.000127,0.985486,0.012613,0.001394,0.000423,8.5e-05,0.997951,0.001732,0.000254,4.2e-05,2.1e-05,0.198766,0.291621,0.509592,2.1e-05,0.000317,0.001732,0.996345,0.001585,2.1e-05,0.998944,0.000866,0.000127,2.1e-05,4.2e-05,6.3e-05,0.000169,0.002493,0.996958,0.000317,0.998838,0.000803,0.000254,4.2e-05,6.3e-05,0.99812,0.001606,0.000148,4.2e-05,8.5e-05,0.300938,0.15835,0.195956,0.114722,0.230035,0.999915,2.1e-05,2.1e-05,4.2e-05,0.289128,0.166484,0.210386,0.111616,0.222387,0.998627,0.001225,0.000106,2.1e-05,2.1e-05,0.999768,0.000169,2.1e-05,4.2e-05,0.999683,0.000169,6.3e-05,6.3e-05,2.1e-05,0.999324,0.000359,0.000148,0.000148,2.1e-05,0.996324,0.002303,0.000866,0.000444,6.3e-05,0.000296,0.002916,0.996514,0.000254,2.1e-05,0.000106,0.000592,0.956942,0.04198,0.00038
std,0.192236,0.19045,0.189731,0.127471,0.260996,0.286098,0.365013,0.499463,0.465873,0.244181,0.489415,0.454271,0.383189,0.253101,0.498012,0.374436,0.452254,0.261479,0.444459,0.397843,0.303708,0.258564,0.270294,0.353269,0.489977,0.058583,0.496336,0.495904,0.486426,0.49151,0.40553,0.381998,0.491022,0.438797,0.338659,0.154686,0.486381,0.486536,0.34998,0.23878,0.163803,0.089938,0.083083,0.029059,0.015921,0.010278,0.467321,0.359955,0.317562,0.184549,0.140461,0.098525,0.09499,0.023431,0.011258,0.004596,0.467857,0.359036,0.31986,0.185873,0.140245,0.061038,0.054499,0.021554,0.013,0.011258,0.1196,0.111598,0.037316,0.020552,0.009193,0.045224,0.041587,0.015921,0.0065,0.004596,0.399076,0.454513,0.499913,0.004596,0.017799,0.041587,0.060347,0.039775,0.004596,0.032485,0.029419,0.011258,0.004596,0.0065,0.007961,0.013,0.049868,0.055074,0.017799,0.034069,0.028323,0.015921,0.0065,0.007961,0.043322,0.040039,0.01216,0.0065,0.009193,0.458671,0.365072,0.396939,0.318689,0.420859,0.009193,0.004596,0.004596,0.0065,0.453362,0.372518,0.407587,0.314897,0.415854,0.037033,0.034984,0.010278,0.004596,0.004596,0.015243,0.013,0.004596,0.0065,0.017799,0.013,0.007961,0.007961,0.004596,0.025993,0.018948,0.01216,0.01216,0.004596,0.06052,0.047934,0.029419,0.021059,0.007961,0.017196,0.053918,0.05894,0.015921,0.004596,0.010278,0.024315,0.202989,0.200546,0.019498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
