# baseline
先按照一般的步骤完成一个baseline, 然后再去进行迭代.

baseline步骤:
- 赛题理解(已完成)
- 数据分析(在DataAnalysis中完成, 在这里进行总结, 并处理数据)
    - 对某些连续特征的离散化
    - 构建连续与离散特征, 主要是归一化和编码
    - 数据清理: 去除掉离群点和异常值
- 特征工程
    - 特征选择: 去掉对模型有害的特征
    - 构造特征: 主要在之后的迭代中完成
    - 提取统计特征
    - 特征组合
- 模型训练与验证
    - 数据集的划分(训练集, 测试集, 验证集): 应独立划分, 防止特征穿越
    - 模型训练: 调参
    - 线下评测
- 模型融合

## 数据分析
### 总结
- 特征分类
    - 连续特征: GRJCJS(个人缴存基数), GRZHYE(个人账户余额), GRZHSNJZYE(个人账户上年结转余额), GRZHDNGJYE(个人账户当年归集余额), GRYJCE(个人月缴存额), DWYJCE(单位月缴存额), DKFFE(贷款发放额), DKYE(贷款余额)
    - 离散特征: XINGBIE(性别), HYZK(婚姻状况), ZHIYE(职业), ZHICHEN(职称), ZHIWU(职务), XUELI(学历), DWJJLX(单位经济类型), DWSSHY(单位所属行业), GRZHZT(个人账户状态)
    - 其他特征: CSNY(出生年月), DKLL(贷款利率)
- 特征分布:
    - 测试集与训练集分布差异较大: ZHIWU(分布不同), DKLL(训练集为8个类,测试集为数值)
    - 类别特征中在测试集中只有一类: HYZK, XUELI 
    - 类别特征中单类占主导(训练集测试集均如此): ZHIYE, ZHICHEN, GRZHZT

### 数据处理
-  CSNY: 先又时间戳转换为年份, 再把1971和大于2010的放在一类, 其他的进行分桶, 转换成了类别特征
- DKLL:
- 去掉特征: HYZK, XUELI
- 单类占主导的将所有少类合并: ZHIYE, ZHICHEN 
- 大概率为1的, 可以直接使用规则方法: GRZHZT
- 对连续特征编码


In [4]:
import numpy as np
import pandas as pd
pd.set_option("max_columns", 100)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import lightgbm as lgb
import time

In [13]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
submit = pd.read_csv('./data/submit.csv')

In [37]:
train.shape, test.shape

((40000, 21), (15000, 20))

In [40]:
train.tail(10)

Unnamed: 0,id,XINGBIE,CSNY,HYZK,ZHIYE,ZHICHEN,ZHIWU,XUELI,DWJJLX,DWSSHY,GRJCJS,GRZHZT,GRZHYE,GRZHSNJZYE,GRZHDNGJYE,GRYJCE,DWYJCE,DKFFE,DKYE,DKLL,label
39990,train_39990,2,586364400,90,90,999,0,99,900,15,3637.0,1,7477.68,19378.33,7362.525,645.0,645.0,150237,237.0,2.979,0
39991,train_39991,2,468086400,90,90,999,0,99,150,2,2237.0,1,10114.8,6797.55,1037.0,337.0,337.0,300237,284237.0,2.979,0
39992,train_39992,1,715276800,90,90,999,0,99,150,3,1192.0,1,237.0,10776.675,-159.13,284.75,284.75,135237,131149.41,2.708,0
39993,train_39993,1,78768000,90,90,999,0,99,160,2,1887.0,1,14619.845,11789.395,897.0,319.5,319.5,125237,116250.01,2.708,0
39994,train_39994,1,604684800,90,90,999,0,99,150,14,5135.0,1,20844.42,16506.84,-4796.985,824.76,824.76,150237,148496.995,2.708,0
39995,train_39995,1,573148800,90,90,999,0,99,110,16,3185.0,1,82865.575,65304.37,4026.6,590.76,590.76,150237,117816.68,2.708,0
39996,train_39996,2,57772800,90,90,999,0,99,110,14,1660.5,1,32811.585,27065.15,1603.56,407.82,407.82,300237,267615.065,2.979,0
39997,train_39997,2,673023600,90,90,999,0,99,143,9,3923.0,1,8875.695,1309.89,3185.8,605.6,605.6,175237,237.0,2.708,0
39998,train_39998,1,536428800,90,90,999,0,99,150,6,3527.0,1,2079.4,252.875,1614.49,467.3,467.3,125237,113068.9,2.708,1
39999,train_39999,1,31507200,90,90,999,0,99,110,14,4934.5,1,26210.635,27266.08,-4994.61,800.7,800.7,300237,287737.0,2.979,0


## 数据分析与处理

In [55]:
# 先把时间戳转换成年份, 再进行分箱
data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)

data['CSNY'] = data['CSNY'].apply(lambda x: int(time.strftime("%Y", time.localtime(x))))

def divide_bin(x):
    if (x == 1971) | (x >= 2010):
        return 0
    if (x>1971) & (x<=1978):
        return 1
    if (x>1978) & (x<=1982):
        return 2
    if (x>1982) & (x<=1985):
        return 3
    if (x>1985) & (x<=1987):
        return 4
    if (x>1987) & (x<=1991):
        return 5
    if (x>1991) & (x<=1995):
        return 6
    if (x>1995) & (x<2010):
        return 7

data['CSNY'] = data['CSNY'].apply(divide_bin)

In [59]:
# 对ZHIYE和ZHICHEN特征进行少类合并
def combine_zhiye(x):
    if x == 90:
        return 0
    else:
        return 1

def combine_zhichen(x):
    if x == 999:
        return 0
    else:
        return 1

def combine_grzhzt(x):
    if x == 1:
        return 0
    else:
        return 1

data['ZHIYE'] = data['ZHIYE'].apply(combine_zhiye)
data['ZHICHEN'] = data['ZHICHEN'].apply(combine_zhichen)
data['GRZHZT'] = data['GRZHZT'].apply(combine_grzhzt)

In [62]:
data = data.drop(['HYZK', 'XUELI'], axis=1)

In [63]:
data['ZHIYE'].value_counts(), data['ZHICHEN'].value_counts(), data['GRZHZT'].value_counts()

(1    55000
 Name: ZHIYE, dtype: int64,
 1    55000
 Name: ZHICHEN, dtype: int64,
 0    54773
 1      227
 Name: GRZHZT, dtype: int64)

In [64]:
data.head(10)

Unnamed: 0,id,XINGBIE,CSNY,ZHIYE,ZHICHEN,ZHIWU,DWJJLX,DWSSHY,GRJCJS,GRZHZT,GRZHYE,GRZHSNJZYE,GRZHDNGJYE,GRYJCE,DWYJCE,DKFFE,DKYE,DKLL,label
0,train_0,1,7,1,1,0,150,12,1737.0,0,3223.515,801.31,837.0,312.0,312.0,175237,154112.935,2.708,0.0
1,train_1,2,4,1,1,0,110,0,4894.0,0,18055.195,53213.22,1065.2,795.84,795.84,300237,298252.945,2.979,0.0
2,train_2,1,6,1,1,0,150,9,10297.0,0,27426.6,13963.14,7230.02,1444.2,1444.2,150237,147339.13,2.708,0.0
3,train_3,1,3,1,1,0,150,7,10071.5,0,111871.13,99701.265,2271.295,1417.14,1417.14,350237,300653.78,2.708,0.0
4,train_4,2,4,1,1,0,900,14,2007.0,0,237.0,11028.875,35.78,325.5,325.5,150237,145185.01,2.708,0.0
5,train_5,1,3,1,1,0,150,14,1192.0,0,9648.315,7388.55,771.8,303.85,303.85,150237,149743.21,2.708,0.0
6,train_6,1,4,1,1,0,160,2,7297.0,0,22846.975,12179.565,5885.0,801.8,801.8,150237,146723.725,2.708,0.0
7,train_7,1,7,1,1,0,150,7,1399.0,0,1808.135,6405.055,1050.4,318.34,318.34,125237,64938.955,2.708,0.0
8,train_8,2,5,1,1,0,150,9,6408.0,0,79304.635,52365.82,6161.16,977.52,977.52,300237,270579.955,2.979,0.0
9,train_9,2,3,1,1,0,150,14,3573.5,0,2009.0,61126.175,536.75,637.38,637.38,230237,142296.64,2.708,0.0


In [69]:
# 确定离散特征和连续特征, 并调整特征的位置
sparse_feat = ['CSNY', 'XINGBIE', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'DWJJLX', 'DWSSHY', 'GRZHZT']
dense_feat = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE', 'DWYJCE', 'DKFFE', 'DKYE', 'DKLL']
feat_list = sparse_feat + dense_feat
data = data[['id'] + feat_list + ['label']]
data.shape

(55000, 19)

In [70]:
# 对连续特征编码
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data[sparse_feat] = data[sparse_feat].apply(encoder.fit_transform)

## 特征工程

## 模型训练

In [None]:
# 设置一个参数字典, 方便之后记录
args = {}

In [91]:
# 进行数据集划分, 先拆开训练集和测试集, 再从训练集中分离训练集和验证集
train_ = data.iloc[:40000]
test_ = data.iloc[40000:]
test_ = test_.reset_index(drop=True)
trainX = train_[feat_list]
trainY = train_['label'].values
testX = test_[feat_list]

In [94]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from utils import evaluation, submit

#选择StratifiedKFold的原因是标签分布不均衡, 因此需要根据分布来划分
SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=723)

params={
    'learning_rate': 0.05,
    'num_leaves': 31,
    'num_iterations': 10000, 
    'metric': None, 
    'objective': 'binary',
    'random_state': 723, 
    'subsample': 0.8
}

i = 1  # to record the index of k-fold split
for train_index, val_index in SKF.split(trainX, trainY):
    print('---------------%d fold----------------' % i)
    X_train, X_val, Y_train, Y_val = trainX.iloc[train_index].reset_index(drop=True), trainX.iloc[val_index].reset_index(drop=True), trainY[train_index], trainY[val_index]
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], eval_metric='auc', early_stopping_rounds=200, verbose=200, categorical_feature=sparse_feat)
    y_ = model.predict_proba(X_train)
    i += 1
    print('AUC Score: %.5f' % roc_auc_score(Y_train, y_))
    print('TPR Score: %.5f' % evaluation(Y_train, y_))
    

---------------1 fold----------------


NameError: name 'iloc' is not defined