In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from mydatools.features_analyze import get_top_k_corr

% matplotlib inline

## Config

In [2]:
trn_path = './data/input/train.csv'
tst_path = './data/input/test.csv'
id_col = 'id'
label_col = 'target'

submission_path = './data/output/submission.csv'
output_id_col = id_col
output_label_col = label_col

## Load Data

In [3]:
trn_df = pd.read_csv(trn_path)
trn_df['ds_type'] = 'train'

tst_df = pd.read_csv(tst_path)
tst_df['ds_type'] = 'test'

full_df = pd.concat([trn_df, tst_df])

del(trn_df)
del(tst_df)

full_df.head()

Unnamed: 0,ds_type,id,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,...,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,target
0,train,7,0.6,0.5,0.2,3,1,10,1,10,...,0,0,11,0,1,0,0.7,0.2,0.71807,0.0
1,train,9,0.3,0.1,0.3,2,1,9,5,8,...,0,0,3,0,0,1,0.8,0.4,0.766078,0.0
2,train,13,0.5,0.7,0.1,2,2,9,1,8,...,0,0,12,1,0,0,0.0,0.0,-1.0,0.0
3,train,16,0.6,0.9,0.1,2,4,7,1,8,...,0,0,8,1,0,0,0.9,0.2,0.580948,0.0
4,train,17,0.4,0.6,0.0,2,2,6,3,10,...,0,0,9,1,0,0,0.7,0.6,0.840759,0.0


In [4]:
# dataset type
is_train = full_df['ds_type'] == 'train'
is_test = full_df['ds_type'] == 'test'

## Features

In [5]:
feature_columns = []

def add_features(features):
    if not isinstance(features, list):
        features = [features]
    global feature_columns
    feature_columns.extend([f for f in features if f not in feature_columns])
    
def remove_features(features):
    if not isinstance(features, list):
        features = [features]
    global feature_columns
    feature_columns = [f for f in feature_columns if f not in features]

In [6]:
numerical_features = full_df.dtypes[full_df.dtypes != 'object'].index.tolist()
numerical_features = [c for c in numerical_features if c not in [id_col, label_col, 'ds_type']]
add_features(numerical_features)

* 空值处理

In [7]:
full_df.replace(-1, np.nan, inplace=True)

In [8]:
# import missingno as msno
# msno.matrix(full_df[feature_columns], labels=True)

In [9]:
# 去掉空值过多的特征
remove_features(['ps_car_03_cat', 'ps_car_05_cat'])

* 处理categorical类型特征

In [10]:
categorical_features = [c for c in feature_columns if c.endswith('cat')]

In [11]:
categorical_features_card_num = full_df[categorical_features].apply(lambda x: x.nunique())
categorical_features_card_num

ps_car_01_cat     12
ps_car_02_cat      2
ps_car_04_cat     10
ps_car_06_cat     18
ps_car_07_cat      2
ps_car_08_cat      2
ps_car_09_cat      5
ps_car_10_cat      3
ps_car_11_cat    104
ps_ind_02_cat      4
ps_ind_04_cat      2
ps_ind_05_cat      7
dtype: int64

In [12]:
# 高基数分类特征
# high_card_features = ['ps_car_11_cat']
high_card_features = categorical_features_card_num[categorical_features_card_num >= 10].index.tolist()
low_card_features = [f for f in categorical_features if f not in high_card_features]
print('high_card_features:', high_card_features)
print('low_card_features:', low_card_features)

high_card_features: ['ps_car_01_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_11_cat']
low_card_features: ['ps_car_02_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat']


* 高基数分类特征处理

  target encoding: http://www.saedsayad.com/encoding.htm

In [13]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [14]:
for f in high_card_features:
    trn, tst = target_encode(full_df[is_train][f], 
                             full_df[is_test][f],
                             full_df[is_train][label_col], 
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)
    new_f = f+'_encode'
    full_df[new_f] = trn.append(tst, ignore_index=True)
    add_features(new_f)
    remove_features(f)

* 低基数分类特征

  one-hot处理

In [15]:
dummy_df = pd.get_dummies(full_df[low_card_features], columns=low_card_features)
remove_features(low_card_features)
add_features(dummy_df.columns.tolist())
full_df[dummy_df.columns] = dummy_df

## LightGBM

In [16]:
trn_df = full_df[is_train]
tst_df = full_df[is_test]

X = trn_df[feature_columns]
y = trn_df[label_col]
X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

X_tst = tst_df[feature_columns]

trn_lgb = lgb.Dataset(X_trn.values, y_trn, free_raw_data=False)
val_lgb = lgb.Dataset(X_val.values, y_val, free_raw_data=False, reference=trn_lgb)

* gini coef

In [17]:
def gini(y, pred):
    """
    gini = 2*roc_auc - 1
    """
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

* train

In [18]:
params = {
    'application': 'binary',
    'metric': 'auc',
    'learning_rate': 0.02,
    'is_unbalance': True,
#     'max_depth': 5,
#     'num_leaves': 25,
    'subsample': 0.8,
    'colsample_bytree': 0.5,
#     'reg_alpha': 0.1,
    'seed': 0,
}
bst = lgb.train(params, trn_lgb, 1000, 
                valid_sets=[trn_lgb, val_lgb], 
                early_stopping_rounds=20, 
                feval=gini_lgb,
                feature_name=feature_columns)

[1]	training's auc: 0.608775	training's gini: 0.217486	valid_1's auc: 0.582454	valid_1's gini: 0.164599
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.628274	training's gini: 0.256617	valid_1's auc: 0.601738	valid_1's gini: 0.203395
[3]	training's auc: 0.635112	training's gini: 0.270248	valid_1's auc: 0.610373	valid_1's gini: 0.220717
[4]	training's auc: 0.636021	training's gini: 0.272053	valid_1's auc: 0.612002	valid_1's gini: 0.223996
[5]	training's auc: 0.636691	training's gini: 0.273389	valid_1's auc: 0.612735	valid_1's gini: 0.225462
[6]	training's auc: 0.637231	training's gini: 0.274463	valid_1's auc: 0.612736	valid_1's gini: 0.22547
[7]	training's auc: 0.637215	training's gini: 0.274432	valid_1's auc: 0.612209	valid_1's gini: 0.224417
[8]	training's auc: 0.637635	training's gini: 0.275272	valid_1's auc: 0.613391	valid_1's gini: 0.226784
[9]	training's auc: 0.639169	training's gini: 0.278338	valid_1's auc: 0.614705	valid_1's gini: 0.229411
[10

[79]	training's auc: 0.653668	training's gini: 0.307336	valid_1's auc: 0.622864	valid_1's gini: 0.245728
[80]	training's auc: 0.653901	training's gini: 0.307802	valid_1's auc: 0.623021	valid_1's gini: 0.246042
[81]	training's auc: 0.654014	training's gini: 0.308029	valid_1's auc: 0.623029	valid_1's gini: 0.246057
[82]	training's auc: 0.654174	training's gini: 0.308349	valid_1's auc: 0.6231	valid_1's gini: 0.246201
[83]	training's auc: 0.654317	training's gini: 0.308633	valid_1's auc: 0.623197	valid_1's gini: 0.246395
[84]	training's auc: 0.654388	training's gini: 0.308776	valid_1's auc: 0.623246	valid_1's gini: 0.246492
[85]	training's auc: 0.654485	training's gini: 0.30897	valid_1's auc: 0.623228	valid_1's gini: 0.246456
[86]	training's auc: 0.654743	training's gini: 0.309486	valid_1's auc: 0.623447	valid_1's gini: 0.246894
[87]	training's auc: 0.654967	training's gini: 0.309934	valid_1's auc: 0.623533	valid_1's gini: 0.247067
[88]	training's auc: 0.65505	training's gini: 0.3101	valid

[157]	training's auc: 0.666153	training's gini: 0.332305	valid_1's auc: 0.626533	valid_1's gini: 0.253066
[158]	training's auc: 0.666263	training's gini: 0.332526	valid_1's auc: 0.62654	valid_1's gini: 0.253079
[159]	training's auc: 0.666413	training's gini: 0.332827	valid_1's auc: 0.626606	valid_1's gini: 0.253212
[160]	training's auc: 0.666564	training's gini: 0.333128	valid_1's auc: 0.626538	valid_1's gini: 0.253076
[161]	training's auc: 0.666724	training's gini: 0.333448	valid_1's auc: 0.626535	valid_1's gini: 0.25307
[162]	training's auc: 0.666874	training's gini: 0.333749	valid_1's auc: 0.62652	valid_1's gini: 0.253039
[163]	training's auc: 0.66709	training's gini: 0.33418	valid_1's auc: 0.626561	valid_1's gini: 0.253123
[164]	training's auc: 0.667255	training's gini: 0.334509	valid_1's auc: 0.6266	valid_1's gini: 0.2532
[165]	training's auc: 0.667366	training's gini: 0.334733	valid_1's auc: 0.62659	valid_1's gini: 0.253179
[166]	training's auc: 0.667529	training's gini: 0.335058

[235]	training's auc: 0.677585	training's gini: 0.35517	valid_1's auc: 0.628173	valid_1's gini: 0.256347
[236]	training's auc: 0.677699	training's gini: 0.355399	valid_1's auc: 0.628192	valid_1's gini: 0.256383
[237]	training's auc: 0.677854	training's gini: 0.355708	valid_1's auc: 0.62822	valid_1's gini: 0.25644
[238]	training's auc: 0.677959	training's gini: 0.355917	valid_1's auc: 0.628193	valid_1's gini: 0.256385
[239]	training's auc: 0.678092	training's gini: 0.356183	valid_1's auc: 0.628192	valid_1's gini: 0.256385
[240]	training's auc: 0.678196	training's gini: 0.356392	valid_1's auc: 0.62819	valid_1's gini: 0.25638
[241]	training's auc: 0.678336	training's gini: 0.356672	valid_1's auc: 0.628203	valid_1's gini: 0.256406
[242]	training's auc: 0.678498	training's gini: 0.356995	valid_1's auc: 0.628302	valid_1's gini: 0.256604
[243]	training's auc: 0.678642	training's gini: 0.357284	valid_1's auc: 0.628316	valid_1's gini: 0.256633
[244]	training's auc: 0.678783	training's gini: 0.3

[313]	training's auc: 0.687848	training's gini: 0.375697	valid_1's auc: 0.629455	valid_1's gini: 0.25891
[314]	training's auc: 0.68797	training's gini: 0.375941	valid_1's auc: 0.62947	valid_1's gini: 0.258939
[315]	training's auc: 0.688071	training's gini: 0.376141	valid_1's auc: 0.629417	valid_1's gini: 0.258834
[316]	training's auc: 0.688236	training's gini: 0.376472	valid_1's auc: 0.629415	valid_1's gini: 0.25883
[317]	training's auc: 0.688354	training's gini: 0.376708	valid_1's auc: 0.629423	valid_1's gini: 0.258845
[318]	training's auc: 0.688466	training's gini: 0.376933	valid_1's auc: 0.629424	valid_1's gini: 0.258849
[319]	training's auc: 0.688601	training's gini: 0.377201	valid_1's auc: 0.629413	valid_1's gini: 0.258827
[320]	training's auc: 0.688762	training's gini: 0.377523	valid_1's auc: 0.629403	valid_1's gini: 0.258807
[321]	training's auc: 0.688867	training's gini: 0.377734	valid_1's auc: 0.629407	valid_1's gini: 0.258814
[322]	training's auc: 0.688983	training's gini: 0.

[391]	training's auc: 0.697601	training's gini: 0.395201	valid_1's auc: 0.629802	valid_1's gini: 0.259603
[392]	training's auc: 0.697724	training's gini: 0.395448	valid_1's auc: 0.629809	valid_1's gini: 0.259619
[393]	training's auc: 0.697842	training's gini: 0.395684	valid_1's auc: 0.629811	valid_1's gini: 0.259623
Early stopping, best iteration is:
[373]	training's auc: 0.695438	training's gini: 0.390875	valid_1's auc: 0.629876	valid_1's gini: 0.259752


In [19]:
# bst = lgb.train(params, trn_lgb, 1000, valid_sets=[trn_lgb, val_lgb], early_stopping_rounds=20,
#                 init_model=bst, 
#                 learning_rates=lambda iter: 0.1 * (0.99 ** iter))

In [20]:
# imp_df = pd.DataFrame([bst.feature_importance()], columns=feature_columns, index=['importance']).T.sort_values(by='importance', ascending=False)
# imp_df

## predict

In [21]:
res_df = pd.DataFrame(bst.predict(X_tst), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col]
res_df[[output_id_col, output_label_col]].to_csv(submission_path, index=False)