In [1]:
import pandas as pd
from tqdm import tqdm
import warnings
import gc
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import time
from itertools import combinations
from catboost import CatBoostClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')



In [2]:
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df

In [3]:
seed = 2048

In [4]:
df_train = pd.read_csv('/home/mw/input/pre8881/train.csv')
# df_test = pd.read_csv('/home/mw/input/pretest_a3048/test_a.csv')
df_test_b = pd.read_csv('/home/mw/input/pretest_b6354/test_b.csv')
df_train['flag'], df_test_b['flag'] = 0, 2
df_train.shape, df_test_b.shape

((684283, 66), (80110, 65))

In [5]:
df_feature = pd.concat([df_train,df_test_b]).reset_index(drop=True)
df_feature = reduce_mem(df_feature)

del df_train, df_test_b
gc.collect()
df_feature.head()

-- Mem. usage decreased to 217.24 Mb (43.6% reduction),time spend:0.01 min


Unnamed: 0,client_no,dpt,xz,xb,carid,nprem_ly,ncd_ly,newvalue,bi_renewal_year,clmnum,regdate,trademark_cn,brand_cn,make_cn,series,capab,seats,use_type,change_owner,nprem_od,si_od,nprem_tp,si_tp,nprem_bt,si_bt,nprem_vld,si_vld,nprem_vlp,si_vlp,p1_prior_days_to_insure,suiche_nonauto_nprem_20,suiche_nonauto_nprem_19,suiche_nonauto_nprem_18,suiche_nonauto_nprem_17,suiche_nonauto_nprem_16,suiche_nonauto_amount_20,suiche_nonauto_amount_19,suiche_nonauto_amount_18,suiche_nonauto_amount_17,suiche_nonauto_amount_16,num_notcar_claim,p1_gender,p1_age,p1_census_register,p2_marital_status,f1_child_flag,f2_posses_house_flag,f2_cust_housing_price_total,p2_client_grade,w1_pc_wx_use_flag,p1_is_bank_eff,p2_is_enterprise_owner,p2_is_smeowner,active_7,active_30,active_90,active_365,p2_is_child_under_15_family,p2_is_adult_over_55_family,birth_month,p1_service_offer_cnt,p3_service_use_cnt,dur_personal_insurance_90,service_score_available,y1_is_purchase,flag
0,5gDljzjQ61m/QeU2tZBgDA==,217,商交,主全,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,1391.0,0.600098,88900,6,0,2010-02-09 00:00:00,福特,长安福特马自达,福特CAF7152A轿车,嘉年华,1.498047,5,非营业,非过户投保,379.0,24892.0,239.25,1000000,0.0,0.0,3.75,10000,9.507812,10000,30.0,,,,,,,,,,,,jh4mxXNEalwumcCWUJdnBw==,56.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,车主俱乐部-钻石客户-2,是,否,,,0.0,0.0,3.0,32.0,否,是,6月,0.0,0.0,,0.0,0.0,0
1,qTsiFUfrw8gwVOM+LftPvA==,217,商交,主全,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,3740.0,1.0,303000,6,0,2007-02-02 00:00:00,丰田,天津丰田,丰田TV7250RoyalA轿车,皇冠,2.496094,5,非营业,非过户投保,1993.0,60600.0,505.5,1000000,72.375,60600.0,0.0,0,0.0,0,27.0,390.0,0.0,0.0,170.0,170.0,2795000.0,0.0,0.0,120000.0,120000.0,,jh4mxXNEalwumcCWUJdnBw==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,车主俱乐部-钻石客户-2,是,否,是,是,2.0,4.0,11.0,40.0,否,否,9月,0.0,0.0,,10.0,1.0,0
2,vfTADBw3uqyLukTz5juO0g==,217,商交,主全,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1454.0,0.600098,132800,6,0,2007-01-01 00:00:00,长城,长城汽车,长城CC6460KM60旅行车,哈弗,2.771484,5,非营业,非过户投保,434.75,26560.0,251.375,1000000,0.0,0.0,0.0,0,0.0,0,30.0,350.0,0.0,0.0,0.0,0.0,1695000.0,0.0,0.0,0.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,55.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,154.625,车主俱乐部-钻石客户-2,否,是,否,否,0.0,0.0,9.0,17.0,否,否,2月,0.0,0.0,,16.0,1.0,0
3,zP5cmQ2nwzLbvocQPmf2YA==,217,商交,主全,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,3526.0,0.850098,316800,5,2,2015-02-09 00:00:00,奥迪,一汽大众,奥迪FV7201BACBG轿车,A6,1.984375,5,非营业,非过户投保,1651.0,202752.0,381.0,1000000,141.625,202752.0,17.921875,30000,45.46875,30000,27.0,9.898438,19.90625,0.0,170.0,0.0,1000000.0,1000000.0,0.0,120000.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,47.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,208.0,车主俱乐部-钻石客户-2,是,是,是,是,0.0,0.0,1.0,7.0,否,否,7月,0.0,0.0,151786.0,0.0,1.0,0
4,+ruD5NLealUAfMZPQd6LEw==,217,单交,单交,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,522.5,,247800,3,0,2017-12-12 00:00:00,大众,上汽大众,大众汽车SVW6474CED多用途乘用车,途观,1.797852,5,非营业,非过户投保,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,yUh7960km3oydK6Km9rqRA==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,否,是,0.0,车主俱乐部-钻石客户-2,是,否,是,是,0.0,1.0,11.0,30.0,否,否,11月,0.0,0.0,,1.0,1.0,0


## 数据处理

In [6]:
# 异常值，长尾特征处理
# df_feature['p1_prior_days_to_insure'] = df_feature['p1_prior_days_to_insure'].apply(lambda x: x if x>=0 and x<=90 else 30)
# df_feature['p1_age'] = df_feature['p1_age'].apply(lambda x: 80 if x>=80 else x)
# df_feature['p1_age'] = df_feature['p1_age'].apply(lambda x: 18 if x<=18 else x)
# df_feature['p1_service_offer_cnt'] = df_feature['p1_service_offer_cnt'].apply(lambda x: 0 if x>=100 or x<0 else x)
# df_feature['p1_service_offer_cnt'] = df_feature['p1_service_offer_cnt'].apply(lambda x: 25 if x>=25 else x)
# df_feature['p1_service_offer_cnt'] = df_feature['p1_service_offer_cnt'].apply(lambda x: 20 if x>=20 and x<25 else x)
# df_feature['p3_service_use_cnt'] = df_feature['p3_service_use_cnt'].apply(lambda x: 0 if x>=80 or x<0 else x)
# df_feature['p3_service_use_cnt'] = df_feature['p3_service_use_cnt'].apply(lambda x: 15 if x>=15 else x)
# df_feature['p3_service_use_cnt'] = df_feature['p3_service_use_cnt'].apply(lambda x: 10 if x>=10 and x<15 else x)
# df_feature['p2_client_grade'].value_counts()

In [7]:
df_feature.loc[df_feature['p2_client_grade'].isna(), 'p2_client_grade'] = 0
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-黑钻客户-2', 'p2_client_grade'] = 1
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-钻石客户-2', 'p2_client_grade'] = 2
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-铂金客户-2', 'p2_client_grade'] = 3
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-黄金客户-2', 'p2_client_grade'] = 4
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-白银客户-2', 'p2_client_grade'] = 5
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-黄铜客户-2', 'p2_client_grade'] = 6
df_feature['p2_client_grade'] = df_feature['p2_client_grade'].astype('int')

In [8]:
df_feature['make_cn_cartype'] = df_feature['make_cn'].str.replace('([A-Za-z0-9()-ⅡⅢ]+)', ' ')
df_feature['make_cn_cartype_0'] = df_feature['make_cn_cartype'].apply(lambda x: x.split(' ')[0])
df_feature['make_cn_cartype_1'] = df_feature['make_cn_cartype'].apply(lambda x: x.split(' ')[-1])
print(df_feature['make_cn'].nunique(), df_feature['make_cn_cartype'].nunique(), df_feature['make_cn_cartype_0'].nunique(), df_feature['make_cn_cartype_1'].nunique())
df_feature[['brand_cn', 'make_cn_cartype_0', 'make_cn_cartype_1']][:5]

11615 1035 449 60


Unnamed: 0,brand_cn,make_cn_cartype_0,make_cn_cartype_1
0,长安福特马自达,福特,轿车
1,天津丰田,丰田,轿车
2,长城汽车,长城,旅行车
3,一汽大众,奥迪,轿车
4,上汽大众,大众汽车,多用途乘用车


In [9]:
# 老人或小孩家庭
df_feature['p2_is_child_old_family'] = '否'
df_feature['p2_is_child_old_family']=np.where((df_feature['p2_is_child_under_15_family'] == '是') | (df_feature['p2_is_adult_over_55_family'] == '是'),'是',df_feature['p2_is_child_old_family'])
df_feature['p2_is_child_old_family'].value_counts()

否    656986
是    107407
Name: p2_is_child_old_family, dtype: int64

In [10]:
df_feature['personas'] = df_feature['p1_gender'].astype(str)+'_'+df_feature['p2_marital_status'].astype(str)+'_'+df_feature['f1_child_flag'].astype(str)+'_'+\
                        df_feature['f2_posses_house_flag'].astype(str)+'_'+df_feature['w1_pc_wx_use_flag'].astype(str)+'_'+df_feature['p1_is_bank_eff'].astype(str)
df_feature['xz_xb_co_15_55'] = df_feature['xz'].astype(str)+'_'+df_feature['xb'].astype(str)+'_'+df_feature['change_owner'].astype(str)+'_'+\
                        df_feature['p2_is_child_under_15_family'].astype(str)+'_'+df_feature['p2_is_adult_over_55_family'].astype(str)
df_feature['personas_cno_count'] = df_feature.groupby('personas')['client_no'].transform('count') 
df_feature['xz_xb_co_15_55_cno_count'] = df_feature.groupby('xz_xb_co_15_55')['client_no'].transform('count') 

In [11]:
df_feature['regdays'] = (pd.to_datetime('2021-1-31') - pd.to_datetime(df_feature['regdate'])) / pd.Timedelta(days=1)
df_feature['npremly_car_value_ratio'] = df_feature['nprem_ly'] / df_feature['newvalue']
df_feature['p3_service_use_ratio'] = df_feature['p3_service_use_cnt'] / df_feature['p1_service_offer_cnt']
df_feature['car_housing_value_ratio'] = df_feature['newvalue'] / df_feature['f2_cust_housing_price_total']
# df_feature.head()

In [12]:
df_feature['od_ratio'] = df_feature['nprem_od'] / df_feature['si_od']
df_feature['tp_ratio'] = df_feature['nprem_tp'] / df_feature['si_tp']
df_feature['bt_ratio'] = df_feature['nprem_bt'] / df_feature['si_bt'] # just so so
df_feature['vld_ratio'] = df_feature['nprem_vld'] / df_feature['si_vld']
df_feature['vlp_ratio'] = df_feature['nprem_vlp'] / df_feature['si_vlp']
df_feature['od_tp_ratio_2ord'] = df_feature['od_ratio'] / df_feature['tp_ratio']
df_feature['od_tp_ratio_2diff'] = df_feature['od_ratio'] - df_feature['tp_ratio']

df_feature['nprem_tot'] = df_feature['nprem_od'] + df_feature['nprem_tp'] + df_feature['nprem_bt'] + df_feature['nprem_vld'] + df_feature['nprem_vlp']
df_feature['nprem_od_percent'] = df_feature['nprem_od'] / df_feature['nprem_tot']
df_feature['nprem_tp_percent'] = df_feature['nprem_tp'] / df_feature['nprem_tot']
df_feature['nprem_odtp_percent_diff'] = df_feature['nprem_od_percent'] - df_feature['nprem_tp_percent']
df_feature['nprem_odtp_percent_add'] = (df_feature['nprem_od'] + df_feature['nprem_tp']) / df_feature['nprem_tot']

df_feature['nprem_20diff'] = df_feature['nprem_tot'] - df_feature['suiche_nonauto_nprem_20']
df_feature['nprem_20ratio'] = df_feature['nprem_tot'] / df_feature['suiche_nonauto_nprem_20']
df_feature['nprem_lydiff'] = df_feature['nprem_tot'] - df_feature['nprem_ly']
df_feature['nprem_lyratio'] = df_feature['nprem_tot'] / df_feature['nprem_ly']

df_feature['npremtot_car_value_ratio'] = df_feature['nprem_tot'] / df_feature['newvalue']
df_feature['npremtot_housing_value_ratio'] = df_feature['nprem_tot'] / df_feature['f2_cust_housing_price_total']

In [13]:
df_feature['suiche_nonauto_nprem_20_ydiff'] = df_feature['suiche_nonauto_nprem_20'] - df_feature['suiche_nonauto_nprem_19']
df_feature['suiche_nonauto_nprem_19_2ydiff'] = df_feature['suiche_nonauto_nprem_19'] - df_feature['suiche_nonauto_nprem_17']
df_feature['suiche_nonauto_nprem_17_ydiff'] = df_feature['suiche_nonauto_nprem_17'] - df_feature['suiche_nonauto_nprem_16']
df_feature['suiche_nonauto_nprem_20_yratio'] = df_feature['suiche_nonauto_nprem_20'] / df_feature['suiche_nonauto_nprem_19']
df_feature['suiche_nonauto_nprem_19_2yratio'] = df_feature['suiche_nonauto_nprem_19'] / df_feature['suiche_nonauto_nprem_17']
df_feature['suiche_nonauto_nprem_17_yratio'] = df_feature['suiche_nonauto_nprem_17'] / df_feature['suiche_nonauto_nprem_16']
df_feature['suiche_nonauto_nprem_20_19_yratio_2ord'] = df_feature['suiche_nonauto_nprem_20_yratio'] / df_feature['suiche_nonauto_nprem_19_2yratio']
df_feature['suiche_nonauto_nprem_20_17_yratio_2ord'] = df_feature['suiche_nonauto_nprem_20_yratio'] / df_feature['suiche_nonauto_nprem_17_yratio']

df_feature['suiche_nonauto_nprem_20_ratio'] = df_feature['suiche_nonauto_amount_20'] / df_feature['suiche_nonauto_nprem_20']
df_feature['suiche_nonauto_nprem_19_ratio'] = df_feature['suiche_nonauto_amount_19'] / df_feature['suiche_nonauto_nprem_19']
# df_feature['suiche_nonauto_nprem_18_ratio'] = df_feature['suiche_nonauto_amount_18'] / df_feature['suiche_nonauto_nprem_18'] #该年数据有异常
df_feature['suiche_nonauto_nprem_17_ratio'] = df_feature['suiche_nonauto_amount_17'] / df_feature['suiche_nonauto_nprem_17']
df_feature['suiche_nonauto_nprem_16_ratio'] = df_feature['suiche_nonauto_amount_16'] / df_feature['suiche_nonauto_nprem_16']
df_feature['suiche_nonauto_nprem_20_19_ratio_2diff'] = df_feature['suiche_nonauto_nprem_20_ratio'] - df_feature['suiche_nonauto_nprem_19_ratio']
df_feature['suiche_nonauto_nprem_20_19_ratio_2ord'] = df_feature['suiche_nonauto_nprem_20_ratio'] / df_feature['suiche_nonauto_nprem_19_ratio']
df_feature['suiche_nonauto_nprem_19_17_ratio_2diff'] = df_feature['suiche_nonauto_nprem_19_ratio'] - df_feature['suiche_nonauto_nprem_17_ratio']
df_feature['suiche_nonauto_nprem_19_17_ratio_2ord'] = df_feature['suiche_nonauto_nprem_19_ratio'] / df_feature['suiche_nonauto_nprem_17_ratio']
df_feature['suiche_nonauto_nprem_20_17_ratio_3ord'] = df_feature['suiche_nonauto_nprem_20_19_ratio_2ord'] / df_feature['suiche_nonauto_nprem_19_17_ratio_2ord']
df_feature['suiche_nonauto_nprem_20_17_ratio_3diff'] = df_feature['suiche_nonauto_nprem_20_19_ratio_2diff'] - df_feature['suiche_nonauto_nprem_19_17_ratio_2diff']

df_feature['active_7_30_ratio'] = (df_feature['active_7'] / 7) / (df_feature['active_30'] / 30) # just so so
df_feature['active_30_90_ratio'] = (df_feature['active_30'] / 30) / (df_feature['active_90'] / 90)
df_feature['active_30_90_diff'] = (df_feature['active_30'] / 30) - (df_feature['active_90'] / 90)
df_feature['active_30_365_ratio'] = (df_feature['active_30'] / 30) / (df_feature['active_365'] / 365)
df_feature['active_30_365_diff'] = (df_feature['active_30'] / 30) - (df_feature['active_365'] / 365)
df_feature['active_90_365_ratio'] = (df_feature['active_90'] / 90) / (df_feature['active_365'] / 365)
df_feature['active_90_365_diff'] = (df_feature['active_90'] / 90) - (df_feature['active_365'] / 365)

In [17]:
# 计数 count编码
count_fea_list = ['p1_census_register', 'trademark_cn', 'brand_cn', 'series', 'capab', 'make_cn', 'make_cn_cartype', 'make_cn_cartype_0', 'make_cn_cartype_1']
count_fea_list += ['tp_ratio', 'p1_age', 'service_score_available', 'nprem_tp', 'suiche_nonauto_amount_20', 'suiche_nonauto_amount_19', 'nprem_ly', 'p1_prior_days_to_insure']
count_fea_list += ['personas', 'xz_xb_co_15_55']

for f in count_fea_list:
    # df_temp = df_feature.groupby(f).size().reset_index()
    # df_temp.columns = f + ['{}_count'.format('_'.join(f))]
    # df_feature = df_feature.merge(df_temp, how='left')
    # # 等价写法
    df_feature[f + '_count'] = df_feature[f].map(df_feature[f].value_counts())

In [18]:
df_feature['birth_month'] = df_feature['birth_month'].apply(lambda x: int(x[:-1]) if type(x) != float else 0)
df_feature['reg_year'] = df_feature['regdate'].apply(lambda x: int(x[:4]) if type(x) != float else 0)
df_feature['reg_month'] = df_feature['regdate'].apply(lambda x: int(x[5:7]) if type(x) != float else 0)
df_feature['reg_day'] = df_feature['regdate'].apply(lambda x: int(x[8:9]) if type(x) != float else 0)

In [19]:
df_feature[~df_feature['y1_is_purchase'].isnull()].head()

Unnamed: 0,client_no,dpt,xz,xb,carid,nprem_ly,ncd_ly,newvalue,bi_renewal_year,clmnum,regdate,trademark_cn,brand_cn,make_cn,series,capab,seats,use_type,change_owner,nprem_od,si_od,nprem_tp,si_tp,nprem_bt,si_bt,nprem_vld,si_vld,nprem_vlp,si_vlp,p1_prior_days_to_insure,suiche_nonauto_nprem_20,suiche_nonauto_nprem_19,suiche_nonauto_nprem_18,suiche_nonauto_nprem_17,suiche_nonauto_nprem_16,suiche_nonauto_amount_20,suiche_nonauto_amount_19,suiche_nonauto_amount_18,suiche_nonauto_amount_17,suiche_nonauto_amount_16,num_notcar_claim,p1_gender,p1_age,p1_census_register,p2_marital_status,f1_child_flag,f2_posses_house_flag,f2_cust_housing_price_total,p2_client_grade,w1_pc_wx_use_flag,p1_is_bank_eff,p2_is_enterprise_owner,p2_is_smeowner,active_7,active_30,active_90,active_365,p2_is_child_under_15_family,p2_is_adult_over_55_family,birth_month,p1_service_offer_cnt,p3_service_use_cnt,dur_personal_insurance_90,service_score_available,y1_is_purchase,flag,make_cn_cartype,make_cn_cartype_0,make_cn_cartype_1,p2_is_child_old_family,personas,xz_xb_co_15_55,personas_cno_count,xz_xb_co_15_55_cno_count,regdays,npremly_car_value_ratio,p3_service_use_ratio,car_housing_value_ratio,od_ratio,tp_ratio,bt_ratio,vld_ratio,vlp_ratio,od_tp_ratio_2ord,od_tp_ratio_2diff,nprem_tot,nprem_od_percent,nprem_tp_percent,nprem_odtp_percent_diff,nprem_odtp_percent_add,nprem_20diff,nprem_20ratio,nprem_lydiff,nprem_lyratio,npremtot_car_value_ratio,npremtot_housing_value_ratio,suiche_nonauto_nprem_20_ydiff,suiche_nonauto_nprem_19_2ydiff,suiche_nonauto_nprem_17_ydiff,suiche_nonauto_nprem_20_yratio,suiche_nonauto_nprem_19_2yratio,suiche_nonauto_nprem_17_yratio,suiche_nonauto_nprem_20_19_yratio_2ord,suiche_nonauto_nprem_20_17_yratio_2ord,suiche_nonauto_nprem_20_ratio,suiche_nonauto_nprem_19_ratio,suiche_nonauto_nprem_17_ratio,suiche_nonauto_nprem_16_ratio,suiche_nonauto_nprem_20_19_ratio_2diff,suiche_nonauto_nprem_20_19_ratio_2ord,suiche_nonauto_nprem_19_17_ratio_2diff,suiche_nonauto_nprem_19_17_ratio_2ord,suiche_nonauto_nprem_20_17_ratio_3ord,suiche_nonauto_nprem_20_17_ratio_3diff,active_7_30_ratio,active_30_90_ratio,active_30_90_diff,active_30_365_ratio,active_30_365_diff,active_90_365_ratio,active_90_365_diff,p1_census_register_count,trademark_cn_count,brand_cn_count,series_count,capab_count,make_cn_count,make_cn_cartype_count,make_cn_cartype_0_count,make_cn_cartype_1_count,tp_ratio_count,p1_age_count,service_score_available_count,nprem_tp_count,suiche_nonauto_amount_20_count,suiche_nonauto_amount_19_count,nprem_ly_count,p1_prior_days_to_insure_count,personas_count,xz_xb_co_15_55_count,reg_year,reg_month,reg_day
0,5gDljzjQ61m/QeU2tZBgDA==,217,商交,主全,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,1391.0,0.600098,88900,6,0,2010-02-09 00:00:00,福特,长安福特马自达,福特CAF7152A轿车,嘉年华,1.498047,5,非营业,非过户投保,379.0,24892.0,239.25,1000000,0.0,0.0,3.75,10000,9.507812,10000,30.0,,,,,,,,,,,,jh4mxXNEalwumcCWUJdnBw==,56.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,,,0.0,0.0,3.0,32.0,否,是,6,0.0,0.0,,0.0,0.0,0,福特 轿车,福特,轿车,是,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_是,12800,50781,4009.0,0.015647,,inf,0.015226,0.000239,,0.000375,0.000951,63.639605,0.014987,631.5,0.600098,0.378906,0.221191,0.978516,,,-759.5,0.454102,0.007103,inf,,,,,,,,,,,,,,,,,,,,0.0,-0.033325,0.0,-0.087671,0.380116,-0.054346,6193.0,29781,15325,2014.0,45294,60,7207,8583,518484,361.0,9339.0,345003.0,363,,,123,130634.0,12800,50781,2010,2,0
1,qTsiFUfrw8gwVOM+LftPvA==,217,商交,主全,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,3740.0,1.0,303000,6,0,2007-02-02 00:00:00,丰田,天津丰田,丰田TV7250RoyalA轿车,皇冠,2.496094,5,非营业,非过户投保,1993.0,60600.0,505.5,1000000,72.375,60600.0,0.0,0,0.0,0,27.0,390.0,0.0,0.0,170.0,170.0,2795000.0,0.0,0.0,120000.0,120000.0,,jh4mxXNEalwumcCWUJdnBw==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,是,是,2.0,4.0,11.0,40.0,否,否,9,0.0,0.0,,10.0,1.0,0,丰田 轿车,丰田,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,12800,418644,5112.0,0.012343,,inf,0.032888,0.000505,0.001194,,,65.059922,0.032382,2570.0,0.775391,0.196655,0.578613,0.972168,2180.0,6.589844,-1170.0,0.687012,0.008482,inf,390.0,-170.0,0.0,inf,0.0,1.0,inf,inf,7166.666504,,705.882324,705.882324,,,,,,,2.142578,1.09082,0.011108,1.21637,0.023712,1.115005,0.012603,6193.0,40075,21400,1203.0,5192,165,29924,38719,518484,526.0,15440.0,16106.0,20,21152.0,293955.0,258,17549.0,12800,418644,2007,2,0
2,vfTADBw3uqyLukTz5juO0g==,217,商交,主全,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1454.0,0.600098,132800,6,0,2007-01-01 00:00:00,长城,长城汽车,长城CC6460KM60旅行车,哈弗,2.771484,5,非营业,非过户投保,434.75,26560.0,251.375,1000000,0.0,0.0,0.0,0,0.0,0,30.0,350.0,0.0,0.0,0.0,0.0,1695000.0,0.0,0.0,0.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,55.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,154.625,2,否,是,否,否,0.0,0.0,9.0,17.0,否,否,2,0.0,0.0,,16.0,1.0,0,长城 旅行车,长城,旅行车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,4143,418644,5144.0,0.010949,,858.852061,0.016369,0.000251,,,,65.116259,0.016117,686.0,0.633789,0.366455,0.267334,1.0,336.0,1.959961,-768.0,0.471924,0.005166,4.4375,350.0,0.0,0.0,inf,,,,,4842.856934,,,,,,,,,,,0.0,-0.099976,0.0,-0.046575,2.146535,0.0534,6193.0,25550,25842,20960.0,981,3,3337,11440,6702,932.0,10131.0,2120.0,936,18735.0,293955.0,64,130634.0,4143,418644,2007,1,0
3,zP5cmQ2nwzLbvocQPmf2YA==,217,商交,主全,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,3526.0,0.850098,316800,5,2,2015-02-09 00:00:00,奥迪,一汽大众,奥迪FV7201BACBG轿车,A6,1.984375,5,非营业,非过户投保,1651.0,202752.0,381.0,1000000,141.625,202752.0,17.921875,30000,45.46875,30000,27.0,9.898438,19.90625,0.0,170.0,0.0,1000000.0,1000000.0,0.0,120000.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,47.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,208.0,2,是,是,是,是,0.0,0.0,1.0,7.0,否,否,7,0.0,0.0,151786.0,0.0,1.0,0,奥迪 轿车,奥迪,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,6513,418644,2183.0,0.01113,,1523.076923,0.008143,0.000381,0.000699,0.000597,0.001516,21.37258,0.007762,2238.0,0.737793,0.170288,0.567383,0.907715,2228.0,226.125,-1288.0,0.634766,0.007064,10.757812,-10.007812,-150.125,170.0,0.497314,0.117065,inf,4.25,0.0,101026.046875,50235.480469,705.882324,,50790.566406,2.01105,49529.597656,71.166931,0.028258,1260.96875,,0.0,-0.011108,0.0,-0.019178,0.579224,-0.00807,6193.0,12178,53772,3437.0,15475,264,8255,12178,518484,27.0,21338.0,345003.0,20,85566.0,52519.0,188,17549.0,6513,418644,2015,2,0
4,+ruD5NLealUAfMZPQd6LEw==,217,单交,单交,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,522.5,,247800,3,0,2017-12-12 00:00:00,大众,上汽大众,大众汽车SVW6474CED多用途乘用车,途观,1.797852,5,非营业,非过户投保,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,yUh7960km3oydK6Km9rqRA==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,否,是,0.0,2,是,否,是,是,0.0,1.0,11.0,30.0,否,否,11,0.0,0.0,,1.0,1.0,0,大众汽车 多用途乘用车,大众汽车,多用途乘用车,否,yUh7960km3oydK6Km9rqRA==_eNP+WqbTmmD3bj49nIcSe...,单交_单交_非过户投保_否_否,8621,76268,1146.0,0.002109,,inf,,,,,,,,0.0,,,,,0.0,,-522.5,0.0,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,0.0,0.272705,-0.088867,0.405457,-0.048867,1.486674,0.040001,6193.0,96796,10607,7750.0,38063,523,9571,38524,158001,,15440.0,8673.0,96729,216811.0,293955.0,137,55995.0,8621,76268,2017,12,1


In [20]:
# groupby feature
def group_fea(df,key,target):
    tmp = df.groupby(key, as_index=False)[target].agg({
        key+target + '_nunique': 'nunique',
    }).reset_index()
    del tmp['index']
    print("**************************{}**************************".format(target))
    return tmp

feature_key = ['dpt']
feature_target = ['trademark_cn', 'suiche_nonauto_amount_20','service_score_available']
for key in tqdm(feature_key):
    for target in feature_target:
        tmp = group_fea(df_feature,key,target)
        df_feature = df_feature.merge(tmp,on=key,how='left')
        
# tmp = group_fea(df,'spread_app_id','task_id')
# df = df.merge(tmp,on='spread_app_id',how='left')
del tmp
gc.collect()

  0%|          | 0/1 [00:00<?, ?it/s]

**************************trademark_cn**************************
**************************suiche_nonauto_amount_20**************************
**************************service_score_available**************************


100%|██████████| 1/1 [00:07<00:00,  7.96s/it]


107

In [21]:
# dpt 统计数值类特征的均值
num_fea_list = ['service_score_available', 'dur_personal_insurance_90']
num_fea_list += ['active_365','ncd_ly', 'bi_renewal_year', 'clmnum', 'seats']
num_fea_list += ['regdays', 'od_ratio', 'p2_client_grade', 'od_tp_ratio_2ord', 'nprem_lyratio']
num_fea_list += ['bi_renewal_year', 'nprem_lyratio', 'si_tp', 'suiche_nonauto_nprem_20_ydiff']
for col in num_fea_list:
    df_feature[f'dpt{col}_mean'] = df_feature.groupby('dpt')[col].transform('mean')

del num_fea_list
# personas,  统计数值类特征的均值
num_fea_list = ['active_365', 'service_score_available', 'dur_personal_insurance_90', 'f2_cust_housing_price_total', 'suiche_nonauto_nprem_20_ydiff']
num_fea_list += ['regdays', 'tp_ratio', 'od_ratio', 'p2_client_grade', 'active_90_365_ratio', 'nprem_ly', 'ncd_ly', 'si_tp', 'suiche_nonauto_nprem_20_ydiff']
num_fea_list += ['od_tp_ratio_2ord', 'nprem_lyratio', 'newvalue', 'bi_renewal_year', 'clmnum', 'capab', 'seats', 'p2_client_grade']
for col in num_fea_list:
    df_feature['personas_{}_mean'.format(col)] = df_feature.groupby('personas')[col].transform('mean')

In [22]:
# feature = pd.DataFrame()
# to_group = [['suiche_nonauto_nprem_20'], ['suiche_nonauto_amount_20']]
# to_inter = ['p1_prior_days_to_insure']
# to_calc = [
#     'mean',
#     'std',
#     'nunique',
# ]

# for i in tqdm(to_group):
#     for j in to_inter:
#         for k in to_calc:
#             feature["STAT_{}_{}_{}".format("_".join(i),j,k)] = df_feature[i + [j]].groupby(i)[j].transform(k)
            
# print(feature.shape)
# feature.reset_index(drop=True, inplace=True)
# df_feature[feature.columns] = feature
# del feature
# gc.collect()
# df_feature.shape

In [23]:
for agg in ['mean']:
    df_feature[f'dpt_ncd_ly_p1_prior_days_to_insure_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['p1_prior_days_to_insure'].transform(agg)
    df_feature[f'dpt_ncd_ly_active_90_365_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['active_90_365_ratio'].transform(agg)
    df_feature[f'dpt_ncd_ly_tp_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['tp_ratio'].transform(agg)

    df_feature[f'dpt_p1_census_register_active_90_365_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['active_90_365_ratio'].transform(agg)
    df_feature[f'dpt_p1_census_register_p1_prior_days_to_insure_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['p1_prior_days_to_insure'].transform(agg)
    df_feature[f'dpt_p1_census_register_tp_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['tp_ratio'].transform(agg)
    df_feature[f'dpt_p1_census_register_regdays_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['regdays'].transform(agg)
    df_feature[f'dpt_p1_census_register_ncd_ly_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['ncd_ly'].transform(agg)

    df_feature[f'dpt_p2_client_grade_active_90_365_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['active_90_365_ratio'].transform(agg)
    df_feature[f'dpt_p2_client_grade_p1_prior_days_to_insure_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['p1_prior_days_to_insure'].transform(agg)
    df_feature[f'dpt_p2_client_grade_tp_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['tp_ratio'].transform(agg)
    df_feature[f'dpt_p2_client_grade_regdays_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['regdays'].transform(agg)
    df_feature[f'dpt_p2_client_grade_ncd_ly_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['ncd_ly'].transform(agg)

In [25]:
# 5折交叉 目标编码
df_train = df_feature[df_feature['flag']==0]
df_train = df_train.reset_index(drop=True)
# df_test_a = df_feature[df_feature['flag']==1]
df_test = df_feature[df_feature['flag']==2]
print(df_train.shape, df_test.shape)

def n_fold_target_encoding(train_df,test_df,label='label',n=5,enc_list=[],functions=['mean']):
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=seed)
    for f in tqdm(enc_list):
        for func in functions:
            train_df[f + f'_target_enc_{func}'] = 0
            test_df[f + f'_target_enc_{func}'] = 0
            for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df[label])):
                trn_x = train_df[[f, label]].iloc[trn_idx].reset_index(drop=True)
                val_x = train_df[[f]].iloc[val_idx].reset_index(drop=True)
                enc_df = trn_x.groupby(f, as_index=False)[label].agg({f + f'_target_enc_{func}': func})
                val_x = val_x.merge(enc_df, on=f, how='left')
                test_x = test_df[[f]].merge(enc_df, on=f, how='left')
                val_x[f + f'_target_enc_{func}'] = val_x[f + f'_target_enc_{func}'].fillna(train_df[label].agg(func))
                test_x[f + f'_target_enc_{func}'] = test_x[f + f'_target_enc_{func}'].fillna(train_df[label].agg(func))
                train_df.loc[val_idx, f + f'_target_enc_{func}'] = val_x[f + f'_target_enc_{func}'].values
                test_df[f + f'_target_enc_{func}'] += test_x[f + f'_target_enc_{func}'].values / skf.n_splits
                del(trn_x)
                del(val_x)
                del(enc_df)
                gc.collect()
    return train_df,test_df
target_enc_list = ['dpt', 'make_cn_cartype', 'trademark_cn', 'brand_cn', 'series', 'capab', 'ncd_ly', 'nprem_ly', 'make_cn_count']
target_enc_list += ['tp_ratio', 'p1_census_register', 'p2_client_grade', 'active_90_365_ratio', 'active_30_365_ratio', 'service_score_available']
target_enc_list += ['suiche_nonauto_nprem_19_ratio', 'suiche_nonauto_nprem_20_ratio', 'active_90_365_diff']
target_enc_list += ['nprem_lyratio', 'nprem_20ratio', 'regdays', 'p1_prior_days_to_insure']
target_enc_list += ['make_cn_count','make_cn_cartype_count','make_cn_cartype_0_count', 'personas_count']

# target_enc_list += ['suiche_nonauto_nprem_20_19_ratio_2diff', 'suiche_nonauto_nprem_20_ydiff', 'suiche_nonauto_nprem_20_yratio', 'suiche_nonauto_nprem_20_19_ratio_2ord']
target_enc_list += ['newvalue', 'bi_renewal_year', 'clmnum', 'si_tp', 'active_30_365_diff', 'od_tp_ratio_2ord']
target_enc_list += ['active_365', 'f2_cust_housing_price_total']
target_enc_list += ['suiche_nonauto_amount_20_count', 'suiche_nonauto_amount_19_count', 'npremtot_car_value_ratio']
# target_enc_list += ['suiche_nonauto_nprem_19_2yratio', 'suiche_nonauto_nprem_17_ratio']


df_train, df_test = n_fold_target_encoding(df_train,df_test,label='y1_is_purchase',n=10,enc_list=target_enc_list,functions=['mean'])
df_feature = pd.concat([df_train, df_test], axis=0)

  0%|          | 0/37 [00:00<?, ?it/s]

(684283, 193) (80110, 193)


100%|██████████| 37/37 [04:09<00:00,  6.75s/it]


In [26]:
print(df_feature.shape)
df_feature = reduce_mem(df_feature)
df_feature.head()

(764393, 229)
-- Mem. usage decreased to 592.66 Mb (36.4% reduction),time spend:0.05 min


Unnamed: 0,client_no,dpt,xz,xb,carid,nprem_ly,ncd_ly,newvalue,bi_renewal_year,clmnum,regdate,trademark_cn,brand_cn,make_cn,series,capab,seats,use_type,change_owner,nprem_od,si_od,nprem_tp,si_tp,nprem_bt,si_bt,nprem_vld,si_vld,nprem_vlp,si_vlp,p1_prior_days_to_insure,suiche_nonauto_nprem_20,suiche_nonauto_nprem_19,suiche_nonauto_nprem_18,suiche_nonauto_nprem_17,suiche_nonauto_nprem_16,suiche_nonauto_amount_20,suiche_nonauto_amount_19,suiche_nonauto_amount_18,suiche_nonauto_amount_17,suiche_nonauto_amount_16,num_notcar_claim,p1_gender,p1_age,p1_census_register,p2_marital_status,f1_child_flag,f2_posses_house_flag,f2_cust_housing_price_total,p2_client_grade,w1_pc_wx_use_flag,p1_is_bank_eff,p2_is_enterprise_owner,p2_is_smeowner,active_7,active_30,active_90,active_365,p2_is_child_under_15_family,p2_is_adult_over_55_family,birth_month,p1_service_offer_cnt,p3_service_use_cnt,dur_personal_insurance_90,service_score_available,y1_is_purchase,flag,make_cn_cartype,make_cn_cartype_0,make_cn_cartype_1,p2_is_child_old_family,personas,xz_xb_co_15_55,personas_cno_count,xz_xb_co_15_55_cno_count,regdays,npremly_car_value_ratio,p3_service_use_ratio,car_housing_value_ratio,od_ratio,tp_ratio,bt_ratio,vld_ratio,vlp_ratio,od_tp_ratio_2ord,od_tp_ratio_2diff,nprem_tot,nprem_od_percent,nprem_tp_percent,nprem_odtp_percent_diff,nprem_odtp_percent_add,nprem_20diff,nprem_20ratio,nprem_lydiff,nprem_lyratio,npremtot_car_value_ratio,npremtot_housing_value_ratio,suiche_nonauto_nprem_20_ydiff,suiche_nonauto_nprem_19_2ydiff,suiche_nonauto_nprem_17_ydiff,suiche_nonauto_nprem_20_yratio,suiche_nonauto_nprem_19_2yratio,suiche_nonauto_nprem_17_yratio,suiche_nonauto_nprem_20_19_yratio_2ord,suiche_nonauto_nprem_20_17_yratio_2ord,suiche_nonauto_nprem_20_ratio,suiche_nonauto_nprem_19_ratio,suiche_nonauto_nprem_17_ratio,suiche_nonauto_nprem_16_ratio,suiche_nonauto_nprem_20_19_ratio_2diff,suiche_nonauto_nprem_20_19_ratio_2ord,suiche_nonauto_nprem_19_17_ratio_2diff,suiche_nonauto_nprem_19_17_ratio_2ord,suiche_nonauto_nprem_20_17_ratio_3ord,suiche_nonauto_nprem_20_17_ratio_3diff,active_7_30_ratio,active_30_90_ratio,active_30_90_diff,active_30_365_ratio,active_30_365_diff,active_90_365_ratio,active_90_365_diff,p1_census_register_count,trademark_cn_count,brand_cn_count,series_count,capab_count,make_cn_count,make_cn_cartype_count,make_cn_cartype_0_count,make_cn_cartype_1_count,tp_ratio_count,p1_age_count,service_score_available_count,nprem_tp_count,suiche_nonauto_amount_20_count,suiche_nonauto_amount_19_count,nprem_ly_count,p1_prior_days_to_insure_count,personas_count,xz_xb_co_15_55_count,reg_year,reg_month,reg_day,dpttrademark_cn_nunique,dptsuiche_nonauto_amount_20_nunique,dptservice_score_available_nunique,dptservice_score_available_mean,dptdur_personal_insurance_90_mean,dptactive_365_mean,dptncd_ly_mean,dptbi_renewal_year_mean,dptclmnum_mean,dptseats_mean,dptregdays_mean,dptod_ratio_mean,dptp2_client_grade_mean,dptod_tp_ratio_2ord_mean,dptnprem_lyratio_mean,dptsi_tp_mean,dptsuiche_nonauto_nprem_20_ydiff_mean,personas_active_365_mean,personas_service_score_available_mean,personas_dur_personal_insurance_90_mean,personas_f2_cust_housing_price_total_mean,personas_suiche_nonauto_nprem_20_ydiff_mean,personas_regdays_mean,personas_tp_ratio_mean,personas_od_ratio_mean,personas_p2_client_grade_mean,personas_active_90_365_ratio_mean,personas_nprem_ly_mean,personas_ncd_ly_mean,personas_si_tp_mean,personas_od_tp_ratio_2ord_mean,personas_nprem_lyratio_mean,personas_newvalue_mean,personas_bi_renewal_year_mean,personas_clmnum_mean,personas_capab_mean,personas_seats_mean,dpt_ncd_ly_p1_prior_days_to_insure_2ordmean,dpt_ncd_ly_active_90_365_ratio_2ordmean,dpt_ncd_ly_tp_ratio_2ordmean,dpt_p1_census_register_active_90_365_ratio_2ordmean,dpt_p1_census_register_p1_prior_days_to_insure_2ordmean,dpt_p1_census_register_tp_ratio_2ordmean,dpt_p1_census_register_regdays_2ordmean,dpt_p1_census_register_ncd_ly_2ordmean,dpt_p2_client_grade_active_90_365_ratio_2ordmean,dpt_p2_client_grade_p1_prior_days_to_insure_2ordmean,dpt_p2_client_grade_tp_ratio_2ordmean,dpt_p2_client_grade_regdays_2ordmean,dpt_p2_client_grade_ncd_ly_2ordmean,dpt_target_enc_mean,make_cn_cartype_target_enc_mean,trademark_cn_target_enc_mean,brand_cn_target_enc_mean,series_target_enc_mean,capab_target_enc_mean,ncd_ly_target_enc_mean,nprem_ly_target_enc_mean,make_cn_count_target_enc_mean,tp_ratio_target_enc_mean,p1_census_register_target_enc_mean,p2_client_grade_target_enc_mean,active_90_365_ratio_target_enc_mean,active_30_365_ratio_target_enc_mean,service_score_available_target_enc_mean,suiche_nonauto_nprem_19_ratio_target_enc_mean,suiche_nonauto_nprem_20_ratio_target_enc_mean,active_90_365_diff_target_enc_mean,nprem_lyratio_target_enc_mean,nprem_20ratio_target_enc_mean,regdays_target_enc_mean,p1_prior_days_to_insure_target_enc_mean,make_cn_cartype_count_target_enc_mean,make_cn_cartype_0_count_target_enc_mean,personas_count_target_enc_mean,newvalue_target_enc_mean,bi_renewal_year_target_enc_mean,clmnum_target_enc_mean,si_tp_target_enc_mean,active_30_365_diff_target_enc_mean,od_tp_ratio_2ord_target_enc_mean,active_365_target_enc_mean,f2_cust_housing_price_total_target_enc_mean,suiche_nonauto_amount_20_count_target_enc_mean,suiche_nonauto_amount_19_count_target_enc_mean,npremtot_car_value_ratio_target_enc_mean
0,5gDljzjQ61m/QeU2tZBgDA==,217,商交,主全,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,1391.0,0.600098,88900,6,0,2010-02-09 00:00:00,福特,长安福特马自达,福特CAF7152A轿车,嘉年华,1.498047,5,非营业,非过户投保,379.0,24892.0,239.25,1000000,0.0,0.0,3.75,10000,9.507812,10000,30.0,,,,,,,,,,,,jh4mxXNEalwumcCWUJdnBw==,56.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,,,0.0,0.0,3.0,32.0,否,是,6,0.0,0.0,,0.0,0.0,0,福特 轿车,福特,轿车,是,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_是,12800,50781,4008.0,0.015647,,inf,0.015228,0.000239,,0.000375,0.000951,63.625,0.014984,631.5,0.600098,0.378906,0.221191,0.978516,,,-759.5,0.454102,0.007103,inf,,,,,,,,,,,,,,,,,,,,0.0,-0.033325,0.0,-0.087646,0.380127,-0.054352,6192.0,29781,15325,2014.0,45294,60,7207,8583,518484,361.0,9336.0,345003.0,363,,,123,130634.0,12800,50781,2010,2,0,113,243,3120,764.5,69417.9375,33.78125,0.744629,3.25,0.143433,5.289062,2426.0,0.009033,2.943359,26.296875,0.375,808389.375,38.625,32.6875,1053.0,122923.820312,90.5625,53.375,2834.0,0.000983,0.01709,2.53125,1.774414,2610.0,0.701172,907765.625,20.65625,0.536133,139255.359375,3.775391,0.130859,1.727539,5.316406,21.921875,1.517578,0.000282,1.567383,20.109375,0.000349,2610.0,0.745117,1.519531,21.109375,0.00034,2720.0,0.706055,0.468506,0.650879,0.644043,0.611328,0.594238,0.641113,0.704102,0.525879,0.611328,0.653809,0.447266,0.670898,0.592285,0.590332,0.61377,,,0.590332,0.447754,,0.587891,0.742188,0.650879,0.65625,0.661133,0.640625,0.619141,0.624512,0.703125,0.578125,,0.649902,0.631836,,,
1,qTsiFUfrw8gwVOM+LftPvA==,217,商交,主全,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,3740.0,1.0,303000,6,0,2007-02-02 00:00:00,丰田,天津丰田,丰田TV7250RoyalA轿车,皇冠,2.496094,5,非营业,非过户投保,1993.0,60600.0,505.5,1000000,72.375,60600.0,0.0,0,0.0,0,27.0,390.0,0.0,0.0,170.0,170.0,2795000.0,0.0,0.0,120000.0,120000.0,,jh4mxXNEalwumcCWUJdnBw==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,是,是,2.0,4.0,11.0,40.0,否,否,9,0.0,0.0,,10.0,1.0,0,丰田 轿车,丰田,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,12800,418644,5112.0,0.012343,,inf,0.032898,0.000505,0.001194,,,65.0625,0.032379,2570.0,0.775391,0.196655,0.578613,0.972168,2180.0,6.589844,-1170.0,0.687012,0.008484,inf,390.0,-170.0,0.0,inf,0.0,1.0,inf,inf,7166.666504,,705.882324,705.882324,,,,,,,2.142578,1.09082,0.011108,1.216797,0.023712,1.115234,0.012604,6192.0,40075,21400,1203.0,5192,165,29924,38719,518484,526.0,15440.0,16106.0,20,21152.0,293955.0,258,17549.0,12800,418644,2007,2,0,113,243,3120,764.5,69417.9375,33.78125,0.744629,3.25,0.143433,5.289062,2426.0,0.009033,2.943359,26.296875,0.375,808389.375,38.625,32.6875,1053.0,122923.820312,90.5625,53.375,2834.0,0.000983,0.01709,2.53125,1.774414,2610.0,0.701172,907765.625,20.65625,0.536133,139255.359375,3.775391,0.130859,1.727539,5.316406,17.71875,1.517578,0.000494,1.567383,20.109375,0.000349,2610.0,0.745117,1.519531,21.109375,0.00034,2720.0,0.706055,0.470215,0.63916,0.644531,0.639648,0.637207,0.605469,0.63623,0.73291,0.627441,0.63623,0.44751,0.671387,0.648926,0.667969,0.626465,,0.857422,0.668457,0.709473,0.816406,0.520996,0.662598,0.63916,0.64502,0.661621,0.626953,0.619141,0.624512,0.703613,0.645508,,0.632324,0.631836,0.860352,0.701172,
2,vfTADBw3uqyLukTz5juO0g==,217,商交,主全,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1454.0,0.600098,132800,6,0,2007-01-01 00:00:00,长城,长城汽车,长城CC6460KM60旅行车,哈弗,2.771484,5,非营业,非过户投保,434.75,26560.0,251.375,1000000,0.0,0.0,0.0,0,0.0,0,30.0,350.0,0.0,0.0,0.0,0.0,1695000.0,0.0,0.0,0.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,55.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,154.625,2,否,是,否,否,0.0,0.0,9.0,17.0,否,否,2,0.0,0.0,,16.0,1.0,0,长城 旅行车,长城,旅行车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,4143,418644,5144.0,0.010949,,858.852061,0.016373,0.000251,,,,65.125,0.016113,686.0,0.633789,0.366455,0.267334,1.0,336.0,1.959961,-768.0,0.471924,0.005165,4.4375,350.0,0.0,0.0,inf,,,,,4842.856934,,,,,,,,,,,0.0,-0.099976,0.0,-0.04657,2.146484,0.053406,6192.0,25550,25842,20960.0,981,3,3337,11440,6702,932.0,10128.0,2120.0,936,18735.0,293955.0,64,130634.0,4143,418644,2007,1,0,113,243,3120,764.5,69417.9375,33.78125,0.744629,3.25,0.143433,5.289062,2426.0,0.009033,2.943359,26.296875,0.375,808389.375,38.625,31.375,1187.0,120779.351562,83.3125,54.75,2824.0,0.000998,0.0168,2.519531,1.716797,2868.0,0.737305,908484.1875,19.953125,0.534668,166271.65625,3.625,0.186279,1.796875,5.332031,21.921875,1.517578,0.000282,1.567383,20.109375,0.000349,2610.0,0.745117,1.519531,21.109375,0.00034,2720.0,0.706055,0.468994,0.654297,0.682617,0.681152,0.697266,0.587402,0.70459,0.526367,0.538574,0.592285,0.453125,0.670898,0.678223,0.589844,0.657227,,0.839355,0.677734,0.518555,0.641113,0.394775,0.742676,0.654297,0.644043,0.606445,0.620117,0.619141,0.624512,0.703125,0.604492,,0.650879,0.666504,0.838379,0.701172,
3,zP5cmQ2nwzLbvocQPmf2YA==,217,商交,主全,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,3526.0,0.850098,316800,5,2,2015-02-09 00:00:00,奥迪,一汽大众,奥迪FV7201BACBG轿车,A6,1.984375,5,非营业,非过户投保,1651.0,202752.0,381.0,1000000,141.625,202752.0,17.921875,30000,45.46875,30000,27.0,9.898438,19.90625,0.0,170.0,0.0,1000000.0,1000000.0,0.0,120000.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,47.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,208.0,2,是,是,是,是,0.0,0.0,1.0,7.0,否,否,7,0.0,0.0,151786.0,0.0,1.0,0,奥迪 轿车,奥迪,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,6513,418644,2184.0,0.01113,,1523.076923,0.008141,0.000381,0.000699,0.000597,0.001515,21.375,0.007763,2238.0,0.737793,0.170288,0.567383,0.907715,2228.0,226.125,-1288.0,0.634766,0.007065,10.757812,-10.007812,-150.125,170.0,0.497314,0.117065,inf,4.25,0.0,101026.046875,50235.480469,705.882324,,50790.566406,2.01105,49529.597656,71.166931,0.028258,1260.96875,,0.0,-0.011108,0.0,-0.01918,0.579102,-0.008072,6192.0,12178,53772,3436.0,15475,264,8255,12178,518484,27.0,21344.0,345003.0,20,85566.0,52519.0,188,17549.0,6513,418644,2015,2,0,113,243,3120,764.5,69417.9375,33.78125,0.744629,3.25,0.143433,5.289062,2426.0,0.009033,2.943359,26.296875,0.375,808389.375,38.625,38.84375,1597.0,143701.59375,87.0,71.375,2848.0,0.000977,0.016983,2.408203,1.668945,2852.0,0.723145,932097.375,20.28125,0.545898,161642.203125,3.716797,0.152588,1.791016,5.371094,18.296875,1.554688,0.000406,1.567383,20.109375,0.000349,2610.0,0.745117,1.519531,21.109375,0.00034,2720.0,0.706055,0.470215,0.585938,0.594727,0.652832,0.594238,0.620117,0.67334,0.771973,0.67627,0.235352,0.44751,0.671387,0.577637,0.589844,0.61377,0.521973,0.417236,0.568359,0.739746,0.5,0.69043,0.662598,0.585938,0.594727,0.637695,0.654785,0.652344,0.649414,0.703613,0.592773,,0.633301,0.657227,0.419189,0.5,
4,+ruD5NLealUAfMZPQd6LEw==,217,单交,单交,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,522.5,,247800,3,0,2017-12-12 00:00:00,大众,上汽大众,大众汽车SVW6474CED多用途乘用车,途观,1.797852,5,非营业,非过户投保,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,yUh7960km3oydK6Km9rqRA==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,否,是,0.0,2,是,否,是,是,0.0,1.0,11.0,30.0,否,否,11,0.0,0.0,,1.0,1.0,0,大众汽车 多用途乘用车,大众汽车,多用途乘用车,否,yUh7960km3oydK6Km9rqRA==_eNP+WqbTmmD3bj49nIcSe...,单交_单交_非过户投保_否_否,8621,76268,1146.0,0.002109,,inf,,,,,,,,0.0,,,,,0.0,,-522.5,0.0,0.0,,0.0,0.0,0.0,,,,,,,,,,,,,,,,0.0,0.272705,-0.088867,0.405518,-0.048859,1.486328,0.040009,6192.0,96796,10607,7752.0,38063,523,9571,38524,158001,,15440.0,8673.0,96729,216811.0,293955.0,137,55995.0,8621,76268,2017,12,1,113,243,3120,764.5,69417.9375,33.78125,0.744629,3.25,0.143433,5.289062,2426.0,0.009033,2.943359,26.296875,0.375,808389.375,38.625,28.453125,773.0,89221.382812,122.5,43.53125,2806.0,0.001058,0.017197,2.607422,1.835938,2732.0,0.70752,872004.4375,19.1875,0.54834,150723.390625,3.742188,0.138428,1.719727,5.132812,,,,1.567383,20.109375,0.000349,2610.0,0.745117,1.519531,21.109375,0.00034,2720.0,0.706055,0.467773,0.708008,0.663086,0.68457,0.71875,0.669434,,0.394531,0.729492,,0.446045,0.671387,0.637695,0.625488,0.648438,,,0.640137,0.265381,,0.34082,0.437988,0.708008,0.692383,0.656738,0.700195,0.646973,0.624512,0.264893,0.630859,,0.63916,0.632324,0.71582,0.701172,0.265625


## 模型训练

In [27]:
# for f in list(df_feature.select_dtypes('object')):
#     if f in ['carid', 'regdate']:
#         continue
#     le = LabelEncoder()
#     df_feature[f] = le.fit_transform(
#         df_feature[f].astype('str')).astype('int')

In [28]:
#筛选特征
drop_fea = ['y1_is_purchase', 'regdate', 'carid', 'use_type', 'suiche_nonauto_nprem_18', 'suiche_nonauto_amount_18', 'client_no', 'num_notcar_claim', 'flag']
drop_fea += ['xz', 'xb', 'trademark_cn', 'brand_cn', 'make_cn', 'series', 'change_owner', 'p1_gender', 'p1_census_register', 'p2_marital_status', 'f1_child_flag', 'f2_posses_house_flag', 'w1_pc_wx_use_flag', 'p1_is_bank_eff', 'p2_is_enterprise_owner', 'p2_is_smeowner', 'p2_is_child_under_15_family', 'p2_is_adult_over_55_family', 'make_cn_cartype', 'make_cn_cartype_0', 'make_cn_cartype_1', 'p2_is_child_old_family', 'personas', 'xz_xb_co_15_55']
feature= [x for x in df_feature.columns if x not in drop_fea]
print(len(feature))

196


In [29]:
# cat_features = df_feature.columns[np.where(df_feature.dtypes == np.object)[0]].values.tolist()
# cat_features = [x for x in cat_features if x not in drop_fea]
# print(cat_features)
# df_feature[cat_features] = df_feature[cat_features].astype(str)

In [30]:
train_df = df_feature[df_feature['flag']==0]
test_df = df_feature[df_feature['flag']==2]
train_df = reduce_mem(df_train)
test_df = reduce_mem(df_test)

-- Mem. usage decreased to 525.33 Mb (36.6% reduction),time spend:0.12 min
-- Mem. usage decreased to 61.96 Mb (36.5% reduction),time spend:0.01 min


In [31]:
#   五折交叉
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
oof =  np.zeros(len(train_df)) 
predictions =np.zeros(len(test_df))
y_train = train_df['y1_is_purchase']
for i, (trn, val) in enumerate(folds.split(train_df.values,y_train.values)):
    print("Fold:",i+1)
    
    trn_x = train_df.iloc[trn][feature]
    trn_y = y_train.iloc[trn]
    val_x = train_df.iloc[val][feature]
    val_y = y_train.iloc[val]
    
    clf = CatBoostClassifier(iterations=4000, depth=8, learning_rate=0.05, l2_leaf_reg=50, loss_function='Logloss'
                        ,verbose=True,eval_metric='AUC',counter_calc_method='Full',task_type='CPU',devices='0-3',metric_period=50)
    clf.fit(
    trn_x[feature], trn_y.astype('int32'),
    eval_set=[(val_x[feature],val_y.astype('int32'))],
    early_stopping_rounds=50,
    # cat_features=cat_features,
    verbose=True,
    use_best_model=True)
    oof[val] = clf.predict_proba(train_df.iloc[val][feature])[:, 1]
    predictions += clf.predict_proba(test_df[feature])[:, 1] / folds.n_splits
    del trn_x
    del trn_y
    del val_x
    del val_y
    gc.collect()

Fold: 1




0:	test: 0.8596638	best: 0.8596638 (0)	total: 482ms	remaining: 32m 6s
50:	test: 0.8842940	best: 0.8842940 (50)	total: 20.1s	remaining: 25m 54s
100:	test: 0.8905904	best: 0.8905904 (100)	total: 39.5s	remaining: 25m 26s
150:	test: 0.8934273	best: 0.8934273 (150)	total: 58.3s	remaining: 24m 47s
200:	test: 0.8949397	best: 0.8949397 (200)	total: 1m 16s	remaining: 24m 12s
250:	test: 0.8959801	best: 0.8959801 (250)	total: 1m 35s	remaining: 23m 47s
300:	test: 0.8968862	best: 0.8968862 (300)	total: 1m 53s	remaining: 23m 11s
350:	test: 0.8978082	best: 0.8978082 (350)	total: 2m 11s	remaining: 22m 48s
400:	test: 0.8982968	best: 0.8982968 (400)	total: 2m 28s	remaining: 22m 12s
450:	test: 0.8986786	best: 0.8986786 (450)	total: 2m 44s	remaining: 21m 36s
500:	test: 0.8990815	best: 0.8990815 (500)	total: 3m 1s	remaining: 21m 7s
550:	test: 0.8993517	best: 0.8993517 (550)	total: 3m 17s	remaining: 20m 37s
600:	test: 0.8995539	best: 0.8995539 (600)	total: 3m 33s	remaining: 20m 8s
650:	test: 0.8997746	best:



0:	test: 0.8600589	best: 0.8600589 (0)	total: 406ms	remaining: 27m 2s
50:	test: 0.8839783	best: 0.8839783 (50)	total: 20.2s	remaining: 26m 2s
100:	test: 0.8900305	best: 0.8900305 (100)	total: 39.6s	remaining: 25m 28s
150:	test: 0.8929285	best: 0.8929285 (150)	total: 58.5s	remaining: 24m 51s
200:	test: 0.8945913	best: 0.8945913 (200)	total: 1m 17s	remaining: 24m 24s
250:	test: 0.8956271	best: 0.8956271 (250)	total: 1m 35s	remaining: 23m 53s
300:	test: 0.8966846	best: 0.8966846 (300)	total: 1m 53s	remaining: 23m 19s
350:	test: 0.8975920	best: 0.8975920 (350)	total: 2m 12s	remaining: 22m 57s
400:	test: 0.8981713	best: 0.8981713 (400)	total: 2m 29s	remaining: 22m 20s
450:	test: 0.8985795	best: 0.8985795 (450)	total: 2m 45s	remaining: 21m 43s
500:	test: 0.8988782	best: 0.8988782 (500)	total: 3m 1s	remaining: 21m 10s
550:	test: 0.8991788	best: 0.8991788 (550)	total: 3m 17s	remaining: 20m 38s
600:	test: 0.8994103	best: 0.8994103 (600)	total: 3m 34s	remaining: 20m 12s
650:	test: 0.8996147	best



0:	test: 0.8587300	best: 0.8587300 (0)	total: 376ms	remaining: 25m 4s
50:	test: 0.8840152	best: 0.8840152 (50)	total: 19.6s	remaining: 25m 15s
100:	test: 0.8898887	best: 0.8898887 (100)	total: 38.7s	remaining: 24m 52s
150:	test: 0.8926571	best: 0.8926571 (150)	total: 57s	remaining: 24m 11s
200:	test: 0.8943411	best: 0.8943411 (200)	total: 1m 14s	remaining: 23m 35s
250:	test: 0.8955715	best: 0.8955715 (250)	total: 1m 33s	remaining: 23m 9s
300:	test: 0.8965237	best: 0.8965237 (300)	total: 1m 51s	remaining: 22m 44s
350:	test: 0.8973484	best: 0.8973484 (350)	total: 2m 8s	remaining: 22m 18s
400:	test: 0.8979377	best: 0.8979377 (400)	total: 2m 26s	remaining: 21m 52s
450:	test: 0.8984024	best: 0.8984024 (450)	total: 2m 43s	remaining: 21m 23s
500:	test: 0.8986465	best: 0.8986465 (500)	total: 2m 58s	remaining: 20m 44s
550:	test: 0.8988131	best: 0.8988131 (550)	total: 3m 12s	remaining: 20m 5s
600:	test: 0.8990335	best: 0.8990335 (600)	total: 3m 28s	remaining: 19m 39s
650:	test: 0.8992831	best: 0



0:	test: 0.8612472	best: 0.8612472 (0)	total: 388ms	remaining: 25m 50s
50:	test: 0.8858136	best: 0.8858136 (50)	total: 20s	remaining: 25m 46s
100:	test: 0.8917082	best: 0.8917082 (100)	total: 39.6s	remaining: 25m 27s
150:	test: 0.8945008	best: 0.8945008 (150)	total: 58.1s	remaining: 24m 39s
200:	test: 0.8960048	best: 0.8960048 (200)	total: 1m 16s	remaining: 23m 59s
250:	test: 0.8970963	best: 0.8970963 (250)	total: 1m 34s	remaining: 23m 30s
300:	test: 0.8979393	best: 0.8979393 (300)	total: 1m 52s	remaining: 22m 58s
350:	test: 0.8986907	best: 0.8986907 (350)	total: 2m 9s	remaining: 22m 26s
400:	test: 0.8993557	best: 0.8993557 (400)	total: 2m 27s	remaining: 22m
450:	test: 0.8998238	best: 0.8998238 (450)	total: 2m 44s	remaining: 21m 34s
500:	test: 0.8998864	best: 0.8998864 (498)	total: 2m 57s	remaining: 20m 42s
550:	test: 0.9001296	best: 0.9001296 (550)	total: 3m 13s	remaining: 20m 8s
600:	test: 0.9004011	best: 0.9004011 (600)	total: 3m 29s	remaining: 19m 43s
650:	test: 0.9006252	best: 0.9



0:	test: 0.8607176	best: 0.8607176 (0)	total: 391ms	remaining: 26m 2s
50:	test: 0.8852448	best: 0.8852448 (50)	total: 19.6s	remaining: 25m 16s
100:	test: 0.8912897	best: 0.8912897 (100)	total: 38.6s	remaining: 24m 49s
150:	test: 0.8938547	best: 0.8938547 (150)	total: 56.8s	remaining: 24m 7s
200:	test: 0.8956777	best: 0.8956777 (200)	total: 1m 15s	remaining: 23m 40s
250:	test: 0.8966468	best: 0.8966468 (250)	total: 1m 32s	remaining: 23m 8s
300:	test: 0.8975767	best: 0.8975767 (300)	total: 1m 50s	remaining: 22m 36s
350:	test: 0.8983359	best: 0.8983359 (350)	total: 2m 7s	remaining: 22m 8s
400:	test: 0.8988699	best: 0.8988699 (400)	total: 2m 25s	remaining: 21m 42s
450:	test: 0.8993487	best: 0.8993487 (450)	total: 2m 42s	remaining: 21m 22s
500:	test: 0.8997113	best: 0.8997113 (500)	total: 3m	remaining: 20m 58s
550:	test: 0.8999931	best: 0.8999931 (550)	total: 3m 16s	remaining: 20m 29s
600:	test: 0.9002273	best: 0.9002273 (600)	total: 3m 32s	remaining: 19m 59s
650:	test: 0.9004657	best: 0.90

In [32]:
auc_score =roc_auc_score(y_train, oof)
print("AUC Score (Valid): %f" % auc_score)
print('开始储存')
res = pd.DataFrame()
res['carid'] = test_df['carid']
res['label'] = predictions

AUC Score (Valid): 0.902124
开始储存


In [33]:
os.makedirs('sub_catboost', exist_ok=True)
res.to_csv('sub_catboost/cat_seed_+'+str(seed)+'_test_{}.csv'.format(auc_score),index = False)
print(res.shape)
res.head(5)

(80110, 2)


Unnamed: 0,carid,label
684283,FbOikOdqe5f3mRYDAgnBH2PwI5I+egmzWyNwjmgAuWs=,0.000144
684284,WTO/cku1nHO592k9j56on2UzMmx8OLhw8peccj1m13I=,0.525896
684285,ow79MMeuFgFY92UOVjaECsaNPl5cRXAi3M5ZsB4Rt/s=,0.303149
684286,nuO8DDjdXKFMt5Of70LlXMlFoLDX0OMSSBYnNYnqTyQ=,0.903002
684287,j4gIDul5h/7IBEYq4y8oAr2+tSWj/NdsIFbGzDtpTsk=,0.517146


In [34]:
print('开始储存oof')
res_oof = pd.DataFrame()
res_oof['carid'] = train_df['carid']
res_oof['y1_is_purchase'] = train_df['y1_is_purchase']
res_oof['probability'] = oof
res_oof.to_csv('sub_catboost/cat_seed_+'+str(seed)+'_train_pred_{}.csv'.format(auc_score),index = False)
print(res_oof.shape)
res_oof.head()

开始储存oof
(684283, 3)


Unnamed: 0,carid,y1_is_purchase,probability
0,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,0.0,0.000121
1,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,1.0,0.894851
2,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1.0,0.808837
3,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,1.0,0.450337
4,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,1.0,0.995317


In [35]:
#查看模型的特征重要性
import matplotlib.pyplot as plt 
from matplotlib import cm
score = pd.DataFrame()
score['fea_name'] = clf.feature_names_
score['fea']=clf.feature_importances_
score = score.sort_values(['fea'], ascending=False)
temp = pd.DataFrame()
temp = score[:320]
color = cm.jet(temp['fea']/temp['fea'].max())
plt.figure(figsize=(10, 50))
plt.barh(temp['fea_name'],temp['fea'],height =0.8,color=color,alpha=0.8)
plt.show()

## 分特征打印特征重要性

In [36]:
# train_df[['app_first_class','app_score','app_second_class','career','city_rank','communication_avgonline_30d']].head(10)

In [37]:
show_list = []
for s in train_df.columns:
    if 'enc_mean' in s:
        show_list.append(s)
print(show_list)
show=score[score['fea_name'].isin(show_list)]
show

['dpt_target_enc_mean', 'make_cn_cartype_target_enc_mean', 'trademark_cn_target_enc_mean', 'brand_cn_target_enc_mean', 'series_target_enc_mean', 'capab_target_enc_mean', 'ncd_ly_target_enc_mean', 'nprem_ly_target_enc_mean', 'make_cn_count_target_enc_mean', 'tp_ratio_target_enc_mean', 'p1_census_register_target_enc_mean', 'p2_client_grade_target_enc_mean', 'active_90_365_ratio_target_enc_mean', 'active_30_365_ratio_target_enc_mean', 'service_score_available_target_enc_mean', 'suiche_nonauto_nprem_19_ratio_target_enc_mean', 'suiche_nonauto_nprem_20_ratio_target_enc_mean', 'active_90_365_diff_target_enc_mean', 'nprem_lyratio_target_enc_mean', 'nprem_20ratio_target_enc_mean', 'regdays_target_enc_mean', 'p1_prior_days_to_insure_target_enc_mean', 'make_cn_cartype_count_target_enc_mean', 'make_cn_cartype_0_count_target_enc_mean', 'personas_count_target_enc_mean', 'newvalue_target_enc_mean', 'bi_renewal_year_target_enc_mean', 'clmnum_target_enc_mean', 'si_tp_target_enc_mean', 'active_30_365_di

Unnamed: 0,fea_name,fea
193,suiche_nonauto_amount_20_count_target_enc_mean,13.048105
176,suiche_nonauto_nprem_20_ratio_target_enc_mean,6.703674
179,nprem_20ratio_target_enc_mean,3.111958
194,suiche_nonauto_amount_19_count_target_enc_mean,2.367506
171,p2_client_grade_target_enc_mean,1.825526
181,p1_prior_days_to_insure_target_enc_mean,1.473869
169,tp_ratio_target_enc_mean,1.297895
166,ncd_ly_target_enc_mean,1.020176
160,dpt_target_enc_mean,0.78423
180,regdays_target_enc_mean,0.683811


## 提交

In [38]:
# !wget -nv -O heywhale_submit https://cdn.kesci.com/submit_tool/v4/heywhale_submit&&chmod +x heywhale_submit
# !./heywhale_submit -token cd406c12be9ca39d -file /home/mw/work/cat_seed_+2020_test_pred_0.9054288129991612.csv