In [1]:
import pandas as pd
from tqdm import tqdm
import warnings
import gc
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import time
from itertools import combinations

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')



In [2]:
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df

In [3]:
seed = 1024

In [4]:
df_train = pd.read_csv('/home/mw/input/pre8881/train.csv')
# df_test = pd.read_csv('/home/mw/input/pretest_a3048/test_a.csv')
df_test_b = pd.read_csv('/home/mw/input/pretest_b6354/test_b.csv')
df_train['flag'], df_test_b['flag'] = 0, 2
df_train.shape, df_test_b.shape

((684283, 66), (80110, 65))

## 采样

In [5]:
# train_label_0 = df_train_all[ df_train_all['y1_is_purchase'] == 0 ]
# train_label_1 = df_train_all[ df_train_all['y1_is_purchase'] == 1 ]
# print(train_label_0.shape, train_label_1.shape)

# train_label_0 = train_label_0.sample(n = 253759 , random_state = seed).reset_index(drop=True)
# train_label_1 = train_label_1.sample(n = 430524 , random_state = seed).reset_index(drop=True)
# print(train_label_0.shape, train_label_1.shape)

# df_train = pd.concat([train_label_0,train_label_1]).reset_index(drop=True)
# df_train = df_train.sample(frac = 1, random_state = seed+1 ).reset_index(drop=True)
# print(df_train.shape)
# print(df_train['y1_is_purchase'].value_counts())

In [6]:
df_feature = pd.concat([df_train,df_test_b]).reset_index(drop=True)
df_feature = reduce_mem(df_feature)

del df_train, df_test_b
gc.collect()
df_feature.head()

-- Mem. usage decreased to 217.24 Mb (43.6% reduction),time spend:0.01 min


Unnamed: 0,client_no,dpt,xz,xb,carid,nprem_ly,ncd_ly,newvalue,bi_renewal_year,clmnum,regdate,trademark_cn,brand_cn,make_cn,series,capab,seats,use_type,change_owner,nprem_od,si_od,nprem_tp,si_tp,nprem_bt,si_bt,nprem_vld,si_vld,nprem_vlp,si_vlp,p1_prior_days_to_insure,suiche_nonauto_nprem_20,suiche_nonauto_nprem_19,suiche_nonauto_nprem_18,suiche_nonauto_nprem_17,suiche_nonauto_nprem_16,suiche_nonauto_amount_20,suiche_nonauto_amount_19,suiche_nonauto_amount_18,suiche_nonauto_amount_17,suiche_nonauto_amount_16,num_notcar_claim,p1_gender,p1_age,p1_census_register,p2_marital_status,f1_child_flag,f2_posses_house_flag,f2_cust_housing_price_total,p2_client_grade,w1_pc_wx_use_flag,p1_is_bank_eff,p2_is_enterprise_owner,p2_is_smeowner,active_7,active_30,active_90,active_365,p2_is_child_under_15_family,p2_is_adult_over_55_family,birth_month,p1_service_offer_cnt,p3_service_use_cnt,dur_personal_insurance_90,service_score_available,y1_is_purchase,flag
0,5gDljzjQ61m/QeU2tZBgDA==,217,商交,主全,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,1391.0,0.600098,88900,6,0,2010-02-09 00:00:00,福特,长安福特马自达,福特CAF7152A轿车,嘉年华,1.498047,5,非营业,非过户投保,379.0,24892.0,239.25,1000000,0.0,0.0,3.75,10000,9.507812,10000,30.0,,,,,,,,,,,,jh4mxXNEalwumcCWUJdnBw==,56.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,车主俱乐部-钻石客户-2,是,否,,,0.0,0.0,3.0,32.0,否,是,6月,0.0,0.0,,0.0,0.0,0
1,qTsiFUfrw8gwVOM+LftPvA==,217,商交,主全,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,3740.0,1.0,303000,6,0,2007-02-02 00:00:00,丰田,天津丰田,丰田TV7250RoyalA轿车,皇冠,2.496094,5,非营业,非过户投保,1993.0,60600.0,505.5,1000000,72.375,60600.0,0.0,0,0.0,0,27.0,390.0,0.0,0.0,170.0,170.0,2795000.0,0.0,0.0,120000.0,120000.0,,jh4mxXNEalwumcCWUJdnBw==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,车主俱乐部-钻石客户-2,是,否,是,是,2.0,4.0,11.0,40.0,否,否,9月,0.0,0.0,,10.0,1.0,0
2,vfTADBw3uqyLukTz5juO0g==,217,商交,主全,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1454.0,0.600098,132800,6,0,2007-01-01 00:00:00,长城,长城汽车,长城CC6460KM60旅行车,哈弗,2.771484,5,非营业,非过户投保,434.75,26560.0,251.375,1000000,0.0,0.0,0.0,0,0.0,0,30.0,350.0,0.0,0.0,0.0,0.0,1695000.0,0.0,0.0,0.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,55.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,154.625,车主俱乐部-钻石客户-2,否,是,否,否,0.0,0.0,9.0,17.0,否,否,2月,0.0,0.0,,16.0,1.0,0
3,zP5cmQ2nwzLbvocQPmf2YA==,217,商交,主全,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,3526.0,0.850098,316800,5,2,2015-02-09 00:00:00,奥迪,一汽大众,奥迪FV7201BACBG轿车,A6,1.984375,5,非营业,非过户投保,1651.0,202752.0,381.0,1000000,141.625,202752.0,17.921875,30000,45.46875,30000,27.0,9.898438,19.90625,0.0,170.0,0.0,1000000.0,1000000.0,0.0,120000.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,47.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,208.0,车主俱乐部-钻石客户-2,是,是,是,是,0.0,0.0,1.0,7.0,否,否,7月,0.0,0.0,151786.0,0.0,1.0,0
4,+ruD5NLealUAfMZPQd6LEw==,217,单交,单交,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,522.5,,247800,3,0,2017-12-12 00:00:00,大众,上汽大众,大众汽车SVW6474CED多用途乘用车,途观,1.797852,5,非营业,非过户投保,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,yUh7960km3oydK6Km9rqRA==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,否,是,0.0,车主俱乐部-钻石客户-2,是,否,是,是,0.0,1.0,11.0,30.0,否,否,11月,0.0,0.0,,1.0,1.0,0


## 数据处理

In [7]:
# 异常值，长尾特征处理
# df_feature['p1_prior_days_to_insure'] = df_feature['p1_prior_days_to_insure'].apply(lambda x: -1 if x<0 else x)
# df_feature['p1_prior_days_to_insure'] = df_feature['p1_prior_days_to_insure'].apply(lambda x: 91 if x>90 else x)
# df_feature['p1_age'] = df_feature['p1_age'].apply(lambda x: 80 if x>=80 else x)
# df_feature['p1_age'] = df_feature['p1_age'].apply(lambda x: 18 if x<=18 else x)
# df_feature['p1_service_offer_cnt'] = df_feature['p1_service_offer_cnt'].apply(lambda x: 0 if x>=100 or x<0 else x)
# df_feature['p1_service_offer_cnt'] = df_feature['p1_service_offer_cnt'].apply(lambda x: 25 if x>=25 else x)
# df_feature['p1_service_offer_cnt'] = df_feature['p1_service_offer_cnt'].apply(lambda x: 20 if x>=20 and x<25 else x)
# df_feature['p3_service_use_cnt'] = df_feature['p3_service_use_cnt'].apply(lambda x: 0 if x>=80 or x<0 else x)
# df_feature['p3_service_use_cnt'] = df_feature['p3_service_use_cnt'].apply(lambda x: 15 if x>=15 else x)
# df_feature['p3_service_use_cnt'] = df_feature['p3_service_use_cnt'].apply(lambda x: 10 if x>=10 and x<15 else x)
# df_feature['p2_client_grade'].value_counts()

## 特征工程

In [8]:
df_feature.loc[df_feature['p2_client_grade'].isna(), 'p2_client_grade'] = 0
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-黑钻客户-2', 'p2_client_grade'] = 1
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-钻石客户-2', 'p2_client_grade'] = 2
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-铂金客户-2', 'p2_client_grade'] = 3
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-黄金客户-2', 'p2_client_grade'] = 4
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-白银客户-2', 'p2_client_grade'] = 5
df_feature.loc[df_feature['p2_client_grade']=='车主俱乐部-黄铜客户-2', 'p2_client_grade'] = 6
df_feature['p2_client_grade'] = df_feature['p2_client_grade'].astype('int')

In [9]:
df_feature['make_cn_cartype'] = df_feature['make_cn'].str.replace('([A-Za-z0-9()-ⅡⅢ]+)', ' ')
df_feature['make_cn_cartype_0'] = df_feature['make_cn_cartype'].apply(lambda x: x.split(' ')[0])
df_feature['make_cn_cartype_1'] = df_feature['make_cn_cartype'].apply(lambda x: x.split(' ')[-1])
print(df_feature['make_cn'].nunique(), df_feature['make_cn_cartype'].nunique(), df_feature['make_cn_cartype_0'].nunique(), df_feature['make_cn_cartype_1'].nunique())
df_feature[['brand_cn', 'make_cn_cartype_0', 'make_cn_cartype_1']][:5]

11615 1035 449 60


Unnamed: 0,brand_cn,make_cn_cartype_0,make_cn_cartype_1
0,长安福特马自达,福特,轿车
1,天津丰田,丰田,轿车
2,长城汽车,长城,旅行车
3,一汽大众,奥迪,轿车
4,上汽大众,大众汽车,多用途乘用车


In [10]:
# 老人或小孩家庭
df_feature['p2_is_child_old_family'] = '否'
df_feature['p2_is_child_old_family']=np.where((df_feature['p2_is_child_under_15_family'] == '是') | (df_feature['p2_is_adult_over_55_family'] == '是'),'是',df_feature['p2_is_child_old_family'])
df_feature['p2_is_child_old_family'].value_counts()

否    656986
是    107407
Name: p2_is_child_old_family, dtype: int64

In [11]:
df_feature['personas'] = df_feature['p1_gender'].astype(str)+'_'+df_feature['p2_marital_status'].astype(str)+'_'+df_feature['f1_child_flag'].astype(str)+'_'+\
                        df_feature['f2_posses_house_flag'].astype(str)+'_'+df_feature['w1_pc_wx_use_flag'].astype(str)+'_'+df_feature['p1_is_bank_eff'].astype(str)
df_feature['xz_xb_co_15_55'] = df_feature['xz'].astype(str)+'_'+df_feature['xb'].astype(str)+'_'+df_feature['change_owner'].astype(str)+'_'+\
                        df_feature['p2_is_child_under_15_family'].astype(str)+'_'+df_feature['p2_is_adult_over_55_family'].astype(str)
df_feature['personas_cno_count'] = df_feature.groupby('personas')['client_no'].transform('count') 
# df_feature['xz_xb_co_15_55_cno_count'] = df_feature.groupby('xz_xb_co_15_55')['client_no'].transform('count') 

In [12]:
df_feature['regdays'] = (pd.to_datetime('2021-1-31') - pd.to_datetime(df_feature['regdate'])) / pd.Timedelta(days=1)
df_feature['npremly_car_value_ratio'] = df_feature['nprem_ly'] / df_feature['newvalue']
df_feature['p3_service_use_ratio'] = df_feature['p3_service_use_cnt'] / df_feature['p1_service_offer_cnt']
df_feature['car_housing_value_ratio'] = df_feature['newvalue'] / df_feature['f2_cust_housing_price_total']
# df_feature.head()

In [13]:
df_feature['od_ratio'] = df_feature['nprem_od'] / df_feature['si_od']
df_feature['tp_ratio'] = df_feature['nprem_tp'] / df_feature['si_tp']
df_feature['bt_ratio'] = df_feature['nprem_bt'] / df_feature['si_bt'] # just so so
df_feature['vld_ratio'] = df_feature['nprem_vld'] / df_feature['si_vld']
df_feature['vlp_ratio'] = df_feature['nprem_vlp'] / df_feature['si_vlp']
df_feature['od_tp_ratio_2ord'] = df_feature['od_ratio'] / df_feature['tp_ratio']
df_feature['od_tp_ratio_2diff'] = df_feature['od_ratio'] - df_feature['tp_ratio']

df_feature['nprem_tot'] = df_feature['nprem_od'] + df_feature['nprem_tp'] + df_feature['nprem_bt'] + df_feature['nprem_vld'] + df_feature['nprem_vlp']
df_feature['nprem_od_percent'] = df_feature['nprem_od'] / df_feature['nprem_tot']
df_feature['nprem_tp_percent'] = df_feature['nprem_tp'] / df_feature['nprem_tot']
df_feature['nprem_odtp_percent_diff'] = df_feature['nprem_od_percent'] - df_feature['nprem_tp_percent']
df_feature['nprem_odtp_percent_add'] = (df_feature['nprem_od'] + df_feature['nprem_tp']) / df_feature['nprem_tot']

df_feature['nprem_20diff'] = df_feature['nprem_tot'] - df_feature['suiche_nonauto_nprem_20']
df_feature['nprem_20ratio'] = df_feature['nprem_tot'] / df_feature['suiche_nonauto_nprem_20']
df_feature['nprem_lydiff'] = df_feature['nprem_tot'] - df_feature['nprem_ly']
df_feature['nprem_lyratio'] = df_feature['nprem_tot'] / df_feature['nprem_ly']
# df_feature['20_lydiff'] = df_feature['suiche_nonauto_nprem_20'] - df_feature['nprem_ly']
# df_feature['20_lyratio'] = df_feature['suiche_nonauto_nprem_20'] / df_feature['nprem_ly']

df_feature['npremtot_car_value_ratio'] = df_feature['nprem_tot'] / df_feature['newvalue']
df_feature['npremtot_housing_value_ratio'] = df_feature['nprem_tot'] / df_feature['f2_cust_housing_price_total']

In [14]:
df_feature['suiche_nonauto_nprem_20_ydiff'] = df_feature['suiche_nonauto_nprem_20'] - df_feature['suiche_nonauto_nprem_19']
df_feature['suiche_nonauto_nprem_19_2ydiff'] = df_feature['suiche_nonauto_nprem_19'] - df_feature['suiche_nonauto_nprem_17']
df_feature['suiche_nonauto_nprem_17_ydiff'] = df_feature['suiche_nonauto_nprem_17'] - df_feature['suiche_nonauto_nprem_16']
df_feature['suiche_nonauto_nprem_20_yratio'] = df_feature['suiche_nonauto_nprem_20'] / df_feature['suiche_nonauto_nprem_19']
df_feature['suiche_nonauto_nprem_19_2yratio'] = df_feature['suiche_nonauto_nprem_19'] / df_feature['suiche_nonauto_nprem_17']
df_feature['suiche_nonauto_nprem_17_yratio'] = df_feature['suiche_nonauto_nprem_17'] / df_feature['suiche_nonauto_nprem_16']
df_feature['suiche_nonauto_nprem_20_19_yratio_2ord'] = df_feature['suiche_nonauto_nprem_20_yratio'] / df_feature['suiche_nonauto_nprem_19_2yratio']
df_feature['suiche_nonauto_nprem_20_17_yratio_2ord'] = df_feature['suiche_nonauto_nprem_20_yratio'] / df_feature['suiche_nonauto_nprem_17_yratio']

df_feature['suiche_nonauto_nprem_20_ratio'] = df_feature['suiche_nonauto_amount_20'] / df_feature['suiche_nonauto_nprem_20']
df_feature['suiche_nonauto_nprem_19_ratio'] = df_feature['suiche_nonauto_amount_19'] / df_feature['suiche_nonauto_nprem_19']
# df_feature['suiche_nonauto_nprem_18_ratio'] = df_feature['suiche_nonauto_amount_18'] / df_feature['suiche_nonauto_nprem_18'] #该年数据有异常
df_feature['suiche_nonauto_nprem_17_ratio'] = df_feature['suiche_nonauto_amount_17'] / df_feature['suiche_nonauto_nprem_17']
df_feature['suiche_nonauto_nprem_20_19_ratio_2diff'] = df_feature['suiche_nonauto_nprem_20_ratio'] - df_feature['suiche_nonauto_nprem_19_ratio']
df_feature['suiche_nonauto_nprem_20_19_ratio_2ord'] = df_feature['suiche_nonauto_nprem_20_ratio'] / df_feature['suiche_nonauto_nprem_19_ratio']
# df_feature['suiche_nonauto_nprem_19_17_ratio_2diff'] = df_feature['suiche_nonauto_nprem_19_ratio'] - df_feature['suiche_nonauto_nprem_17_ratio']
# df_feature['suiche_nonauto_nprem_19_17_ratio_2ord'] = df_feature['suiche_nonauto_nprem_19_ratio'] / df_feature['suiche_nonauto_nprem_17_ratio']
# df_feature['suiche_nonauto_nprem_20_17_ratio_3ord'] = df_feature['suiche_nonauto_nprem_20_19_ratio_2ord'] / df_feature['suiche_nonauto_nprem_19_17_ratio_2ord']
# df_feature['suiche_nonauto_nprem_20_17_ratio_3diff'] = df_feature['suiche_nonauto_nprem_20_19_ratio_2diff'] - df_feature['suiche_nonauto_nprem_19_17_ratio_2diff']

df_feature['active_7_30_ratio'] = (df_feature['active_7'] / 7) / (df_feature['active_30'] / 30) # just so so
df_feature['active_30_90_ratio'] = (df_feature['active_30'] / 30) / (df_feature['active_90'] / 90)
df_feature['active_30_90_diff'] = (df_feature['active_30'] / 30) - (df_feature['active_90'] / 90)
df_feature['active_30_365_ratio'] = (df_feature['active_30'] / 30) / (df_feature['active_365'] / 365)
df_feature['active_30_365_diff'] = (df_feature['active_30'] / 30) - (df_feature['active_365'] / 365)
df_feature['active_90_365_ratio'] = (df_feature['active_90'] / 90) / (df_feature['active_365'] / 365)
df_feature['active_90_365_diff'] = (df_feature['active_90'] / 90) - (df_feature['active_365'] / 365)

In [15]:
# 计数 count编码
count_fea_list = [['p1_census_register'], ['trademark_cn'], ['brand_cn'], ['series'], ['capab'], ['make_cn'], ['make_cn_cartype'], ['make_cn_cartype_0'], ['make_cn_cartype_1']]
count_fea_list += [['tp_ratio'], ['p1_age'], ['service_score_available'], ['nprem_tp'], ['suiche_nonauto_amount_20'], ['suiche_nonauto_amount_19'], ['nprem_ly'], ['p1_prior_days_to_insure']]
count_fea_list += [['personas'], ['xz_xb_co_15_55']]

for f in count_fea_list:
    df_temp = df_feature.groupby(f).size().reset_index()
    df_temp.columns = f + ['{}_count'.format('_'.join(f))]
    df_feature = df_feature.merge(df_temp, how='left')
    # 等价写法
    # df_feature[f + '_count'] = df_feature[f].map(df_feature[f].value_counts())

In [16]:
df_feature['birth_month'] = df_feature['birth_month'].apply(lambda x: int(x[:-1]) if type(x) != float else 0)
df_feature['reg_year'] = df_feature['regdate'].apply(lambda x: int(x[:4]) if type(x) != float else 0)
df_feature['reg_month'] = df_feature['regdate'].apply(lambda x: int(x[5:7]) if type(x) != float else 0)
df_feature['reg_day'] = df_feature['regdate'].apply(lambda x: int(x[8:9]) if type(x) != float else 0)

In [17]:
df_feature[~df_feature['y1_is_purchase'].isnull()].head()

Unnamed: 0,client_no,dpt,xz,xb,carid,nprem_ly,ncd_ly,newvalue,bi_renewal_year,clmnum,regdate,trademark_cn,brand_cn,make_cn,series,capab,seats,use_type,change_owner,nprem_od,si_od,nprem_tp,si_tp,nprem_bt,si_bt,nprem_vld,si_vld,nprem_vlp,si_vlp,p1_prior_days_to_insure,suiche_nonauto_nprem_20,suiche_nonauto_nprem_19,suiche_nonauto_nprem_18,suiche_nonauto_nprem_17,suiche_nonauto_nprem_16,suiche_nonauto_amount_20,suiche_nonauto_amount_19,suiche_nonauto_amount_18,suiche_nonauto_amount_17,suiche_nonauto_amount_16,num_notcar_claim,p1_gender,p1_age,p1_census_register,p2_marital_status,f1_child_flag,f2_posses_house_flag,f2_cust_housing_price_total,p2_client_grade,w1_pc_wx_use_flag,p1_is_bank_eff,p2_is_enterprise_owner,p2_is_smeowner,active_7,active_30,active_90,active_365,p2_is_child_under_15_family,p2_is_adult_over_55_family,birth_month,p1_service_offer_cnt,p3_service_use_cnt,dur_personal_insurance_90,service_score_available,y1_is_purchase,flag,make_cn_cartype,make_cn_cartype_0,make_cn_cartype_1,p2_is_child_old_family,personas,xz_xb_co_15_55,personas_cno_count,regdays,npremly_car_value_ratio,p3_service_use_ratio,car_housing_value_ratio,od_ratio,tp_ratio,bt_ratio,vld_ratio,vlp_ratio,od_tp_ratio_2ord,od_tp_ratio_2diff,nprem_tot,nprem_od_percent,nprem_tp_percent,nprem_odtp_percent_diff,nprem_odtp_percent_add,nprem_20diff,nprem_20ratio,nprem_lydiff,nprem_lyratio,npremtot_car_value_ratio,npremtot_housing_value_ratio,suiche_nonauto_nprem_20_ydiff,suiche_nonauto_nprem_19_2ydiff,suiche_nonauto_nprem_17_ydiff,suiche_nonauto_nprem_20_yratio,suiche_nonauto_nprem_19_2yratio,suiche_nonauto_nprem_17_yratio,suiche_nonauto_nprem_20_19_yratio_2ord,suiche_nonauto_nprem_20_17_yratio_2ord,suiche_nonauto_nprem_20_ratio,suiche_nonauto_nprem_19_ratio,suiche_nonauto_nprem_17_ratio,suiche_nonauto_nprem_20_19_ratio_2diff,suiche_nonauto_nprem_20_19_ratio_2ord,active_7_30_ratio,active_30_90_ratio,active_30_90_diff,active_30_365_ratio,active_30_365_diff,active_90_365_ratio,active_90_365_diff,p1_census_register_count,trademark_cn_count,brand_cn_count,series_count,capab_count,make_cn_count,make_cn_cartype_count,make_cn_cartype_0_count,make_cn_cartype_1_count,tp_ratio_count,p1_age_count,service_score_available_count,nprem_tp_count,suiche_nonauto_amount_20_count,suiche_nonauto_amount_19_count,nprem_ly_count,p1_prior_days_to_insure_count,personas_count,xz_xb_co_15_55_count,reg_year,reg_month,reg_day
0,5gDljzjQ61m/QeU2tZBgDA==,217,商交,主全,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,1391.0,0.600098,88900,6,0,2010-02-09 00:00:00,福特,长安福特马自达,福特CAF7152A轿车,嘉年华,1.498047,5,非营业,非过户投保,379.0,24892.0,239.25,1000000,0.0,0.0,3.75,10000,9.507812,10000,30.0,,,,,,,,,,,,jh4mxXNEalwumcCWUJdnBw==,56.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,,,0.0,0.0,3.0,32.0,否,是,6,0.0,0.0,,0.0,0.0,0,福特 轿车,福特,轿车,是,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_是,12800,4009.0,0.015647,,inf,0.015226,0.000239,,0.000375,0.000951,63.639605,0.014987,631.5,0.600098,0.378906,0.221191,0.978516,,,-759.5,0.454102,0.007103,inf,,,,,,,,,,,,,,,0.0,-0.033325,0.0,-0.087671,0.380116,-0.054346,6193.0,29781,15325,2014.0,45294,60,7207,8583,518484,361.0,9339.0,345003.0,363,,,123,130634.0,12800,50781,2010,2,0
1,qTsiFUfrw8gwVOM+LftPvA==,217,商交,主全,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,3740.0,1.0,303000,6,0,2007-02-02 00:00:00,丰田,天津丰田,丰田TV7250RoyalA轿车,皇冠,2.496094,5,非营业,非过户投保,1993.0,60600.0,505.5,1000000,72.375,60600.0,0.0,0,0.0,0,27.0,390.0,0.0,0.0,170.0,170.0,2795000.0,0.0,0.0,120000.0,120000.0,,jh4mxXNEalwumcCWUJdnBw==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,是,是,2.0,4.0,11.0,40.0,否,否,9,0.0,0.0,,10.0,1.0,0,丰田 轿车,丰田,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,12800,5112.0,0.012343,,inf,0.032888,0.000505,0.001194,,,65.059922,0.032382,2570.0,0.775391,0.196655,0.578613,0.972168,2180.0,6.589844,-1170.0,0.687012,0.008482,inf,390.0,-170.0,0.0,inf,0.0,1.0,inf,inf,7166.666504,,705.882324,,,2.142578,1.09082,0.011108,1.21637,0.023712,1.115005,0.012603,6193.0,40075,21400,1203.0,5192,165,29924,38719,518484,526.0,15440.0,16106.0,20,21152.0,293955.0,258,17549.0,12800,418644,2007,2,0
2,vfTADBw3uqyLukTz5juO0g==,217,商交,主全,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1454.0,0.600098,132800,6,0,2007-01-01 00:00:00,长城,长城汽车,长城CC6460KM60旅行车,哈弗,2.771484,5,非营业,非过户投保,434.75,26560.0,251.375,1000000,0.0,0.0,0.0,0,0.0,0,30.0,350.0,0.0,0.0,0.0,0.0,1695000.0,0.0,0.0,0.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,55.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,154.625,2,否,是,否,否,0.0,0.0,9.0,17.0,否,否,2,0.0,0.0,,16.0,1.0,0,长城 旅行车,长城,旅行车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,4143,5144.0,0.010949,,858.852061,0.016369,0.000251,,,,65.116259,0.016117,686.0,0.633789,0.366455,0.267334,1.0,336.0,1.959961,-768.0,0.471924,0.005166,4.4375,350.0,0.0,0.0,inf,,,,,4842.856934,,,,,,0.0,-0.099976,0.0,-0.046575,2.146535,0.0534,6193.0,25550,25842,20960.0,981,3,3337,11440,6702,932.0,10131.0,2120.0,936,18735.0,293955.0,64,130634.0,4143,418644,2007,1,0
3,zP5cmQ2nwzLbvocQPmf2YA==,217,商交,主全,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,3526.0,0.850098,316800,5,2,2015-02-09 00:00:00,奥迪,一汽大众,奥迪FV7201BACBG轿车,A6,1.984375,5,非营业,非过户投保,1651.0,202752.0,381.0,1000000,141.625,202752.0,17.921875,30000,45.46875,30000,27.0,9.898438,19.90625,0.0,170.0,0.0,1000000.0,1000000.0,0.0,120000.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,47.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,208.0,2,是,是,是,是,0.0,0.0,1.0,7.0,否,否,7,0.0,0.0,151786.0,0.0,1.0,0,奥迪 轿车,奥迪,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,6513,2183.0,0.01113,,1523.076923,0.008143,0.000381,0.000699,0.000597,0.001516,21.37258,0.007762,2238.0,0.737793,0.170288,0.567383,0.907715,2228.0,226.125,-1288.0,0.634766,0.007064,10.757812,-10.007812,-150.125,170.0,0.497314,0.117065,inf,4.25,0.0,101026.046875,50235.480469,705.882324,50790.566406,2.01105,,0.0,-0.011108,0.0,-0.019178,0.579224,-0.00807,6193.0,12178,53772,3437.0,15475,264,8255,12178,518484,27.0,21338.0,345003.0,20,85566.0,52519.0,188,17549.0,6513,418644,2015,2,0
4,+ruD5NLealUAfMZPQd6LEw==,217,单交,单交,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,522.5,,247800,3,0,2017-12-12 00:00:00,大众,上汽大众,大众汽车SVW6474CED多用途乘用车,途观,1.797852,5,非营业,非过户投保,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,yUh7960km3oydK6Km9rqRA==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,否,是,0.0,2,是,否,是,是,0.0,1.0,11.0,30.0,否,否,11,0.0,0.0,,1.0,1.0,0,大众汽车 多用途乘用车,大众汽车,多用途乘用车,否,yUh7960km3oydK6Km9rqRA==_eNP+WqbTmmD3bj49nIcSe...,单交_单交_非过户投保_否_否,8621,1146.0,0.002109,,inf,,,,,,,,0.0,,,,,0.0,,-522.5,0.0,0.0,,0.0,0.0,0.0,,,,,,,,,,,0.0,0.272705,-0.088867,0.405457,-0.048867,1.486674,0.040001,6193.0,96796,10607,7750.0,38063,523,9571,38524,158001,,15440.0,8673.0,96729,216811.0,293955.0,137,55995.0,8621,76268,2017,12,1


In [18]:
# groupby feature
def group_fea(df,key,target):
    tmp = df.groupby(key, as_index=False)[target].agg({
        key+target + '_nunique': 'nunique',
    }).reset_index()
    del tmp['index']
    print("**************************{}**************************".format(target))
    return tmp

feature_key = ['dpt']
feature_target = ['trademark_cn', 'suiche_nonauto_amount_20','service_score_available']
for key in tqdm(feature_key):
    for target in feature_target:
        tmp = group_fea(df_feature,key,target)
        df_feature = df_feature.merge(tmp,on=key,how='left')
        
# tmp = group_fea(df,'spread_app_id','task_id')
# df = df.merge(tmp,on='spread_app_id',how='left')
del tmp
gc.collect()

  0%|          | 0/1 [00:00<?, ?it/s]

**************************trademark_cn**************************
**************************suiche_nonauto_amount_20**************************
**************************service_score_available**************************


100%|██████████| 1/1 [00:08<00:00,  8.43s/it]


0

In [19]:
# dpt 统计数值类特征的均值
num_fea_list = ['service_score_available', 'dur_personal_insurance_90']
num_fea_list += ['active_365','ncd_ly', 'bi_renewal_year', 'clmnum', 'seats']
num_fea_list += ['regdays', 'od_ratio', 'p2_client_grade', 'od_tp_ratio_2ord', 'nprem_lyratio']
num_fea_list += ['bi_renewal_year', 'nprem_lyratio', 'si_tp', 'suiche_nonauto_nprem_20_ydiff']
for col in num_fea_list:
    df_feature[f'dpt{col}_mean'] = df_feature.groupby('dpt')[col].transform('mean')

del num_fea_list
# personas,  统计数值类特征的均值
num_fea_list = ['active_365', 'service_score_available', 'dur_personal_insurance_90', 'f2_cust_housing_price_total', 'suiche_nonauto_nprem_20_ydiff']
num_fea_list += ['regdays', 'tp_ratio', 'od_ratio', 'p2_client_grade', 'active_90_365_ratio', 'nprem_ly', 'ncd_ly', 'si_tp', 'suiche_nonauto_nprem_20_ydiff']
num_fea_list += ['od_tp_ratio_2ord', 'nprem_lyratio', 'newvalue', 'bi_renewal_year', 'clmnum', 'capab', 'seats', 'p2_client_grade']
for col in num_fea_list:
    df_feature['personas_{}_mean'.format(col)] = df_feature.groupby('personas')[col].transform('mean')

In [20]:
# feature = pd.DataFrame()
# to_group = [['suiche_nonauto_nprem_20']]
# to_inter = ['p1_prior_days_to_insure']
# to_calc = [
#     'mean',
#     'std',
#     'nunique',
# ]

# for i in tqdm(to_group):
#     for j in to_inter:
#         for k in to_calc:
#             feature["STAT_{}_{}_{}".format("_".join(i),j,k)] = df_feature[i + [j]].groupby(i)[j].transform(k)
            
# print(feature.shape)
# feature.reset_index(drop=True, inplace=True)
# df_feature[feature.columns] = feature
# del feature
# gc.collect()
# df_feature.shape

In [21]:
for agg in ['mean']:
    df_feature[f'dpt_ncd_ly_p1_prior_days_to_insure_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['p1_prior_days_to_insure'].transform(agg)
    df_feature[f'dpt_ncd_ly_active_90_365_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['active_90_365_ratio'].transform(agg)
    df_feature[f'dpt_ncd_ly_tp_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['tp_ratio'].transform(agg)
    df_feature[f'dpt_ncd_ly_suiche_nonauto_nprem_20_ydiff_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['suiche_nonauto_nprem_20_ydiff'].transform(agg)
    df_feature[f'dpt_ncd_ly_suiche_nonauto_nprem_20_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'ncd_ly'])['suiche_nonauto_nprem_20_ratio'].transform(agg)

    df_feature[f'dpt_p1_census_register_active_90_365_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['active_90_365_ratio'].transform(agg)
    df_feature[f'dpt_p1_census_register_p1_prior_days_to_insure_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['p1_prior_days_to_insure'].transform(agg)
    df_feature[f'dpt_p1_census_register_tp_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['tp_ratio'].transform(agg)
    df_feature[f'dpt_p1_census_register_regdays_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['regdays'].transform(agg)
    df_feature[f'dpt_p1_census_register_ncd_ly_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['ncd_ly'].transform(agg)
    df_feature[f'dpt_p1_census_register_suiche_nonauto_nprem_20_ydiff_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['suiche_nonauto_nprem_20_ydiff'].transform(agg)
    df_feature[f'dpt_p1_census_register_suiche_nonauto_nprem_20_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p1_census_register'])['suiche_nonauto_nprem_20_ratio'].transform(agg)

    df_feature[f'dpt_p2_client_grade_active_90_365_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['active_90_365_ratio'].transform(agg)
    df_feature[f'dpt_p2_client_grade_p1_prior_days_to_insure_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['p1_prior_days_to_insure'].transform(agg)
    df_feature[f'dpt_p2_client_grade_tp_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['tp_ratio'].transform(agg)
    df_feature[f'dpt_p2_client_grade_regdays_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['regdays'].transform(agg)
    df_feature[f'dpt_p2_client_grade_ncd_ly_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['ncd_ly'].transform(agg)
    df_feature[f'dpt_p2_client_grade_suiche_nonauto_nprem_20_ydiff_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['suiche_nonauto_nprem_20_ydiff'].transform(agg)
    df_feature[f'dpt_p2_client_grade_suiche_nonauto_nprem_20_ratio_2ord{agg}'] = df_feature.groupby(['dpt', 'p2_client_grade'])['suiche_nonauto_nprem_20_ratio'].transform(agg)

In [22]:
# 5折交叉 目标编码
df_train = df_feature[df_feature['flag']==0]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['flag']==2]
print(df_train.shape, df_test.shape)

def n_fold_target_encoding(train_df,test_df,label='label',n=5,enc_list=[],functions=['mean']):
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=seed)
    for f in tqdm(enc_list):
        for func in functions:
            train_df[f + f'_target_enc_{func}'] = 0
            test_df[f + f'_target_enc_{func}'] = 0
            for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df[label])):
                trn_x = train_df[[f, label]].iloc[trn_idx].reset_index(drop=True)
                val_x = train_df[[f]].iloc[val_idx].reset_index(drop=True)
                enc_df = trn_x.groupby(f, as_index=False)[label].agg({f + f'_target_enc_{func}': func})
                val_x = val_x.merge(enc_df, on=f, how='left')
                test_x = test_df[[f]].merge(enc_df, on=f, how='left')
                val_x[f + f'_target_enc_{func}'] = val_x[f + f'_target_enc_{func}'].fillna(train_df[label].agg(func))
                test_x[f + f'_target_enc_{func}'] = test_x[f + f'_target_enc_{func}'].fillna(train_df[label].agg(func))
                train_df.loc[val_idx, f + f'_target_enc_{func}'] = val_x[f + f'_target_enc_{func}'].values
                test_df[f + f'_target_enc_{func}'] += test_x[f + f'_target_enc_{func}'].values / skf.n_splits
                del(trn_x)
                del(val_x)
                del(enc_df)
                gc.collect()
    return train_df,test_df

target_enc_list = ['dpt', 'make_cn_cartype', 'trademark_cn', 'brand_cn', 'series', 'capab', 'ncd_ly', 'nprem_ly', 'make_cn_count']
target_enc_list += ['tp_ratio', 'p1_census_register', 'p2_client_grade', 'active_90_365_ratio', 'active_30_365_ratio', 'service_score_available']
target_enc_list += ['suiche_nonauto_nprem_19_ratio', 'suiche_nonauto_nprem_20_ratio', 'active_90_365_diff']
target_enc_list += ['nprem_lyratio', 'nprem_20ratio', 'personas', 'regdays', 'p1_prior_days_to_insure']
target_enc_list += ['make_cn_count','make_cn_cartype_count','make_cn_cartype_0_count', 'personas_count']

df_train, df_test = n_fold_target_encoding(df_train,df_test,label='y1_is_purchase',n=10,enc_list=target_enc_list,functions=['mean'])
df_feature = pd.concat([df_train, df_test], axis=0)

  0%|          | 0/27 [00:00<?, ?it/s]

(684283, 193) (80110, 193)


100%|██████████| 27/27 [03:27<00:00,  7.67s/it]


In [23]:
print(df_feature.shape)
df_feature.head()

(764393, 219)


Unnamed: 0,client_no,dpt,xz,xb,carid,nprem_ly,ncd_ly,newvalue,bi_renewal_year,clmnum,regdate,trademark_cn,brand_cn,make_cn,series,capab,seats,use_type,change_owner,nprem_od,si_od,nprem_tp,si_tp,nprem_bt,si_bt,nprem_vld,si_vld,nprem_vlp,si_vlp,p1_prior_days_to_insure,suiche_nonauto_nprem_20,suiche_nonauto_nprem_19,suiche_nonauto_nprem_18,suiche_nonauto_nprem_17,suiche_nonauto_nprem_16,suiche_nonauto_amount_20,suiche_nonauto_amount_19,suiche_nonauto_amount_18,suiche_nonauto_amount_17,suiche_nonauto_amount_16,num_notcar_claim,p1_gender,p1_age,p1_census_register,p2_marital_status,f1_child_flag,f2_posses_house_flag,f2_cust_housing_price_total,p2_client_grade,w1_pc_wx_use_flag,p1_is_bank_eff,p2_is_enterprise_owner,p2_is_smeowner,active_7,active_30,active_90,active_365,p2_is_child_under_15_family,p2_is_adult_over_55_family,birth_month,p1_service_offer_cnt,p3_service_use_cnt,dur_personal_insurance_90,service_score_available,y1_is_purchase,flag,make_cn_cartype,make_cn_cartype_0,make_cn_cartype_1,p2_is_child_old_family,personas,xz_xb_co_15_55,personas_cno_count,regdays,npremly_car_value_ratio,p3_service_use_ratio,car_housing_value_ratio,od_ratio,tp_ratio,bt_ratio,vld_ratio,vlp_ratio,od_tp_ratio_2ord,od_tp_ratio_2diff,nprem_tot,nprem_od_percent,nprem_tp_percent,nprem_odtp_percent_diff,nprem_odtp_percent_add,nprem_20diff,nprem_20ratio,nprem_lydiff,nprem_lyratio,npremtot_car_value_ratio,npremtot_housing_value_ratio,suiche_nonauto_nprem_20_ydiff,suiche_nonauto_nprem_19_2ydiff,suiche_nonauto_nprem_17_ydiff,suiche_nonauto_nprem_20_yratio,suiche_nonauto_nprem_19_2yratio,suiche_nonauto_nprem_17_yratio,suiche_nonauto_nprem_20_19_yratio_2ord,suiche_nonauto_nprem_20_17_yratio_2ord,suiche_nonauto_nprem_20_ratio,suiche_nonauto_nprem_19_ratio,suiche_nonauto_nprem_17_ratio,suiche_nonauto_nprem_20_19_ratio_2diff,suiche_nonauto_nprem_20_19_ratio_2ord,active_7_30_ratio,active_30_90_ratio,active_30_90_diff,active_30_365_ratio,active_30_365_diff,active_90_365_ratio,active_90_365_diff,p1_census_register_count,trademark_cn_count,brand_cn_count,series_count,capab_count,make_cn_count,make_cn_cartype_count,make_cn_cartype_0_count,make_cn_cartype_1_count,tp_ratio_count,p1_age_count,service_score_available_count,nprem_tp_count,suiche_nonauto_amount_20_count,suiche_nonauto_amount_19_count,nprem_ly_count,p1_prior_days_to_insure_count,personas_count,xz_xb_co_15_55_count,reg_year,reg_month,reg_day,dpttrademark_cn_nunique,dptsuiche_nonauto_amount_20_nunique,dptservice_score_available_nunique,dptservice_score_available_mean,dptdur_personal_insurance_90_mean,dptactive_365_mean,dptncd_ly_mean,dptbi_renewal_year_mean,dptclmnum_mean,dptseats_mean,dptregdays_mean,dptod_ratio_mean,dptp2_client_grade_mean,dptod_tp_ratio_2ord_mean,dptnprem_lyratio_mean,dptsi_tp_mean,dptsuiche_nonauto_nprem_20_ydiff_mean,personas_active_365_mean,personas_service_score_available_mean,personas_dur_personal_insurance_90_mean,personas_f2_cust_housing_price_total_mean,personas_suiche_nonauto_nprem_20_ydiff_mean,personas_regdays_mean,personas_tp_ratio_mean,personas_od_ratio_mean,personas_p2_client_grade_mean,personas_active_90_365_ratio_mean,personas_nprem_ly_mean,personas_ncd_ly_mean,personas_si_tp_mean,personas_od_tp_ratio_2ord_mean,personas_nprem_lyratio_mean,personas_newvalue_mean,personas_bi_renewal_year_mean,personas_clmnum_mean,personas_capab_mean,personas_seats_mean,dpt_ncd_ly_p1_prior_days_to_insure_2ordmean,dpt_ncd_ly_active_90_365_ratio_2ordmean,dpt_ncd_ly_tp_ratio_2ordmean,dpt_ncd_ly_suiche_nonauto_nprem_20_ydiff_2ordmean,dpt_ncd_ly_suiche_nonauto_nprem_20_ratio_2ordmean,dpt_p1_census_register_active_90_365_ratio_2ordmean,dpt_p1_census_register_p1_prior_days_to_insure_2ordmean,dpt_p1_census_register_tp_ratio_2ordmean,dpt_p1_census_register_regdays_2ordmean,dpt_p1_census_register_ncd_ly_2ordmean,dpt_p1_census_register_suiche_nonauto_nprem_20_ydiff_2ordmean,dpt_p1_census_register_suiche_nonauto_nprem_20_ratio_2ordmean,dpt_p2_client_grade_active_90_365_ratio_2ordmean,dpt_p2_client_grade_p1_prior_days_to_insure_2ordmean,dpt_p2_client_grade_tp_ratio_2ordmean,dpt_p2_client_grade_regdays_2ordmean,dpt_p2_client_grade_ncd_ly_2ordmean,dpt_p2_client_grade_suiche_nonauto_nprem_20_ydiff_2ordmean,dpt_p2_client_grade_suiche_nonauto_nprem_20_ratio_2ordmean,dpt_target_enc_mean,make_cn_cartype_target_enc_mean,trademark_cn_target_enc_mean,brand_cn_target_enc_mean,series_target_enc_mean,capab_target_enc_mean,ncd_ly_target_enc_mean,nprem_ly_target_enc_mean,make_cn_count_target_enc_mean,tp_ratio_target_enc_mean,p1_census_register_target_enc_mean,p2_client_grade_target_enc_mean,active_90_365_ratio_target_enc_mean,active_30_365_ratio_target_enc_mean,service_score_available_target_enc_mean,suiche_nonauto_nprem_19_ratio_target_enc_mean,suiche_nonauto_nprem_20_ratio_target_enc_mean,active_90_365_diff_target_enc_mean,nprem_lyratio_target_enc_mean,nprem_20ratio_target_enc_mean,personas_target_enc_mean,regdays_target_enc_mean,p1_prior_days_to_insure_target_enc_mean,make_cn_cartype_count_target_enc_mean,make_cn_cartype_0_count_target_enc_mean,personas_count_target_enc_mean
0,5gDljzjQ61m/QeU2tZBgDA==,217,商交,主全,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,1391.0,0.600098,88900,6,0,2010-02-09 00:00:00,福特,长安福特马自达,福特CAF7152A轿车,嘉年华,1.498047,5,非营业,非过户投保,379.0,24892.0,239.25,1000000,0.0,0.0,3.75,10000,9.507812,10000,30.0,,,,,,,,,,,,jh4mxXNEalwumcCWUJdnBw==,56.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,,,0.0,0.0,3.0,32.0,否,是,6,0.0,0.0,,0.0,0.0,0,福特 轿车,福特,轿车,是,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_是,12800,4009.0,0.015647,,inf,0.015226,0.000239,,0.000375,0.000951,63.639605,0.014987,631.5,0.600098,0.378906,0.221191,0.978516,,,-759.5,0.454102,0.007103,inf,,,,,,,,,,,,,,,0.0,-0.033325,0.0,-0.087671,0.380116,-0.054346,6193.0,29781,15325,2014.0,45294,60,7207,8583,518484,361.0,9339.0,345003.0,363,,,123,130634.0,12800,50781,2010,2,0,113,243,3120,764.579773,69417.9375,33.78125,0.744629,3.249175,0.143442,5.290767,2426.453376,0.009034,2.943542,26.303626,0.375,808389.359836,38.625,32.6875,1053.029053,122923.820312,90.5625,53.375,2834.185703,0.000984,0.017096,2.531484,1.774751,2610.0,0.701172,907765.625,20.663652,0.536133,139255.363828,3.774688,0.130859,1.727539,5.314531,21.921875,1.518002,0.000282,57.34375,inf,1.567076,20.109375,0.000349,2610.02945,0.745117,36.6875,33300.691406,1.519506,21.109375,0.00034,2719.242108,0.706055,49.15625,inf,0.467285,0.648926,0.641602,0.61084,0.595215,0.641113,0.70459,0.5,0.61084,0.645508,0.445801,0.670898,0.632324,0.590332,0.61377,,,0.635742,0.472412,,0.661133,0.59375,0.741699,0.648926,0.652832,0.661133
1,qTsiFUfrw8gwVOM+LftPvA==,217,商交,主全,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,3740.0,1.0,303000,6,0,2007-02-02 00:00:00,丰田,天津丰田,丰田TV7250RoyalA轿车,皇冠,2.496094,5,非营业,非过户投保,1993.0,60600.0,505.5,1000000,72.375,60600.0,0.0,0,0.0,0,27.0,390.0,0.0,0.0,170.0,170.0,2795000.0,0.0,0.0,120000.0,120000.0,,jh4mxXNEalwumcCWUJdnBw==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,0.0,2,是,否,是,是,2.0,4.0,11.0,40.0,否,否,9,0.0,0.0,,10.0,1.0,0,丰田 轿车,丰田,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,12800,5112.0,0.012343,,inf,0.032888,0.000505,0.001194,,,65.059922,0.032382,2570.0,0.775391,0.196655,0.578613,0.972168,2180.0,6.589844,-1170.0,0.687012,0.008482,inf,390.0,-170.0,0.0,inf,0.0,1.0,inf,inf,7166.666504,,705.882324,,,2.142578,1.09082,0.011108,1.21637,0.023712,1.115005,0.012603,6193.0,40075,21400,1203.0,5192,165,29924,38719,518484,526.0,15440.0,16106.0,20,21152.0,293955.0,258,17549.0,12800,418644,2007,2,0,113,243,3120,764.579773,69417.9375,33.78125,0.744629,3.249175,0.143442,5.290767,2426.453376,0.009034,2.943542,26.303626,0.375,808389.359836,38.625,32.6875,1053.029053,122923.820312,90.5625,53.375,2834.185703,0.000984,0.017096,2.531484,1.774751,2610.0,0.701172,907765.625,20.663652,0.536133,139255.363828,3.774688,0.130859,1.727539,5.314531,17.71875,1.517112,0.000494,65.5,inf,1.567076,20.109375,0.000349,2610.02945,0.745117,36.6875,33300.691406,1.519506,21.109375,0.00034,2719.242108,0.706055,49.15625,inf,0.467285,0.640137,0.64502,0.641113,0.631348,0.604004,0.634277,0.724609,0.617676,0.655762,0.447021,0.670898,0.633301,0.671387,0.629395,,0.856445,0.649902,0.709473,0.824219,0.663574,0.526855,0.668457,0.640137,0.646484,0.663574
2,vfTADBw3uqyLukTz5juO0g==,217,商交,主全,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1454.0,0.600098,132800,6,0,2007-01-01 00:00:00,长城,长城汽车,长城CC6460KM60旅行车,哈弗,2.771484,5,非营业,非过户投保,434.75,26560.0,251.375,1000000,0.0,0.0,0.0,0,0.0,0,30.0,350.0,0.0,0.0,0.0,0.0,1695000.0,0.0,0.0,0.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,55.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,154.625,2,否,是,否,否,0.0,0.0,9.0,17.0,否,否,2,0.0,0.0,,16.0,1.0,0,长城 旅行车,长城,旅行车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,4143,5144.0,0.010949,,858.852061,0.016369,0.000251,,,,65.116259,0.016117,686.0,0.633789,0.366455,0.267334,1.0,336.0,1.959961,-768.0,0.471924,0.005166,4.4375,350.0,0.0,0.0,inf,,,,,4842.856934,,,,,,0.0,-0.099976,0.0,-0.046575,2.146535,0.0534,6193.0,25550,25842,20960.0,981,3,3337,11440,6702,932.0,10131.0,2120.0,936,18735.0,293955.0,64,130634.0,4143,418644,2007,1,0,113,243,3120,764.579773,69417.9375,33.78125,0.744629,3.249175,0.143442,5.290767,2426.453376,0.009034,2.943542,26.303626,0.375,808389.359836,38.625,31.375,1187.225586,120779.351562,83.3125,54.75,2823.992276,0.000998,0.016798,2.520396,1.717182,2868.0,0.737305,908484.1902,19.949303,0.534668,166271.649529,3.624909,0.186338,1.796875,5.331644,21.921875,1.518002,0.000282,57.34375,inf,1.567076,20.109375,0.000349,2610.02945,0.745117,36.6875,33300.691406,1.519506,21.109375,0.00034,2719.242108,0.706055,49.15625,inf,0.467285,0.65625,0.683594,0.682129,0.69873,0.590332,0.70459,0.480713,0.541992,0.593262,0.447021,0.670898,0.668457,0.590332,0.657715,,0.839355,0.66748,0.549805,0.600098,0.601074,0.388672,0.741699,0.65625,0.644531,0.601074
3,zP5cmQ2nwzLbvocQPmf2YA==,217,商交,主全,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,3526.0,0.850098,316800,5,2,2015-02-09 00:00:00,奥迪,一汽大众,奥迪FV7201BACBG轿车,A6,1.984375,5,非营业,非过户投保,1651.0,202752.0,381.0,1000000,141.625,202752.0,17.921875,30000,45.46875,30000,27.0,9.898438,19.90625,0.0,170.0,0.0,1000000.0,1000000.0,0.0,120000.0,0.0,,jh4mxXNEalwumcCWUJdnBw==,47.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,是,是,208.0,2,是,是,是,是,0.0,0.0,1.0,7.0,否,否,7,0.0,0.0,151786.0,0.0,1.0,0,奥迪 轿车,奥迪,轿车,否,jh4mxXNEalwumcCWUJdnBw==_eNP+WqbTmmD3bj49nIcSe...,商交_主全_非过户投保_否_否,6513,2183.0,0.01113,,1523.076923,0.008143,0.000381,0.000699,0.000597,0.001516,21.37258,0.007762,2238.0,0.737793,0.170288,0.567383,0.907715,2228.0,226.125,-1288.0,0.634766,0.007064,10.757812,-10.007812,-150.125,170.0,0.497314,0.117065,inf,4.25,0.0,101026.046875,50235.480469,705.882324,50790.566406,2.01105,,0.0,-0.011108,0.0,-0.019178,0.579224,-0.00807,6193.0,12178,53772,3437.0,15475,264,8255,12178,518484,27.0,21338.0,345003.0,20,85566.0,52519.0,188,17549.0,6513,418644,2015,2,0,113,243,3120,764.579773,69417.9375,33.78125,0.744629,3.249175,0.143442,5.290767,2426.453376,0.009034,2.943542,26.303626,0.375,808389.359836,38.625,38.84375,1596.521973,143701.59375,87.0,71.375,2848.344542,0.000976,0.01699,2.407646,1.668997,2852.0,0.723145,932097.343774,20.273459,0.545898,161642.198833,3.717642,0.152618,1.791016,5.369876,18.296875,1.554707,0.000406,32.25,32124.91,1.567076,20.109375,0.000349,2610.02945,0.745117,36.6875,33300.691406,1.519506,21.109375,0.00034,2719.242108,0.706055,49.15625,inf,0.470947,0.583008,0.593262,0.65332,0.59082,0.62207,0.673828,0.763184,0.664551,0.222168,0.447998,0.670898,0.581543,0.589844,0.61377,0.523926,0.417969,0.57666,0.735352,0.428467,0.639648,0.686035,0.666016,0.583008,0.593262,0.639648
4,+ruD5NLealUAfMZPQd6LEw==,217,单交,单交,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,522.5,,247800,3,0,2017-12-12 00:00:00,大众,上汽大众,大众汽车SVW6474CED多用途乘用车,途观,1.797852,5,非营业,非过户投保,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,yUh7960km3oydK6Km9rqRA==,52.0,Mk+Y/3ew22P1DY8uqPLGGCIFQPo4OFXgq8CuM+8YhMo=,eNP+WqbTmmD3bj49nIcSew==,否,是,0.0,2,是,否,是,是,0.0,1.0,11.0,30.0,否,否,11,0.0,0.0,,1.0,1.0,0,大众汽车 多用途乘用车,大众汽车,多用途乘用车,否,yUh7960km3oydK6Km9rqRA==_eNP+WqbTmmD3bj49nIcSe...,单交_单交_非过户投保_否_否,8621,1146.0,0.002109,,inf,,,,,,,,0.0,,,,,0.0,,-522.5,0.0,0.0,,0.0,0.0,0.0,,,,,,,,,,,0.0,0.272705,-0.088867,0.405457,-0.048867,1.486674,0.040001,6193.0,96796,10607,7750.0,38063,523,9571,38524,158001,,15440.0,8673.0,96729,216811.0,293955.0,137,55995.0,8621,76268,2017,12,1,113,243,3120,764.579773,69417.9375,33.78125,0.744629,3.249175,0.143442,5.290767,2426.453376,0.009034,2.943542,26.303626,0.375,808389.359836,38.625,28.453125,773.13446,89221.382812,122.5,43.53125,2806.720218,0.001058,0.017194,2.60747,1.83575,2732.0,0.70752,872004.407841,19.193571,0.54834,150723.386846,3.742025,0.138383,1.719727,5.132467,,,,,,1.567076,20.109375,0.000349,2610.02945,0.745117,36.6875,33300.691406,1.519506,21.109375,0.00034,2719.242108,0.706055,49.15625,inf,0.467285,0.705566,0.664062,0.689941,0.713867,0.668457,,0.371582,0.728027,,0.445801,0.670898,0.637207,0.615234,0.643555,,,0.63623,0.265869,,0.658203,0.369629,0.438477,0.705566,0.693359,0.658203


## 模型训练

In [24]:
# for f in list(df_feature.select_dtypes('object')):
#     if f in ['carid', 'regdate']:
#         continue
#     le = LabelEncoder()
#     df_feature[f] = le.fit_transform(
#         df_feature[f].astype('str')).astype('int')

In [25]:
df_train = df_feature[df_feature['flag']==0]
df_test = df_feature[df_feature['flag']==2]
df_train = reduce_mem(df_train)
df_test = reduce_mem(df_test)
gc.collect()

-- Mem. usage decreased to 508.36 Mb (34.4% reduction),time spend:0.10 min
-- Mem. usage decreased to 59.36 Mb (34.6% reduction),time spend:0.01 min


0

In [26]:
# print('开始储存train_label')
# train_label = pd.DataFrame()
# train_label['carid'] = df_train['carid']
# train_label['probability'] = df_train['y1_is_purchase'].values
# train_label.to_csv('train_label_'+str(seed), index = False)
# print(train_label.shape)
# train_label.head()

In [29]:
ycol = 'y1_is_purchase'
drop_fea_list = [ycol, 'regdate', 'carid', 'use_type', 'suiche_nonauto_nprem_18', 'suiche_nonauto_amount_18', 'client_no', 'num_notcar_claim', 'flag']
drop_fea_list += ['xz', 'xb', 'trademark_cn', 'brand_cn', 'make_cn', 'series', 'change_owner', 'p1_gender', 'p1_census_register', 'p2_marital_status', 'f1_child_flag', 'f2_posses_house_flag', 'w1_pc_wx_use_flag', 'p1_is_bank_eff', 'p2_is_enterprise_owner', 'p2_is_smeowner', 'p2_is_child_under_15_family', 'p2_is_adult_over_55_family', 'make_cn_cartype', 'make_cn_cartype_0', 'make_cn_cartype_1', 'p2_is_child_old_family', 'personas', 'xz_xb_co_15_55']
# drop_fea_list += ['si_od', 'si_tp', 'si_bt', 'si_vld', 'si_vlp'] # 线下下降三个万
feature_names = list(
    filter(lambda x: x not in drop_fea_list, df_train.columns))

model = lgb.LGBMClassifier(num_leaves=200,
                           max_depth=9,
                           learning_rate=0.03,
                           n_estimators=10000,
                        #    subsample=0.9,
                           feature_fraction=0.6,
                        #    lambda_l1=0.1,
                        #    lambda_l2=0.1,
                        #    min_child_weight=30,
                           bagging_fraction=0.9,
                        #    bagging_freq=5,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           n_jobs=4,
                           metric=None)

oof =  np.zeros(len(df_train)) 
prediction = df_test[['carid']]
prediction['label'] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(
        df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['valid'],
                          eval_set=[(X_val, Y_val)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    oof[val_idx] = pred_val
    # df_oof = df_train.iloc[val_idx][[
    #     'carid', ycol]].copy()
    # df_oof['pred'] = pred_val
    # oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['label'] += pred_test / 10

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
[100]	valid's auc: 0.895718	valid's binary_logloss: 0.378689
[200]	valid's auc: 0.900844	valid's binary_logloss: 0.363595
[300]	valid's auc: 0.90278	valid's binary_logloss: 0.359249
[400]	valid's auc: 0.903765	valid's binary_logloss: 0.357214
[500]	valid's auc: 0.904385	valid's binary_logloss: 0.355953
[600]	valid's auc: 0.904646	valid's binary_logloss: 0.35536
[700]	valid's auc: 0.904852	valid's binary_logloss: 0.354928
[800]	valid's auc: 0.904938	valid's binary_logloss: 0.354649
[900]	valid's auc: 0.905006	valid's binary_logloss: 0.354456
[1000]	valid's auc: 0.905071	valid's binary_logloss: 0.354279
[1100]	valid's auc: 0.905135	valid's binary_logloss: 0.354103
[1200]	valid's auc: 0.905193	valid's binary_logloss: 0.353977
[1300]	valid's auc: 0.905231	valid's binary_logloss: 0.353863
Early stopping, best iteration is:
[1283]	valid's auc: 0.905252	valid's binary_logloss: 0.353836


Training until validation scores don't impr

In [30]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,p1_prior_days_to_insure_target_enc_mean,3107.1
1,suiche_nonauto_nprem_20_ratio_target_enc_mean,2678.4
2,regdays_target_enc_mean,2519.4
3,tp_ratio_target_enc_mean,2502.8
4,p1_census_register_target_enc_mean,2484.3
5,active_90_365_ratio_target_enc_mean,2188.1
6,p1_prior_days_to_insure,2145.7
7,active_90_365_ratio,2126.3
8,p1_prior_days_to_insure_count,2042.4
9,suiche_nonauto_nprem_19_ratio_target_enc_mean,2029.1


In [31]:
os.makedirs('sub_lgbm', exist_ok=True)
res_oof = pd.DataFrame()
res_oof['carid'] = df_train['carid']
res_oof['y1_is_purchase'] = df_train['y1_is_purchase']
res_oof['probability'] = oof
score = roc_auc_score(df_train['y1_is_purchase'], res_oof['probability'])
print(score)
print('开始储存oof')
res_oof.to_csv('sub_lgbm/lgbm_seed_'+str(seed)+f'_train_{score}.csv',index = False)
print(res_oof.shape)
res_oof.head()

0.9032069267925168
开始储存oof
(684283, 3)


Unnamed: 0,carid,y1_is_purchase,probability
0,WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=,0.0,1.9e-05
1,DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=,1.0,0.904991
2,waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=,1.0,0.82073
3,nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=,1.0,0.54528
4,LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=,1.0,0.99521


In [32]:
# os.makedirs('sub_lgbm', exist_ok=True)
prediction.to_csv(f'sub_lgbm/lgbm_seed_'+str(seed)+f'_test_{score}.csv', index=False)
prediction.head()

Unnamed: 0,carid,label
684283,FbOikOdqe5f3mRYDAgnBH2PwI5I+egmzWyNwjmgAuWs=,1.9e-05
684284,WTO/cku1nHO592k9j56on2UzMmx8OLhw8peccj1m13I=,0.629416
684285,ow79MMeuFgFY92UOVjaECsaNPl5cRXAi3M5ZsB4Rt/s=,0.302148
684286,nuO8DDjdXKFMt5Of70LlXMlFoLDX0OMSSBYnNYnqTyQ=,0.900784
684287,j4gIDul5h/7IBEYq4y8oAr2+tSWj/NdsIFbGzDtpTsk=,0.507833


In [33]:
#查看模型的特征重要性
import matplotlib.pyplot as plt 
from matplotlib import cm
score = pd.DataFrame()
score['fea_name'] = df_importance['column']
score['fea'] = df_importance['importance']
score = score.sort_values(['fea'], ascending=False)
temp = pd.DataFrame()
temp = score[:320]
color = cm.jet(temp['fea']/temp['fea'].max())
plt.figure(figsize=(10, 30))
plt.barh(temp['fea_name'],temp['fea'],height =0.8,color=color,alpha=0.8)
plt.show()

## 分特征查看重要性

In [34]:
show_list = []
for s in df_train.columns:
    if 'dpt' in s:
        show_list.append(s)
print(show_list)
show=score[score['fea_name'].isin(show_list)]
show

['dpt', 'dpttrademark_cn_nunique', 'dptsuiche_nonauto_amount_20_nunique', 'dptservice_score_available_nunique', 'dptservice_score_available_mean', 'dptdur_personal_insurance_90_mean', 'dptactive_365_mean', 'dptncd_ly_mean', 'dptbi_renewal_year_mean', 'dptclmnum_mean', 'dptseats_mean', 'dptregdays_mean', 'dptod_ratio_mean', 'dptp2_client_grade_mean', 'dptod_tp_ratio_2ord_mean', 'dptnprem_lyratio_mean', 'dptsi_tp_mean', 'dptsuiche_nonauto_nprem_20_ydiff_mean', 'dpt_ncd_ly_p1_prior_days_to_insure_2ordmean', 'dpt_ncd_ly_active_90_365_ratio_2ordmean', 'dpt_ncd_ly_tp_ratio_2ordmean', 'dpt_ncd_ly_suiche_nonauto_nprem_20_ydiff_2ordmean', 'dpt_ncd_ly_suiche_nonauto_nprem_20_ratio_2ordmean', 'dpt_p1_census_register_active_90_365_ratio_2ordmean', 'dpt_p1_census_register_p1_prior_days_to_insure_2ordmean', 'dpt_p1_census_register_tp_ratio_2ordmean', 'dpt_p1_census_register_regdays_2ordmean', 'dpt_p1_census_register_ncd_ly_2ordmean', 'dpt_p1_census_register_suiche_nonauto_nprem_20_ydiff_2ordmean', '

Unnamed: 0,fea_name,fea
24,dpt_p1_census_register_regdays_2ordmean,1786.9
25,dpt_p1_census_register_suiche_nonauto_nprem_20...,1772.9
28,dpt_p1_census_register_suiche_nonauto_nprem_20...,1712.9
30,dpt_p1_census_register_p1_prior_days_to_insure...,1679.1
32,dpt_p1_census_register_ncd_ly_2ordmean,1638.9
33,dpt_p1_census_register_active_90_365_ratio_2or...,1627.4
37,dpt_p2_client_grade_active_90_365_ratio_2ordmean,1514.3
40,dpt_p1_census_register_tp_ratio_2ordmean,1433.1
46,dpt_target_enc_mean,1358.8
49,dpt_p2_client_grade_regdays_2ordmean,1308.6


In [37]:
# !wget -nv -O heywhale_submit https://cdn.kesci.com/submit_tool/v4/heywhale_submit&&chmod +x heywhale_submit
# !./heywhale_submit -token ff040f9de88e681b -file /home/mw/work/sub_lgbm/lgbm_seed_1024_test_0.9032069267925168.csv

2021-06-02 08:13:53 URL:https://cdn.kesci.com/submit_tool/v4/heywhale_submit [7357446/7357446] -> "heywhale_submit" [1]
Heywhale Submit Tool 4.0.0

> 已验证Token
> 提交文件 /home/mw/work/sub_lgbm/lgbm_seed_1024_test_0.9032069267925168.csv (5057.46 KiB), Target Qiniu
> 已上传 100 %
> 文件已上传        
> 服务器响应: 200 提交成功，请等待评审完成
> 提交完成
