In [1]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from chinese_calendar import is_holiday

In [2]:
from tsfresh.feature_extraction.feature_calculators import longest_strike_above_mean, mean_abs_change, \
    mean_second_derivative_central, sample_entropy, benford_correlation, count_above_mean,\
    percentage_of_reoccurring_datapoints_to_all_datapoints, variation_coefficient

In [3]:
account_static = pd.read_csv('./data/账户静态信息.csv')

In [4]:
account_trade = pd.read_csv('./data/账户交易信息.csv')

In [5]:
y_train = pd.read_csv('./data/训练集标签.csv')

In [6]:
y_test = pd.read_csv('./data/test_dataset.csv')

In [7]:
def get_time_feature(df, col):
    
    df_copy = df.copy()
    prefix = col + "_"
    df_copy['new_'+col] = df_copy[col].astype(str)
    
    col = 'new_'+col
    df_copy[prefix + 'is_holiday'] = df_copy[col].apply(lambda x: is_holiday(datetime.strptime(x, '%Y-%m-%d')))\
        .astype(int)
    df_copy[col] = pd.to_datetime(df_copy[col], format='%Y-%m-%d')
    df_copy[prefix + 'month'] = df_copy[col].dt.month
    df_copy[prefix + 'day'] = df_copy[col].dt.day
    df_copy[prefix + 'weekofyear'] = df_copy[col].dt.weekofyear
    df_copy[prefix + 'dayofyear'] = df_copy[col].dt.dayofyear
    df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 6
    df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    del df_copy[col]
    
    df_copy[prefix + 'hour'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[0]))
    df_copy[prefix + 'minu'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[1]))
    df_copy[prefix + 'date'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[0])*60 + int(x.split(':')[1]))
    
    return df_copy   

account_trade = get_time_feature(account_trade, "jyrq")
time_cols = [f for f in account_trade.columns if 'jyrq_' in f]
print(time_cols)


  del sys.path[0]


['jyrq_is_holiday', 'jyrq_month', 'jyrq_day', 'jyrq_weekofyear', 'jyrq_dayofyear', 'jyrq_dayofweek', 'jyrq_is_wknd', 'jyrq_is_month_start', 'jyrq_is_month_end', 'jyrq_hour', 'jyrq_minu', 'jyrq_date']


In [8]:
def get_base_feat(df1_, df2_):
    df1 = df1_.copy() # 构建特征数据 
    df2 = df2_.copy()

    agg_func = {
        'dfzh': ['nunique','count'],
        'dfhh': ['nunique'],
        'jyqd': ['nunique'],
        'zydh': ['nunique'],
        'jyje': ['sum','mean','median','max','min','std',np.ptp,
                longest_strike_above_mean, mean_abs_change, mean_second_derivative_central,
                 sample_entropy, benford_correlation, count_above_mean,
                 percentage_of_reoccurring_datapoints_to_all_datapoints],
        'zhye': ['sum','mean', 'median', 'max','min','std', np.ptp],
#         'jyje': ['sum','mean','max','min','std',np.ptp],
#         'zhye': ['sum','mean', 'median', 'max','min','std',np.ptp,
#                  longest_strike_above_mean, mean_abs_change, mean_second_derivative_central,
#                  sample_entropy, benford_correlation, count_above_mean,
#                  percentage_of_reoccurring_datapoints_to_all_datapoints],
        'dfmccd': ['mean', 'median', 'max','min','std',np.ptp],
    }
    
    for col in time_cols:
        agg_func[col] = ['mean', 'median', 'min','max',np.ptp]
    
    agg_df = df1[df1['jdbj']==0].groupby(['zhdh']).agg(agg_func).reset_index()
    agg_df.columns = ['zhdh'] + ['zhdh_jdbj0_' + '_'.join(f).strip()
                                 for f in agg_df.columns.values if f[0] not in ['zhdh']]
    df2 = df2.merge(agg_df, on=['zhdh'], how='left')
    
    agg_df = df1[df1['jdbj']==1].groupby(['zhdh']).agg(agg_func).reset_index()
    agg_df.columns = ['zhdh'] + ['zhdh_jdbj1_' + '_'.join(f).strip()
                                 for f in agg_df.columns.values if f[0] not in ['zhdh']]
    df2 = df2.merge(agg_df, on=['zhdh'], how='left')
    
    return df2

if os.path.exists('train_label.pkl'):
    with open('train_label.pkl', 'rb') as file:
        train_label = pickle.load(file)
    with open('test_label.pkl', 'rb') as file:
        test_label = pickle.load(file)
else:
    train_label = get_base_feat(account_trade, y_train)
    test_label = get_base_feat(account_trade, y_test)
    with open('train_label.pkl', 'wb') as file:
        pickle.dump(train_label, file)
    with open('test_label.pkl', 'wb') as file:
        pickle.dump(test_label, file)

In [9]:
train_label = train_label.replace([np.inf, -np.inf], np.nan)
test_label = test_label.replace([np.inf, -np.inf], np.nan)

In [10]:
fea = pd.concat([train_label, test_label])

In [11]:
# account_trade['jyje_new'] = account_trade['jyje'] * account_trade['jdbj'].apply(lambda x: -1 if x == 0 else 1)

# tmp = account_trade.groupby('zhdh').agg({
#     'jyje_new': ['sum','mean','median','max','min','std',np.ptp,
#                 longest_strike_above_mean, mean_abs_change, mean_second_derivative_central,
#                  sample_entropy, benford_correlation, count_above_mean,
#                  percentage_of_reoccurring_datapoints_to_all_datapoints]
# })
# tmp.columns = [f'{c[0]}_{c[1]}' for c in tmp.columns]
# tmp = tmp.replace([np.inf, -np.inf], np.nan)
# tmp = tmp.reset_index()
# fea = fea.merge(tmp, how='left', on='zhdh')

In [12]:
# tmp = account_trade.sort_values(by='zhye').groupby('zhdh').apply(lambda x: x.head(1)['jyqd'].values[0])\
#     .reset_index().rename(columns={0: 'min_zhye_jyqd'})
# fea = fea.merge(tmp, how='left', on='zhdh')

# tmp = account_trade.sort_values(by='zhye').groupby('zhdh').apply(lambda x: x.head(1)['zydh'].values[0])\
#     .reset_index().rename(columns={0: 'min_zhye_zydh'})
# fea = fea.merge(tmp, how='left', on='zhdh')

# tmp = account_trade.sort_values(by='zhye').groupby('zhdh').apply(lambda x: x.tail(1)['jyqd'].values[0])\
#     .reset_index().rename(columns={0: 'max_zhye_jyqd'})
# fea = fea.merge(tmp, how='left', on='zhdh')

# tmp = account_trade.sort_values(by='zhye').groupby('zhdh').apply(lambda x: x.tail(1)['zydh'].values[0])\
#     .reset_index().rename(columns={0: 'max_zhye_zydh'})
# fea = fea.merge(tmp, how='left', on='zhdh')

In [13]:
tmp = account_trade[account_trade['jdbj'] == 1].groupby('zhdh')['jyje'].apply(lambda x: variation_coefficient(x))\
    .reset_index().rename(columns={'jyje': 'jdbj1_jyje_var_coeff'})
fea = fea.merge(tmp, how='left', on='zhdh')

tmp = account_trade[account_trade['jdbj'] == 0].groupby('zhdh')['jyje'].apply(lambda x: variation_coefficient(x))\
    .reset_index().rename(columns={'jyje': 'jdbj0_jyje_var_coeff'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [14]:
account_trade['jyje_is_int'] = (account_trade['jyje'] == account_trade['jyje'].astype(int)) + 0
tmp = account_trade.groupby('zhdh')['jyje_is_int'].sum() / account_trade.groupby('zhdh').size()
tmp = tmp.reset_index().rename(columns={0: 'jyje_int_pct'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [15]:
account_trade['jyje_10_times'] = (account_trade['jyje'] % 10 == 0) + 0
tmp = account_trade.groupby('zhdh')['jyje_10_times'].sum() / account_trade.groupby('zhdh').size()
tmp = tmp.reset_index().rename(columns={0: 'jyje_10_times_pct'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [16]:
account_trade['jyje_100_times'] = (account_trade['jyje'] % 100 == 0) + 0
tmp = account_trade.groupby('zhdh')['jyje_100_times'].sum() / account_trade.groupby('zhdh').size()
tmp = tmp.reset_index().rename(columns={0: 'jyje_100_times_pct'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [17]:
account_trade['dt'] = account_trade['jyrq'] + ' ' + account_trade['jysj']
account_trade['dt'] = pd.to_datetime(account_trade['dt'])

In [18]:
account_trade['month'] = account_trade['dt'].dt.month
account_trade['day'] = account_trade['dt'].dt.day
account_trade['weekofyear'] = account_trade['dt'].dt.weekofyear
account_trade['dayofweek'] = account_trade['dt'].dt.dayofweek

account_trade['is_wknd'] = account_trade['dt'].dt.dayofweek // 6
account_trade['is_month_start'] = account_trade['dt'].dt.is_month_start.astype(int)
account_trade['is_month_end'] = account_trade['dt'].dt.is_month_end.astype(int)

account_trade['hour'] = account_trade['jysj'].apply(lambda x:int(x.split(':')[0]))
account_trade['minu'] = account_trade['jysj'].apply(lambda x:int(x.split(':')[1]))

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
account_trade['jyje_label'] = pd.qcut(account_trade['jyje'], 10, labels=range(10))

In [20]:
cat_cols = ['dfzh', 'dfhh', 'jyqd', 'zydh', 'jyje_label',
            'month', 'day', 'weekofyear', 'dayofweek', 'is_wknd',
            'is_month_start', 'is_month_end', 'hour', 'minu']

In [21]:
for col in cat_cols:
    tmp = account_trade.groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[-1][col])\
        .reset_index().rename(columns={0: f'most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 0].groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[-1][col])\
        .reset_index().rename(columns={0: f'jdbj0_most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 1].groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[-1][col])\
        .reset_index().rename(columns={0: f'jdbj1_most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [22]:
# for col in ['dfhh', 'jyqd', 'zydh']:
#     tmp = account_trade.groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
#         .apply(lambda x: x.sort_values(by=0).iloc[0][col])\
#         .reset_index().rename(columns={0: f'most_{col}'})
#     fea = fea.merge(tmp, how='left', on='zhdh')

In [23]:
for col in cat_cols:
    tmp = account_trade.groupby(['zhdh', col])['jyje'].sum().reset_index().groupby('zhdh')\
            .apply(lambda x: x.sort_values(by='jyje').iloc[-1][col])\
            .reset_index().rename(columns={0: f'most_jyje_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 0].groupby(['zhdh', col])['jyje'].sum()\
            .reset_index().groupby('zhdh')\
            .apply(lambda x: x.sort_values(by='jyje').iloc[-1][col])\
            .reset_index().rename(columns={0: f'jdbj0_most_jyje_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 1].groupby(['zhdh', col])['jyje'].sum()\
            .reset_index().groupby('zhdh')\
            .apply(lambda x: x.sort_values(by='jyje').iloc[-1][col])\
            .reset_index().rename(columns={0: f'jdbj1_most_jyje_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [24]:
for c in cat_cols:
    tmp = (account_trade.groupby([c]).size() / account_trade.shape[0])\
        .reset_index().rename(columns={0: f'{c}_ratio'})
    account_trade = account_trade.merge(tmp, how='left', on=c)

In [25]:
agg_func = {}
for c in cat_cols:
    agg_func[f'{c}_ratio'] = ['sum','mean','max','min','std']
tmp = account_trade.groupby('zhdh').agg(agg_func)
tmp.columns = [f'{c[0]}_{c[1]}' for c in tmp.columns]
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [26]:
for col in ['jyqd']:
    tmp = account_trade.merge(y_train, how='left', on='zhdh').groupby(['black_flag', col])\
        .size().reset_index().rename({0: f'flag_{col}_cnt'},axis=1)
    tmp2 = account_trade.merge(y_train, how='left', on='zhdh').groupby('black_flag').size()\
        .reset_index().rename({0:'flag_cnt'},axis=1)
    tmp = tmp.merge(tmp2, how='left', on='black_flag')
    tmp[f'p({col}|flag)'] = tmp[f'flag_{col}_cnt'] / tmp['flag_cnt']
    for i in range(2):
        tmp3 = account_trade.merge(tmp[tmp['black_flag'] == i][[col, f'p({col}|flag)']], how='left', on=col)\
            .groupby('zhdh').agg({f'p({col}|flag)': ['mean', 'sum', 'std', 'max']})
        tmp3.columns = [f'{c[0]}_{c[1]}_lag={i}' for c in tmp3.columns]
        tmp3 = tmp3.reset_index()
        fea = fea.merge(tmp3, how='left', on='zhdh')

In [27]:
tmp = account_trade.groupby('zhdh').apply(lambda x: (x['dt'].max() - x['dt'].min()).days)\
    .reset_index().rename(columns={0: 'ndays'})
tmp['trade_time_per_day'] = account_trade.groupby('zhdh').size().values / (tmp['ndays'].values+1)
fea = fea.merge(tmp, how='left', on='zhdh')

In [28]:
tmp = account_trade.groupby(['zhdh', 'jyje_label']).size().reset_index()\
    .rename(columns={0: 'cnt'}).pivot(index='zhdh', columns=['jyje_label'], values=['cnt'])
tmp.columns = [f'jyje_label_{i}' for i in range(10)]
tmp = (tmp.T /  account_trade.groupby(['zhdh']).size()).T
fea = fea.merge(tmp, how='left', on='zhdh')

In [29]:
tmp = account_trade.groupby('zhdh')['dt'].min().reset_index().merge(account_static, how='left')
tmp['kh2jy_days'] = (tmp['dt'] - pd.to_datetime(tmp['khrq'])).dt.days
fea = fea.merge(tmp[['zhdh', 'kh2jy_days']], how='left', on='zhdh')

In [30]:
tmp = account_trade.sort_values(by='dt').groupby('zhdh').tail(20).groupby('zhdh')\
    .apply(lambda x: (x['dt'].max() - x['dt'].min()).seconds / 3600).reset_index()\
    .rename(columns={0: 'last20trade_hours'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [31]:
tmp = account_trade.groupby('zhdh').apply(lambda x: x[x['jyrq'] == x['jyrq'].min()].shape[0])\
    .reset_index().rename(columns={0: 'last_day_trade_cnt'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [32]:
tmp = account_trade.groupby(['zhdh', 'jdbj'])['jyje'].sum().reset_index()\
    .pivot(index='zhdh', columns=['jdbj'], values=['jyje'])
tmp.columns = ['jdbj_0', 'jdbj_1']
tmp = tmp.reset_index()
tmp['in_out_diff'] = (tmp['jdbj_0'] - tmp['jdbj_1'])
tmp['in_out_ratio'] = (tmp['jdbj_0'] / tmp['jdbj_1'])
fea = fea.merge(tmp, how='left', on='zhdh')

In [33]:
tmp = account_trade.groupby(['zhdh', 'jyqd'])['jyje'].sum().reset_index()\
    .pivot(index='zhdh', columns=['jyqd'], values=['jyje'])
jyqd_cols = [f'{c[1]}_sum' for c in tmp.columns]
tmp.columns = jyqd_cols
tmp = tmp.reset_index()
tmp = tmp.merge(account_trade.groupby(['zhdh'])['jyje'].sum().reset_index(), how='left', on='zhdh')
for col in jyqd_cols:
    tmp[col.split('_')[0]+'_ratio'] = tmp[col] / tmp['jyje']
tmp = tmp.drop(columns=['jyje'])
fea = fea.merge(tmp, how='left', on='zhdh')

In [34]:
# tmp = account_trade.groupby(['zhdh', 'zydh'])['jyje'].sum().reset_index()\
#     .pivot(index='zhdh', columns=['zydh'], values=['jyje'])
# zydh_cols = [f'{c[1]}_sum' for c in tmp.columns]
# tmp.columns = zydh_cols
# tmp = tmp.reset_index()
# tmp = tmp.merge(account_trade.groupby(['zhdh'])['jyje'].sum().reset_index(), how='left', on='zhdh')
# for col in zydh_cols:
#     tmp[col.split('_')[0]+'_ratio'] = tmp[col] / tmp['jyje']
# tmp = tmp.drop(columns=['jyje'])
# fea = fea.merge(tmp, how='left', on='zhdh')

In [35]:
# tmp = account_trade.merge(account_static[['zhdh', 'khjgdh']], how='left', on='zhdh').groupby('zhdh')\
#     .apply(lambda x: x[x['jyqd'] == x['khjgdh']].shape[0] / x.shape[0])
# tmp = tmp.reset_index().rename(columns={0: 'same_bank_ratio'})
# fea = fea.merge(tmp, how='left', on='zhdh')

In [36]:
account_trade.columns

Index(['jylsxh', 'zhdh', 'dfzh', 'jdbj', 'jyje', 'zhye', 'dfhh', 'jyrq',
       'jysj', 'jyqd', 'zydh', 'dfmccd', 'jyrq_is_holiday', 'jyrq_month',
       'jyrq_day', 'jyrq_weekofyear', 'jyrq_dayofyear', 'jyrq_dayofweek',
       'jyrq_is_wknd', 'jyrq_is_month_start', 'jyrq_is_month_end', 'jyrq_hour',
       'jyrq_minu', 'jyrq_date', 'jyje_is_int', 'jyje_10_times',
       'jyje_100_times', 'dt', 'month', 'day', 'weekofyear', 'dayofweek',
       'is_wknd', 'is_month_start', 'is_month_end', 'hour', 'minu',
       'jyje_label', 'dfzh_ratio', 'dfhh_ratio', 'jyqd_ratio', 'zydh_ratio',
       'jyje_label_ratio', 'month_ratio', 'day_ratio', 'weekofyear_ratio',
       'dayofweek_ratio', 'is_wknd_ratio', 'is_month_start_ratio',
       'is_month_end_ratio', 'hour_ratio', 'minu_ratio'],
      dtype='object')

In [37]:
col1 = 'jyqd'
col2 = 'jyrq'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [38]:
col1 = 'jyqd'
col2 = 'zydh'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [39]:
col1 = 'jyqd'
col2 = 'hour'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [40]:
col1 = 'jyqd'
col2 = 'jyje_label'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [41]:
# col1 = 'jyqd'
# col2 = 'dfhh'

# tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
# tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
# tmp = tmp.merge(tmp2, how='left', on='zhdh')
# tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
# tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'median', 'max', 'std'])
# tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_median',
#                f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
# tmp = tmp.reset_index()
# fea = fea.merge(tmp, how='left', on='zhdh')

In [42]:
col1 = 'zydh'
col2 = 'jyrq'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')


col1 = 'zydh'
col2 = 'jyqd'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')


col1 = 'zydh'
col2 = 'hour'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')


col1 = 'zydh'
col2 = 'jyje_label'

tmp = account_trade.groupby(['zhdh', col1])[col2].size().reset_index().rename(columns={col2: f'{col1}_{col2}_cnt'})
tmp2 = account_trade.groupby(['zhdh'])[col2].size().reset_index().rename(columns={col2: f'{col2}_cnt'})
tmp = tmp.merge(tmp2, how='left', on='zhdh')
tmp[f'{col1}_{col2}_ratio'] = tmp[f'{col1}_{col2}_cnt'] / tmp[f'{col2}_cnt']
tmp = tmp.groupby('zhdh')[f'{col1}_{col2}_ratio'].agg(['mean', 'max', 'std'])
tmp.columns = [f'{col1}_{col2}_ratio_mean', f'{col1}_{col2}_ratio_max', f'{col1}_{col2}_ratio_std']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [43]:
from collections import Counter
def calc_cf(values):
    rms = np.sqrt(np.mean(values ** 2))
    xr = np.mean(np.abs(values)) ** 2
    max_v = values.max()
    min_v = values.min()
    pk_max = max(max_v, -min_v)
    return pk_max / rms if rms != 0 else 0

def calc_pf(values):
    rms = np.sqrt(np.mean(values ** 2))
    xr = np.mean(np.abs(values)) ** 2
    max_v = values.max()
    min_v = values.min()
    pk_max = max(max_v, -min_v)
    return pk_max / xr if xr != 0 else 0

def top1_num(values):
    counter = Counter(values)
    return counter.most_common(3)[0][0]

def top1_ratio(values):
    counter = Counter(values)
    return counter.most_common(3)[0][1] / len(values)

def top2_num(values):
    counter = Counter(values)
    return counter.most_common(3)[1][0] if len(counter) > 1 else -1

def top2_ratio(values):
    counter = Counter(values)
    return counter.most_common(3)[1][1] / len(values) if len(counter) > 1 else 0

def top3_num(values):
    counter = Counter(values)
    return counter.most_common(3)[2][0] if len(counter) > 2 else -1

def top3_ratio(values):
    counter = Counter(values)
    return counter.most_common(3)[2][1] / len(values) if len(counter) > 2 else 0

In [44]:
# tmp = account_trade.groupby(['zhdh', 'jdbj'])['jyje']\
#     .agg([calc_cf, calc_pf]).reset_index()
# tmp = tmp.pivot(index='zhdh', columns=['jdbj'],
#           values=['calc_cf', 'calc_pf'])
# tmp.columns = [f'{c[0]}_jdbj{c[1]}' for c in tmp.columns]
# tmp = tmp.reset_index()
# fea = fea.merge(tmp, how='left', on='zhdh')

In [45]:
col = 'jyqd'
tmp = account_trade.groupby(['zhdh', 'jyrq', 'jdbj'])[col].size().reset_index()
tmp = tmp.pivot(index=['zhdh', 'jyrq'], columns=['jdbj'], values=[col])
tmp.columns = [f'jdbj_0_{col}', f'jdbj_1_{col}']

tmp = tmp.reset_index().fillna(0)
tmp[col] = tmp[f'jdbj_0_{col}'] + tmp[f'jdbj_1_{col}']
tmp[f'jdbj_0_{col}_ratio'] = tmp[f'jdbj_0_{col}'] / tmp[col]
tmp[f'jdbj_1_{col}_ratio'] = tmp[f'jdbj_1_{col}'] / tmp[col]
tmp = tmp.groupby('zhdh')[[f'jdbj_0_{col}_ratio', f'jdbj_1_{col}_ratio']].sum().reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [46]:
col = 'jyje'
tmp = account_trade.groupby(['zhdh', 'jyrq', 'jdbj'])[col].sum().reset_index()
tmp = tmp.pivot(index=['zhdh', 'jyrq'], columns=['jdbj'], values=[col])
tmp.columns = [f'jdbj_0_{col}', f'jdbj_1_{col}']

tmp = tmp.reset_index().fillna(0)
tmp[col] = tmp[f'jdbj_0_{col}'] + tmp[f'jdbj_1_{col}']
tmp[f'jdbj_0_{col}_ratio'] = tmp[f'jdbj_0_{col}'] / tmp[col]
tmp[f'jdbj_1_{col}_ratio'] = tmp[f'jdbj_1_{col}'] / tmp[col]
tmp = tmp.groupby('zhdh')[[f'jdbj_0_{col}_ratio', f'jdbj_1_{col}_ratio']].sum()
# fea = fea.merge(tmp, how='left', on='zhdh')

In [47]:
# col = 'dfhh'
# tmp = account_trade.groupby(['zhdh', 'jyrq', 'jdbj'])[col].nunique().reset_index()
# tmp = tmp.pivot(index=['zhdh', 'jyrq'], columns=['jdbj'], values=[col])
# tmp.columns = [f'jdbj_0_{col}', f'jdbj_1_{col}']

# tmp = tmp.reset_index().fillna(0)
# tmp[col] = tmp[f'jdbj_0_{col}'] + tmp[f'jdbj_1_{col}']
# tmp[f'jdbj_0_{col}_ratio'] = tmp[f'jdbj_0_{col}'] / tmp[col]
# tmp[f'jdbj_1_{col}_ratio'] = tmp[f'jdbj_1_{col}'] / tmp[col]
# tmp = tmp.groupby('zhdh')[[f'jdbj_0_{col}_ratio', f'jdbj_1_{col}_ratio']].sum().reset_index()
# fea = fea.merge(tmp, how='left', on='zhdh')

In [48]:
# col = 'dfzh'
# tmp = account_trade.groupby(['zhdh', 'jyrq', 'jdbj'])[col].nunique().reset_index()
# tmp = tmp.pivot(index=['zhdh', 'jyrq'], columns=['jdbj'], values=[col])
# tmp.columns = [f'jdbj_0_{col}', f'jdbj_1_{col}']

# tmp = tmp.reset_index().fillna(0)
# tmp[col] = tmp[f'jdbj_0_{col}'] + tmp[f'jdbj_1_{col}']
# tmp[f'jdbj_0_{col}_ratio'] = tmp[f'jdbj_0_{col}'] / tmp[col]
# tmp[f'jdbj_1_{col}_ratio'] = tmp[f'jdbj_1_{col}'] / tmp[col]
# tmp = tmp.groupby('zhdh')[[f'jdbj_0_{col}_ratio', f'jdbj_1_{col}_ratio']].sum().reset_index()
# fea = fea.merge(tmp, how='left', on='zhdh')

In [49]:
# with open('w2v_fea_dfhh.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('tfidf_fea_dfhh.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [50]:
with open('w2v_fea_jyqd.pkl', 'rb') as file:
    w2v_fea = pickle.load(file)
fea = fea.merge(w2v_fea, how='left', on='zhdh')

with open('tfidf_fea_jyqd.pkl', 'rb') as file:
    tfidf_fea = pickle.load(file)
fea = fea.merge(tfidf_fea, how='left', on='zhdh')

# with open('cnt_fea_jyqd.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

# with open('w2v_fea_jyqd_0.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('w2v_fea_jyqd_1.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('tfidf_fea_jyqd_0.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

# with open('tfidf_fea_jyqd_1.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [51]:
with open('w2v_fea_zydh.pkl', 'rb') as file:
    w2v_fea = pickle.load(file)
fea = fea.merge(w2v_fea, how='left', on='zhdh')

with open('tfidf_fea_zydh.pkl', 'rb') as file:
    tfidf_fea = pickle.load(file)
fea = fea.merge(tfidf_fea, how='left', on='zhdh')

# with open('cnt_fea_zydh.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

# with open('w2v_fea_zydh_0.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('w2v_fea_zydh_1.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('tfidf_fea_zydh_0.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

# with open('tfidf_fea_zydh_1.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [52]:
# with open('w2v_fea_dfzh.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('tfidf_fea_dfzh.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [53]:
# with open('w2v_fea_jyje_label.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('tfidf_fea_jyje_label.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

# with open('cnt_fea_jyje_label.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [54]:
account_static['khrq']  = pd.to_datetime(account_static['khrq'], format='%Y-%m-%d')
account_static['year']  = account_static['khrq'].dt.year
account_static['month'] = account_static['khrq'].dt.month
account_static['day']   = account_static['khrq'].dt.day

In [55]:
df = account_static.merge(fea, how='left', on='zhdh')

In [56]:
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))

for col in ['khrq', 'khjgdh'] + [f'most_{c}' for c in cat_cols] + [f'most_jyje_{c}' for c in cat_cols] +\
    [f'jdbj0_most_{c}' for c in cat_cols] + [f'jdbj1_most_{c}' for c in cat_cols] +\
    [f'jdbj0_most_jyje_{c}' for c in cat_cols] + [f'jdbj1_most_jyje_{c}' for c in cat_cols]:
    df[col] = label_encode(df[col])

In [57]:
df.shape

(6000, 485)

In [58]:
df.to_pickle('fea.pkl')

In [59]:
train = df[df['zhdh'].isin(y_train['zhdh'].values)]
test_ids = pd.read_csv('./data/test_dataset.csv')['zhdh'].values
test = df[df['zhdh'].isin(test_ids)]

In [60]:
target = 'black_flag'
features = [c for c in train.columns if c not in [target, 'zhdh',]]

In [61]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier

In [62]:
import xgboost as xgb

In [63]:
# FOLDS = 5
# folds = KFold(n_splits=FOLDS, shuffle=True, random_state=2023)

# oof_pred = np.zeros((len(train),))
# y_pred = np.zeros((len(test),))

# for fold, (tr_ind, val_ind) in enumerate(folds.split(train)):
    
#     x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
#     y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
#     train_matrix = xgb.DMatrix(x_train , label=y_train)
#     valid_matrix = xgb.DMatrix(x_val , label=y_val)
#     test_matrix = xgb.DMatrix(test[features])
            
#     params = {'booster': 'gbtree',
#               'objective': 'binary:logistic',
#               'eval_metric': 'auc',
#               'gamma': 1,
#               'min_child_weight': 1.5,
#               'max_depth': 5,
#               'lambda': 10,
#               'subsample': 0.7,
#               'colsample_bytree': 0.7,
#               'colsample_bylevel': 0.7,
#               'eta': 0.05,
#               'tree_method': 'exact',
#               'seed': 2023,
#               'nthread': 8
#               }

#     watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

#     model = xgb.train(params, train_matrix, num_boost_round=10000, evals=watchlist,
#                       verbose_eval=1000, early_stopping_rounds=500)
#     val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
#     test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
    
#     oof_pred[val_ind] = val_pred
#     y_pred += test_pred / FOLDS

In [64]:
# from sklearn.metrics import f1_score
# f1_score(train[target].values, (oof_pred > 0.5) + 0)

In [65]:
# oof = oof_pred
# scores = []; thresholds = []
# best_score = 0; best_threshold = 0

# for threshold in np.arange(0.4,0.6,0.01):
#     preds = (oof.reshape((-1))>threshold).astype('int')
#     m = f1_score(train['black_flag'].values.reshape((-1)), preds, average='macro')   
#     scores.append(m)
#     thresholds.append(threshold)
#     if m>best_score:
#         best_score = m
#         best_threshold = threshold
#     print(f'{threshold:.02f}, {m}')
# print(f'{best_threshold:.02f}, {best_score}')

In [66]:
# test_result = test[['zhdh', 'black_flag']].copy()
# test_result['black_flag'] = y_pred
# test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > best_threshold)+0)

In [67]:
# result = pd.read_csv('./data/test_dataset.csv')
# result = result.merge(test_result, how='left', on='zhdh')
# result.to_csv('result.csv', index=False)

In [68]:
for c in [f'jdbj0_most_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)

for c in [f'jdbj1_most_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)

for c in [f'jdbj0_most_jyje_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)
    
for c in [f'jdbj1_most_jyje_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [69]:
seeds = [0, 1, 2, 42, 2023, 3407]

In [70]:
oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))
# y_preds = []

for seed in seeds:
    FOLDS = 5
    folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)

    for fold, (tr_ind, val_ind) in enumerate(folds.split(train, train[target])):
        x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
        y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]

        classes = np.unique(y_train)
        weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
        class_weights = dict(zip(classes, weights))

        params = {
            'task_type': 'CPU',
            'bootstrap_type': 'Bayesian',
            'boosting_type': 'Plain',
            'learning_rate': 0.01,
            'eval_metric': 'Logloss',
            'loss_function': 'Logloss',
            'iterations': 10000,
            'random_state': seed,
            'depth': 6,
            'leaf_estimation_iterations': 8,
            'reg_lambda': 5,
            'early_stopping_rounds': 100,
            'class_weights': class_weights,
            'cat_features': ['khjgdh', 'xb', '年龄'] + [f'most_{c}' for c in cat_cols] + \
                            [f'most_jyje_{c}' for c in cat_cols] +\
                            [f'jdbj0_most_{c}' for c in cat_cols] +\
                            [f'jdbj1_most_{c}' for c in cat_cols] +\
                            [f'jdbj0_most_jyje_{c}' for c in cat_cols] +\
                            [f'jdbj1_most_jyje_{c}' for c in cat_cols],
        }
        model = CatBoostClassifier(**params)
        model.fit(x_train,
                   y_train,
                   eval_set=(x_val, y_val),
                   verbose=100)
        oof_pred[val_ind] += model.predict_proba(x_val)[:, 1]
        y_pred += model.predict_proba(test[features])[:, 1] / FOLDS
#         y_preds.append(model.predict_proba(test[features])[:, 1])
        

oof_pred /= len(seeds)
y_pred /= len(seeds)

0:	learn: 0.6838441	test: 0.6852537	best: 0.6852537 (0)	total: 108ms	remaining: 18m 1s
100:	learn: 0.3063309	test: 0.3532841	best: 0.3532841 (100)	total: 4.34s	remaining: 7m 5s
200:	learn: 0.2200979	test: 0.2874013	best: 0.2874013 (200)	total: 8.6s	remaining: 6m 59s
300:	learn: 0.1757723	test: 0.2626323	best: 0.2626323 (300)	total: 13.2s	remaining: 7m 6s
400:	learn: 0.1485484	test: 0.2507773	best: 0.2507609 (399)	total: 18s	remaining: 7m 10s
500:	learn: 0.1264931	test: 0.2447973	best: 0.2447973 (500)	total: 22.8s	remaining: 7m 13s
600:	learn: 0.1087917	test: 0.2406391	best: 0.2406391 (600)	total: 27.8s	remaining: 7m 15s
700:	learn: 0.0946819	test: 0.2385716	best: 0.2385716 (700)	total: 32.9s	remaining: 7m 15s
800:	learn: 0.0814813	test: 0.2391890	best: 0.2383760 (713)	total: 37.9s	remaining: 7m 15s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2383759685
bestIteration = 713

Shrink model to first 714 iterations.
0:	learn: 0.6873712	test: 0.6865407	best: 0.6865407

300:	learn: 0.1728520	test: 0.2360774	best: 0.2360774 (300)	total: 15.5s	remaining: 8m 19s
400:	learn: 0.1463469	test: 0.2248981	best: 0.2248981 (400)	total: 20.7s	remaining: 8m 15s
500:	learn: 0.1260283	test: 0.2173860	best: 0.2173860 (500)	total: 26s	remaining: 8m 13s
600:	learn: 0.1101263	test: 0.2121600	best: 0.2120540 (598)	total: 31.3s	remaining: 8m 10s
700:	learn: 0.0955659	test: 0.2088528	best: 0.2088528 (700)	total: 36.7s	remaining: 8m 6s
800:	learn: 0.0823016	test: 0.2049668	best: 0.2045737 (784)	total: 41.9s	remaining: 8m 1s
900:	learn: 0.0717952	test: 0.2033069	best: 0.2033044 (899)	total: 47.2s	remaining: 7m 56s
1000:	learn: 0.0630416	test: 0.2024769	best: 0.2024769 (1000)	total: 52.5s	remaining: 7m 51s
1100:	learn: 0.0560311	test: 0.2017147	best: 0.2017147 (1100)	total: 57.8s	remaining: 7m 46s
1200:	learn: 0.0496907	test: 0.2020441	best: 0.2011414 (1149)	total: 1m 2s	remaining: 7m 41s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2011413967
bestIter

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2333411267
bestIteration = 840

Shrink model to first 841 iterations.
0:	learn: 0.6854983	test: 0.6864337	best: 0.6864337 (0)	total: 45.1ms	remaining: 7m 31s
100:	learn: 0.3031005	test: 0.3559391	best: 0.3559391 (100)	total: 5.17s	remaining: 8m 27s
200:	learn: 0.2106583	test: 0.2929019	best: 0.2929019 (200)	total: 10.3s	remaining: 8m 24s
300:	learn: 0.1702450	test: 0.2727547	best: 0.2727547 (300)	total: 15.6s	remaining: 8m 24s
400:	learn: 0.1421902	test: 0.2628850	best: 0.2628850 (400)	total: 21.4s	remaining: 8m 33s
500:	learn: 0.1233428	test: 0.2581278	best: 0.2580935 (499)	total: 28.2s	remaining: 8m 55s
600:	learn: 0.1086485	test: 0.2534061	best: 0.2534061 (600)	total: 33.6s	remaining: 8m 44s
700:	learn: 0.0944984	test: 0.2528024	best: 0.2525449 (691)	total: 38.9s	remaining: 8m 36s
800:	learn: 0.0834323	test: 0.2526488	best: 0.2525361 (706)	total: 44.3s	remaining: 8m 29s
Stopped by overfitting detector  (100 iterati

1100:	learn: 0.0567086	test: 0.1983834	best: 0.1978686 (1066)	total: 58.3s	remaining: 7m 51s
1200:	learn: 0.0509073	test: 0.1983758	best: 0.1978285 (1161)	total: 1m 3s	remaining: 7m 44s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1978284545
bestIteration = 1161

Shrink model to first 1162 iterations.
0:	learn: 0.6861170	test: 0.6866279	best: 0.6866279 (0)	total: 19.9ms	remaining: 3m 18s
100:	learn: 0.3118336	test: 0.3364370	best: 0.3364370 (100)	total: 4.8s	remaining: 7m 50s
200:	learn: 0.2192502	test: 0.2635280	best: 0.2635280 (200)	total: 9.85s	remaining: 8m
300:	learn: 0.1784349	test: 0.2368027	best: 0.2368027 (300)	total: 14.8s	remaining: 7m 57s
400:	learn: 0.1516786	test: 0.2250722	best: 0.2250722 (400)	total: 19.9s	remaining: 7m 56s
500:	learn: 0.1313946	test: 0.2177819	best: 0.2176264 (497)	total: 24.8s	remaining: 7m 50s
600:	learn: 0.1145784	test: 0.2126670	best: 0.2126591 (596)	total: 29.8s	remaining: 7m 45s
700:	learn: 0.1011851	test: 0.2085759	best: 

In [71]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.5) + 0)

0.8648648648648648

In [72]:
oof = oof_pred
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.6,0.01):
    preds = (oof.reshape((-1))>threshold).astype('int')
    m = f1_score(train['black_flag'].values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold
    print(f'{threshold:.02f}, {m}')
print(f'{best_threshold:.02f}, {best_score}')

0.40, 0.9062905211772525
0.41, 0.9070847851335656
0.42, 0.906885965432784
0.43, 0.9078826161337017
0.44, 0.9086840882023692
0.45, 0.9106905768451539
0.46, 0.9104951771449619
0.47, 0.9092906975029129
0.48, 0.9086856298649957
0.49, 0.9096982416240937
0.50, 0.9103085386271227
0.51, 0.9142054028018709
0.52, 0.9138058135710478
0.53, 0.9100875508247193
0.54, 0.9111209021608255
0.55, 0.9098748800946124
0.56, 0.9096605122096486
0.57, 0.9104833896956435
0.58, 0.9123562334706932
0.59, 0.9123562334706932
0.51, 0.9142054028018709


In [73]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > 0.5)+0)

In [74]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [75]:
pd.DataFrame(model.feature_importances_, features).sort_values(by=0, ascending=False).to_csv('fi.csv')

In [95]:
tmp = pd.DataFrame(y_preds).T

In [96]:
results = []
for i in range(7):
    res = (tmp.iloc[:, 5*i:5*(i+1)].mean(axis=1) > 0.5) + 0
    results.append(res)

In [97]:
res = pd.concat(results, axis=1).sum(axis=1)
res = (res >= 4) + 0

In [98]:
test_result['black_flag'] = res.values

In [99]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [67]:
for c in [f'jdbj0_most_{c}' for c in cat_cols]:
    train[c] = train[c].astype(float)
    test[c] = test[c].astype(float)

for c in [f'jdbj1_most_{c}' for c in cat_cols]:
    train[c] = train[c].astype(float)
    test[c] = test[c].astype(float)

for c in [f'jdbj0_most_jyje_{c}' for c in cat_cols]:
    train[c] = train[c].astype(float)
    test[c] = test[c].astype(float)
    
for c in [f'jdbj1_most_jyje_{c}' for c in cat_cols]:
    train[c] = train[c].astype(float)
    test[c] = test[c].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [68]:
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation

In [96]:
FOLDS = 5
folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=2023)

oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(train, train[target])):
    x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
    y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_val, label=y_val)

    parameters = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary',
        'min_child_weight': 5,
        'max_depth': 6,
        'num_leaves': 64,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 2,
        'colsample_bytree': 0.9,
        'reg_alpha': 3,
        'reg_lambda': 3,
        'learning_rate': 0.01,
        'seed': 2023,
        'nthread': 32,
        'n_jobs':8,
        'silent': True,
        'verbose': -1,
    }

    model = lgb.train(
        parameters,
        dtrain,
        num_boost_round=8000,
        valid_sets=[dval],
        callbacks=[early_stopping(100), log_evaluation(100)],

    )
    oof_pred[val_ind] = model.predict(x_val)
    y_pred += model.predict(test[features]) / FOLDS

Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.310287
[200]	valid_0's binary_logloss: 0.240729
[300]	valid_0's binary_logloss: 0.209395
[400]	valid_0's binary_logloss: 0.194702
[500]	valid_0's binary_logloss: 0.185489
[600]	valid_0's binary_logloss: 0.180145
[700]	valid_0's binary_logloss: 0.176678
[800]	valid_0's binary_logloss: 0.174215
[900]	valid_0's binary_logloss: 0.173425
[1000]	valid_0's binary_logloss: 0.172082
[1100]	valid_0's binary_logloss: 0.171615
[1200]	valid_0's binary_logloss: 0.170689
[1300]	valid_0's binary_logloss: 0.170623
[1400]	valid_0's binary_logloss: 0.170838
Early stopping, best iteration is:
[1308]	valid_0's binary_logloss: 0.170501
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.328244
[200]	valid_0's binary_logloss: 0.253559
[300]	valid_0's binary_logloss: 0.217878
[400]	valid_0's binary_logloss: 0.199002
[500]	valid_0's binary_logloss: 0.188422
[600]	valid_0's

In [103]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.48) + 0)

0.8641975308641975

In [104]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > 0.48)+0)

In [105]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [82]:
test['black_flag'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [83]:
new_train = pd.concat([train, test[(test['black_flag'] > 0.98) | (test['black_flag'] < 0.02)].copy()])
new_test = test[~((test['black_flag'] > 0.98) | (test['black_flag'] < 0.02))].copy()

In [84]:
new_train['black_flag'] = (new_train['black_flag'] > 0) + 0

In [85]:
FOLDS = 5
folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=2023)

oof_pred = np.zeros((len(new_train),))
y_pred = np.zeros((len(new_test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(new_train, new_train[target])):
    x_train, x_val = new_train.iloc[tr_ind][features], new_train.iloc[val_ind][features]
    y_train, y_val = new_train.iloc[tr_ind][target], new_train.iloc[val_ind][target]
    
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    
    params = {
        'task_type': 'CPU',
        'bootstrap_type': 'Bayesian',
        'boosting_type': 'Plain',
        'learning_rate': 0.01,
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'iterations': 10000,
        'random_state': 42,
        'depth': 6,
        'leaf_estimation_iterations': 8,
        'reg_lambda': 5,
        'early_stopping_rounds': 100,
        'class_weights': class_weights,
        'cat_features': ['khjgdh', 'xb', '年龄'] + [f'most_{c}' for c in cat_cols] + \
                        [f'most_jyje_{c}' for c in cat_cols],
    }
    model = CatBoostClassifier(**params)
    model.fit(x_train,
               y_train,
               eval_set=(x_val, y_val),
               verbose=100)
    oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]
    y_pred += model.predict_proba(new_test[features])[:, 1] / FOLDS

0:	learn: 0.6878241	test: 0.6885613	best: 0.6885613 (0)	total: 65.1ms	remaining: 10m 50s
100:	learn: 0.4417971	test: 0.4807257	best: 0.4807257 (100)	total: 4.09s	remaining: 6m 40s
200:	learn: 0.3834371	test: 0.4518828	best: 0.4518828 (200)	total: 7.25s	remaining: 5m 53s
300:	learn: 0.3470120	test: 0.4400088	best: 0.4400088 (300)	total: 9.79s	remaining: 5m 15s
400:	learn: 0.3193859	test: 0.4321788	best: 0.4321496 (399)	total: 12.4s	remaining: 4m 56s
500:	learn: 0.2951583	test: 0.4267661	best: 0.4266982 (499)	total: 15.1s	remaining: 4m 45s
600:	learn: 0.2751183	test: 0.4230531	best: 0.4230531 (600)	total: 17.7s	remaining: 4m 36s
700:	learn: 0.2566864	test: 0.4197406	best: 0.4194264 (687)	total: 20.4s	remaining: 4m 30s
800:	learn: 0.2390636	test: 0.4190372	best: 0.4186072 (792)	total: 23s	remaining: 4m 24s
900:	learn: 0.2198738	test: 0.4158454	best: 0.4158359 (898)	total: 25.7s	remaining: 4m 20s
1000:	learn: 0.2019708	test: 0.4146883	best: 0.4144218 (981)	total: 28.5s	remaining: 4m 15s
11

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3764534016
bestIteration = 1616

Shrink model to first 1617 iterations.
0:	learn: 0.6883180	test: 0.6885580	best: 0.6885580 (0)	total: 39.3ms	remaining: 6m 33s
100:	learn: 0.4491532	test: 0.4859101	best: 0.4859101 (100)	total: 2.78s	remaining: 4m 32s
200:	learn: 0.3884458	test: 0.4518818	best: 0.4518818 (200)	total: 5.54s	remaining: 4m 30s
300:	learn: 0.3556593	test: 0.4401891	best: 0.4401619 (299)	total: 8.28s	remaining: 4m 26s
400:	learn: 0.3293234	test: 0.4315159	best: 0.4315159 (400)	total: 11s	remaining: 4m 23s
500:	learn: 0.3064125	test: 0.4239739	best: 0.4239443 (499)	total: 13.8s	remaining: 4m 20s
600:	learn: 0.2852920	test: 0.4163919	best: 0.4163919 (600)	total: 16.5s	remaining: 4m 18s
700:	learn: 0.2668955	test: 0.4128495	best: 0.4127409 (694)	total: 19.3s	remaining: 4m 16s
800:	learn: 0.2485649	test: 0.4072890	best: 0.4072146 (799)	total: 22.2s	remaining: 4m 14s
900:	learn: 0.2297202	test: 0.4023747	best: 0

In [88]:
from sklearn.metrics import f1_score
f1_score(new_train[target].values, (oof_pred > 0.5) + 0)

NameError: name 'new_train' is not defined

In [88]:
new_test_result = new_test[['zhdh', 'black_flag']].copy()
new_test_result['black_flag'] = y_pred
new_test_result['black_flag'] = new_test_result['black_flag'].apply(lambda x: (x > 0.52)+0)

In [89]:
new_test_result

Unnamed: 0,zhdh,black_flag
0,DDF394282B1E1508,0
2,41E4A8AECE47E5F3,0
4,6FBFEB03252FDB9F,0
5,4DEA40CF785FA423,0
6,8712DEE79BAE5383,1
...,...,...
5995,0228778D98151DEF,0
5996,FF83E6CFE3916793,0
5997,2947A98F10140EE0,0
5998,6EF9CDFEB8C86119,0


In [94]:
old_test_result = test[(test['black_flag'] > 0.98) | (test['black_flag'] < 0.02)][['zhdh', 'black_flag']].copy()

In [95]:
old_test_result['black_flag'] = old_test_result['black_flag'].apply(lambda x: (x > 0.52)+0)

In [97]:
new_test_result = pd.concat([new_test_result, old_test_result])
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(new_test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [98]:
result

Unnamed: 0,zhdh,black_flag
0,B6751CD225DD4886,0
1,8265CA869E4AFF16,0
2,06DD2A17463919FC,0
3,F36469EA4C5BD7FC,0
4,A18F7ACD7A3853D1,0
...,...,...
4795,5E1C4461BC135745,0
4796,0507C0E6649E637C,0
4797,2CB5C631D0BE5241,0
4798,193FE069CEA9AD06,0


In [52]:
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation

In [60]:
FOLDS = 5
folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=2023)

oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(train, train[target])):
    x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
    y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_val, label=y_val)

    parameters = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'tree_learner':'serial',
        'metric': 'auc',
        'min_child_weight': 4,
        'num_leaves': 64,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.02,
        'seed': 2023,
        'nthread': 32,
        'n_jobs':8,
        'silent': True,
        'verbose': -1,
    }

    model = lgb.train(
        parameters,
        dtrain,
        num_boost_round=8000,
        valid_sets=[dval],
        callbacks=[early_stopping(100), log_evaluation(100)],

    )
    oof_pred[val_ind] = model.predict(x_val)
    y_pred += model.predict(test[features]) / FOLDS

Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.977778
[200]	valid_0's auc: 0.976852
Early stopping, best iteration is:
[144]	valid_0's auc: 0.979167
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.959444
[200]	valid_0's auc: 0.965
[300]	valid_0's auc: 0.969444
[400]	valid_0's auc: 0.973148
[500]	valid_0's auc: 0.973796
[600]	valid_0's auc: 0.972315
Early stopping, best iteration is:
[533]	valid_0's auc: 0.974722
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.905741
[200]	valid_0's auc: 0.921667
[300]	valid_0's auc: 0.923889
Early stopping, best iteration is:
[282]	valid_0's auc: 0.925278
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.968519
Early stopping, best iteration is:
[80]	valid_0's auc: 0.97037
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.964259
[200]	valid_0's auc: 0.967315
[300]	valid_0's

In [61]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.5) + 0)

0.8648648648648648

In [62]:
oof = oof_pred
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.6,0.01):
    preds = (oof.reshape((-1))>threshold).astype('int')
    m = f1_score(train['black_flag'].values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold
    print(f'{threshold:.02f}, {m}')
print(f'{best_threshold:.02f}, {best_score}')

0.40, 0.9102985412158333
0.41, 0.9123638153608021
0.42, 0.9133998533874712
0.43, 0.916322068335067
0.44, 0.9150739850958669
0.45, 0.9161207975418415
0.46, 0.9157133908906958
0.47, 0.9165623696287026
0.48, 0.9150898680694286
0.49, 0.9136048941659953
0.50, 0.9121072291804
0.51, 0.9095351339496823
0.52, 0.9082439299830604
0.53, 0.9043494433668996
0.54, 0.9030442528101401
0.55, 0.9041033316199533
0.56, 0.9041033316199533
0.57, 0.8985800208606975
0.58, 0.8969809015369012
0.59, 0.8956449770149066
0.47, 0.9165623696287026


In [63]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > best_threshold)+0)

In [64]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [6]:
fea = account_trade.groupby('zhdh').size().reset_index().rename(columns={0: 'trade_cnt'})

In [7]:
tmp = account_trade.groupby(['zhdh', 'jdbj']).size().reset_index()\
    .pivot_table(index='zhdh', columns=['jdbj'], values=[0])
tmp.columns = ['jdbj_0_cnt', 'jdbj_1_cnt']
tmp = tmp.reset_index()

In [8]:
fea = fea.merge(tmp, how='left', on='zhdh')
fea['jdbj_0_pct'] = fea['jdbj_0_cnt'] / fea['trade_cnt']
fea['jdbj_1_pct'] = fea['jdbj_1_cnt'] / fea['trade_cnt']

In [9]:
tmp = account_trade.groupby('zhdh')['jyje'].mean().reset_index().rename(columns={'jyje': 'jyje_mean'})
fea = fea.merge(tmp, how='left', on='zhdh')

tmp = account_trade.groupby('zhdh')['jyje'].std().reset_index().rename(columns={'jyje': 'jyje_std'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [10]:
tmp = account_trade.groupby('zhdh')['jyqd'].nunique().reset_index().rename(columns={'jyqd': 'jyqd_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [11]:
tmp = account_trade.groupby('zhdh')['dfzh'].nunique().reset_index().rename(columns={'dfzh': 'dfzh_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [12]:
tmp = account_trade.groupby(['zhdh', 'jdbj'])['jyje'].mean().reset_index()\
    .pivot_table(index='zhdh', columns=['jdbj'], values=['jyje'])
tmp.columns = ['jdbj_0_jyje', 'jdbj_1_jyje']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [13]:
tmp = account_trade.groupby('zhdh')['dfhh'].nunique().reset_index().rename(columns={'dfhh': 'dfhh_unique_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [14]:
tmp = account_trade.groupby('zhdh')['dfmccd'].mean().reset_index().rename(columns={'dfmccd': 'dfmccd_mean'})
fea = fea.merge(tmp, how='left', on='zhdh')

tmp = account_trade.groupby('zhdh')['dfmccd'].std().reset_index().rename(columns={'dfmccd': 'dfmccd_std'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [15]:
account_trade['jyrq'] = pd.to_datetime(account_trade['jyrq'])

In [16]:
account_trade['dayofweek'] = account_trade['jyrq'].dt.dayofweek
account_trade['is_wknd'] = account_trade['jyrq'].dt.dayofweek // 6
account_trade['is_month_start'] = account_trade['jyrq'].dt.is_month_start.astype(int)
account_trade['is_month_end'] = account_trade['jyrq'].dt.is_month_end.astype(int)

In [17]:
for col in ['dayofweek', 'is_wknd', 'is_month_start', 'is_month_end']:
    tmp = account_trade.groupby('zhdh')[col].mean().reset_index().rename(columns={col: f'{col}_std'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [18]:
account_trade['jy_hour'] = account_trade['jysj'].str[:2].astype(int)
tmp = account_trade.groupby('zhdh')['jy_hour'].mean().reset_index().rename(columns={'jy_hour': 'jy_hour_mean'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [19]:
account_trade['jyje_is_int'] = (account_trade['jyje'] == account_trade['jyje'].astype(int)) + 0
tmp = account_trade.groupby('zhdh')['jyje_is_int'].sum() / account_trade.groupby('zhdh').size()
tmp = tmp.reset_index().rename(columns={0: 'jyje_int_pct'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [20]:
tmp = account_trade.groupby('zhdh')['zydh'].nunique().reset_index().rename(columns={'zydh': 'zydh_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [21]:
for col in ['dfhh', 'jyqd', 'zydh']:
    tmp = account_trade.groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[0][col])\
        .reset_index().rename(columns={0: f'most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [22]:
df = account_static.merge(fea, how='left', on='zhdh')

In [23]:
y_train = pd.read_csv('./data/训练集标签.csv')

In [24]:
df = df.merge(y_train, how='left', on='zhdh')

In [25]:
train = df[df['zhdh'].isin(y_train['zhdh'].values)]

In [26]:
test_ids = pd.read_csv('./data/test_dataset.csv')['zhdh'].values

In [27]:
test = df[df['zhdh'].isin(test_ids)]

In [28]:
target = 'black_flag'
features = [c for c in train.columns if c not in [target, 'zhdh', 'khrq']]

In [33]:
from sklearn.model_selection import KFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier

In [34]:
FOLDS = 5
folds = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(train)):
    x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
    y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    
    params = {
        'task_type': 'CPU',
        'bootstrap_type': 'Bayesian',
        'boosting_type': 'Plain',
        'learning_rate': 0.01,
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'iterations': 10000,
        'random_state': 42,
        'depth': 6,
        'leaf_estimation_iterations': 8,
        'reg_lambda': 5,
        'early_stopping_rounds': 100,
        'class_weights': class_weights,
        'cat_features': ['khjgdh', 'xb', '年龄', 'most_dfhh', 'most_jyqd', 'most_zydh'],
    }
    model = CatBoostClassifier(**params)
    model.fit(x_train,
               y_train,
               eval_set=(x_val, y_val),
               verbose=100)
    oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]
    y_pred += model.predict_proba(test[features])[:, 1] / FOLDS

0:	learn: 0.6878285	test: 0.6880691	best: 0.6880691 (0)	total: 68.7ms	remaining: 11m 27s
100:	learn: 0.4164203	test: 0.4695984	best: 0.4695984 (100)	total: 443ms	remaining: 43.4s
200:	learn: 0.3238651	test: 0.4059339	best: 0.4059339 (200)	total: 811ms	remaining: 39.5s
300:	learn: 0.2726680	test: 0.3751292	best: 0.3751292 (300)	total: 1.25s	remaining: 40.3s
400:	learn: 0.2361264	test: 0.3582961	best: 0.3582961 (400)	total: 1.6s	remaining: 38.4s
500:	learn: 0.2070088	test: 0.3449625	best: 0.3449625 (500)	total: 1.95s	remaining: 37s
600:	learn: 0.1836315	test: 0.3390327	best: 0.3390327 (600)	total: 2.3s	remaining: 36s
700:	learn: 0.1630827	test: 0.3338649	best: 0.3338649 (700)	total: 2.65s	remaining: 35.2s
800:	learn: 0.1452078	test: 0.3311609	best: 0.3310738 (796)	total: 3.02s	remaining: 34.7s
900:	learn: 0.1272554	test: 0.3299845	best: 0.3293977 (877)	total: 3.39s	remaining: 34.3s
1000:	learn: 0.1140597	test: 0.3289071	best: 0.3286708 (970)	total: 3.75s	remaining: 33.7s
Stopped by overf

In [42]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.5) + 0)

0.7824620573355818

In [43]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > 0.5)+0)

In [44]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [45]:
result[result['black_flag'] == 0].shape, result[result['black_flag'] == 1].shape

((3595, 2), (1205, 2))

In [46]:
train[train['black_flag'] == 0].shape, train[train['black_flag'] == 1].shape,

((900, 30), (300, 30))

In [42]:
pd.DataFrame(model.feature_importances_, features).sort_values(by=0, ascending=False).loc['last_day_trade_cnt']

0    0.271546
Name: last_day_trade_cnt, dtype: float64