In [1]:
import os
import pickle
import numpy as np
import pandas as pd

In [2]:
from tsfresh.feature_extraction.feature_calculators import longest_strike_above_mean, mean_abs_change, \
    mean_second_derivative_central, sample_entropy, benford_correlation, count_above_mean,\
    percentage_of_reoccurring_datapoints_to_all_datapoints

In [3]:
account_static = pd.read_csv('./data/账户静态信息.csv')

In [4]:
account_trade = pd.read_csv('./data/账户交易信息.csv')

In [5]:
y_train = pd.read_csv('./data/训练集标签.csv')

In [6]:
y_test = pd.read_csv('./data/test_dataset.csv')

In [7]:
def get_time_feature(df, col):
    
    df_copy = df.copy()
    prefix = col + "_"
    df_copy['new_'+col] = df_copy[col].astype(str)
    
    col = 'new_'+col
    df_copy[col] = pd.to_datetime(df_copy[col], format='%Y-%m-%d')
    df_copy[prefix + 'month'] = df_copy[col].dt.month
    df_copy[prefix + 'day'] = df_copy[col].dt.day
    df_copy[prefix + 'weekofyear'] = df_copy[col].dt.weekofyear
    df_copy[prefix + 'dayofyear'] = df_copy[col].dt.dayofyear
    df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 6
    df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    del df_copy[col]
    
    df_copy[prefix + 'hour'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[0]))
    df_copy[prefix + 'minu'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[1]))
    df_copy[prefix + 'date'] = df_copy['jysj'].apply(lambda x:int(x.split(':')[0])*60 + int(x.split(':')[1]))
    
    return df_copy   

account_trade = get_time_feature(account_trade, "jyrq")
time_cols = [f for f in account_trade.columns if 'jyrq_' in f]
print(time_cols)


  # This is added back by InteractiveShellApp.init_path()


['jyrq_month', 'jyrq_day', 'jyrq_weekofyear', 'jyrq_dayofyear', 'jyrq_dayofweek', 'jyrq_is_wknd', 'jyrq_is_month_start', 'jyrq_is_month_end', 'jyrq_hour', 'jyrq_minu', 'jyrq_date']


In [8]:
def get_base_feat(df1_, df2_):
    df1 = df1_.copy() # 构建特征数据 
    df2 = df2_.copy()

    agg_func = {
        'dfzh': ['nunique','count'],
        'dfhh': ['nunique'],
        'jyqd': ['nunique'],
        'zydh': ['nunique'],
        'jyje': ['sum','mean','median','max','min','std',np.ptp,
                longest_strike_above_mean, mean_abs_change, mean_second_derivative_central,
                 sample_entropy, benford_correlation, count_above_mean,
                 percentage_of_reoccurring_datapoints_to_all_datapoints],
#         'jyje': ['sum','mean','max','min','std',np.ptp],
        'zhye': ['sum','mean', 'median', 'max','min','std',np.ptp],
        'dfmccd': ['mean', 'median', 'max','min','std',np.ptp],
    }
    
    for col in time_cols:
        agg_func[col] = ['mean','min','max',np.ptp]
    
    agg_df = df1[df1['jdbj']==0].groupby(['zhdh']).agg(agg_func).reset_index()
    agg_df.columns = ['zhdh'] + ['zhdh_jdbj0_' + '_'.join(f).strip()
                                 for f in agg_df.columns.values if f[0] not in ['zhdh']]
    df2 = df2.merge(agg_df, on=['zhdh'], how='left')
    
    agg_df = df1[df1['jdbj']==1].groupby(['zhdh']).agg(agg_func).reset_index()
    agg_df.columns = ['zhdh'] + ['zhdh_jdbj1_' + '_'.join(f).strip()
                                 for f in agg_df.columns.values if f[0] not in ['zhdh']]
    df2 = df2.merge(agg_df, on=['zhdh'], how='left')
    
    return df2

if os.path.exists('train_label.pkl'):
    with open('train_label.pkl', 'rb') as file:
        train_label = pickle.load(file)
    with open('test_label.pkl', 'rb') as file:
        test_label = pickle.load(file)
else:
    train_label = get_base_feat(account_trade, y_train)
    test_label = get_base_feat(account_trade, y_test)
    with open('train_label.pkl', 'wb') as file:
        pickle.dump(train_label, file)
    with open('test_label.pkl', 'wb') as file:
        pickle.dump(test_label, file)

In [9]:
train_label = train_label.replace([np.inf, -np.inf], np.nan)
test_label = test_label.replace([np.inf, -np.inf], np.nan)

In [10]:
fea = pd.concat([train_label, test_label])

In [11]:
account_trade['jyje_is_int'] = (account_trade['jyje'] == account_trade['jyje'].astype(int)) + 0
tmp = account_trade.groupby('zhdh')['jyje_is_int'].sum() / account_trade.groupby('zhdh').size()
tmp = tmp.reset_index().rename(columns={0: 'jyje_int_pct'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [12]:
account_trade['dt'] = account_trade['jyrq'] + ' ' + account_trade['jysj']
account_trade['dt'] = pd.to_datetime(account_trade['dt'])

In [13]:
account_trade['month'] = account_trade['dt'].dt.month
account_trade['day'] = account_trade['dt'].dt.day
account_trade['weekofyear'] = account_trade['dt'].dt.weekofyear
account_trade['dayofweek'] = account_trade['dt'].dt.dayofweek

account_trade['is_wknd'] = account_trade['dt'].dt.dayofweek // 6
account_trade['is_month_start'] = account_trade['dt'].dt.is_month_start.astype(int)
account_trade['is_month_end'] = account_trade['dt'].dt.is_month_end.astype(int)

account_trade['hour'] = account_trade['jysj'].apply(lambda x:int(x.split(':')[0]))
account_trade['minu'] = account_trade['jysj'].apply(lambda x:int(x.split(':')[1]))

  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
account_trade['jyje_label'] = pd.qcut(account_trade['jyje'], 10, labels=range(10))

In [15]:
cat_cols = ['dfzh', 'dfhh', 'jyqd', 'zydh', 'jyje_label',
            'month', 'day', 'weekofyear', 'dayofweek', 'is_wknd',
            'is_month_start', 'is_month_end', 'hour', 'minu']

In [16]:
for col in cat_cols:
    tmp = account_trade.groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[-1][col])\
        .reset_index().rename(columns={0: f'most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 0].groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[-1][col])\
        .reset_index().rename(columns={0: f'jdbj0_most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 1].groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[-1][col])\
        .reset_index().rename(columns={0: f'jdbj1_most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [17]:
# for col in ['dfhh', 'jyqd', 'zydh']:
#     tmp = account_trade.groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
#         .apply(lambda x: x.sort_values(by=0).iloc[0][col])\
#         .reset_index().rename(columns={0: f'most_{col}'})
#     fea = fea.merge(tmp, how='left', on='zhdh')

In [18]:
for col in cat_cols:
    tmp = account_trade.groupby(['zhdh', col])['jyje'].sum().reset_index().groupby('zhdh')\
            .apply(lambda x: x.sort_values(by='jyje').iloc[-1][col])\
            .reset_index().rename(columns={0: f'most_jyje_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 0].groupby(['zhdh', col])['jyje'].sum()\
            .reset_index().groupby('zhdh')\
            .apply(lambda x: x.sort_values(by='jyje').iloc[-1][col])\
            .reset_index().rename(columns={0: f'jdbj0_most_jyje_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')
    
    tmp = account_trade[account_trade['jdbj'] == 1].groupby(['zhdh', col])['jyje'].sum()\
            .reset_index().groupby('zhdh')\
            .apply(lambda x: x.sort_values(by='jyje').iloc[-1][col])\
            .reset_index().rename(columns={0: f'jdbj1_most_jyje_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [19]:
for c in cat_cols:
    tmp = (account_trade.groupby([c]).size() / account_trade.shape[0])\
        .reset_index().rename(columns={0: f'{c}_ratio'})
    account_trade = account_trade.merge(tmp, how='left', on=c)

In [20]:
agg_func = {}
for c in cat_cols:
    agg_func[f'{c}_ratio'] = ['sum','mean','max','min','std']
tmp = account_trade.groupby('zhdh').agg(agg_func)
tmp.columns = [f'{c[0]}_{c[1]}' for c in tmp.columns]
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [21]:
for col in ['jyqd']:
    tmp = account_trade.merge(y_train, how='left', on='zhdh').groupby(['black_flag', col])\
        .size().reset_index().rename({0: f'flag_{col}_cnt'},axis=1)
    tmp2 = account_trade.merge(y_train, how='left', on='zhdh').groupby('black_flag').size()\
        .reset_index().rename({0:'flag_cnt'},axis=1)
    tmp = tmp.merge(tmp2, how='left', on='black_flag')
    tmp[f'p({col}|flag)'] = tmp[f'flag_{col}_cnt'] / tmp['flag_cnt']
    for i in range(2):
        tmp3 = account_trade.merge(tmp[tmp['black_flag'] == i][[col, f'p({col}|flag)']], how='left', on=col)\
            .groupby('zhdh').agg({f'p({col}|flag)': ['mean', 'sum', 'std', 'max']})
        tmp3.columns = [f'{c[0]}_{c[1]}_lag={i}' for c in tmp3.columns]
        tmp3 = tmp3.reset_index()
        fea = fea.merge(tmp3, how='left', on='zhdh')

In [22]:
tmp = account_trade.groupby('zhdh').apply(lambda x: (x['dt'].max() - x['dt'].min()).days)\
    .reset_index().rename(columns={0: 'ndays'})
tmp['trade_time_per_day'] = account_trade.groupby('zhdh').size().values / (tmp['ndays'].values+1)
fea = fea.merge(tmp, how='left', on='zhdh')

In [23]:
tmp = account_trade.groupby(['zhdh', 'jyje_label']).size().reset_index()\
    .rename(columns={0: 'cnt'}).pivot(index='zhdh', columns=['jyje_label'], values=['cnt'])
tmp.columns = [f'jyje_label_{i}' for i in range(10)]
tmp = (tmp.T /  account_trade.groupby(['zhdh']).size()).T
fea = fea.merge(tmp, how='left', on='zhdh')

In [24]:
tmp = account_trade.groupby('zhdh')['dt'].min().reset_index().merge(account_static, how='left')
tmp['kh2jy_days'] = (tmp['dt'] - pd.to_datetime(tmp['khrq'])).dt.days
fea = fea.merge(tmp[['zhdh', 'kh2jy_days']], how='left', on='zhdh')

In [25]:
tmp = account_trade.sort_values(by='dt').groupby('zhdh').tail(20).groupby('zhdh')\
    .apply(lambda x: (x['dt'].max() - x['dt'].min()).seconds / 3600).reset_index()\
    .rename(columns={0: 'last20trade_hours'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [26]:
tmp = account_trade.groupby('zhdh').apply(lambda x: x[x['jyrq'] == x['jyrq'].min()].shape[0])\
    .reset_index().rename(columns={0: 'last_day_trade_cnt'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [27]:
tmp = account_trade.groupby(['zhdh', 'jdbj'])['jyje'].sum().reset_index()\
    .pivot(index='zhdh', columns=['jdbj'], values=['jyje'])
tmp.columns = ['jdbj_0', 'jdbj_1']
tmp = tmp.reset_index()
tmp['in_out_diff'] = (tmp['jdbj_0'] - tmp['jdbj_1'])
tmp['in_out_ratio'] = (tmp['jdbj_0'] / tmp['jdbj_1'])
fea = fea.merge(tmp, how='left', on='zhdh')

In [28]:
tmp = account_trade.groupby(['zhdh', 'jyqd'])['jyje'].sum().reset_index()\
    .pivot(index='zhdh', columns=['jyqd'], values=['jyje'])
jyqd_cols = [f'{c[1]}_sum' for c in tmp.columns]
tmp.columns = jyqd_cols
tmp = tmp.reset_index()
tmp = tmp.merge(account_trade.groupby(['zhdh'])['jyje'].sum().reset_index(), how='left', on='zhdh')
for col in jyqd_cols:
    tmp[col.split('_')[0]+'_ratio'] = tmp[col] / tmp['jyje']
tmp = tmp.drop(columns=['jyje'])
fea = fea.merge(tmp, how='left', on='zhdh')

In [29]:
# tmp = account_trade.groupby(['zhdh', 'zydh'])['jyje'].sum().reset_index()\
#     .pivot(index='zhdh', columns=['zydh'], values=['jyje'])
# zydh_cols = [f'{c[1]}_sum' for c in tmp.columns]
# tmp.columns = zydh_cols
# tmp = tmp.reset_index()
# tmp = tmp.merge(account_trade.groupby(['zhdh'])['jyje'].sum().reset_index(), how='left', on='zhdh')
# for col in zydh_cols:
#     tmp[col.split('_')[0]+'_ratio'] = tmp[col] / tmp['jyje']
# tmp = tmp.drop(columns=['jyje'])
# fea = fea.merge(tmp, how='left', on='zhdh')

In [30]:
# tmp = account_trade.merge(account_static[['zhdh', 'khjgdh']], how='left', on='zhdh').groupby('zhdh')\
#     .apply(lambda x: x[x['jyqd'] == x['khjgdh']].shape[0] / x.shape[0])
# tmp = tmp.reset_index().rename(columns={0: 'same_bank_ratio'})
# fea = fea.merge(tmp, how='left', on='zhdh')

In [31]:
with open('w2v_fea.pkl', 'rb') as file:
    w2v_fea = pickle.load(file)
fea = fea.merge(w2v_fea, how='left', on='zhdh')

with open('tfidf_fea.pkl', 'rb') as file:
    tfidf_fea = pickle.load(file)
fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [32]:
with open('w2v_fea_zydh.pkl', 'rb') as file:
    w2v_fea = pickle.load(file)
fea = fea.merge(w2v_fea, how='left', on='zhdh')

with open('tfidf_fea_zydh.pkl', 'rb') as file:
    tfidf_fea = pickle.load(file)
fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [33]:
# with open('w2v_fea_dfzh.pkl', 'rb') as file:
#     w2v_fea = pickle.load(file)
# fea = fea.merge(w2v_fea, how='left', on='zhdh')

# with open('tfidf_fea_dfzh.pkl', 'rb') as file:
#     tfidf_fea = pickle.load(file)
# fea = fea.merge(tfidf_fea, how='left', on='zhdh')

In [34]:
account_static['khrq']  = pd.to_datetime(account_static['khrq'], format='%Y-%m-%d')
account_static['year']  = account_static['khrq'].dt.year
account_static['month'] = account_static['khrq'].dt.month
account_static['day']   = account_static['khrq'].dt.day

In [35]:
df = account_static.merge(fea, how='left', on='zhdh')

In [36]:
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))

for col in ['khrq', 'khjgdh'] + [f'most_{c}' for c in cat_cols] + [f'most_jyje_{c}' for c in cat_cols] +\
    [f'jdbj0_most_{c}' for c in cat_cols] + [f'jdbj1_most_{c}' for c in cat_cols] +\
    [f'jdbj0_most_jyje_{c}' for c in cat_cols] + [f'jdbj1_most_jyje_{c}' for c in cat_cols]:
    df[col] = label_encode(df[col])

In [37]:
train = df[df['zhdh'].isin(y_train['zhdh'].values)]
test_ids = pd.read_csv('./data/test_dataset.csv')['zhdh'].values
test = df[df['zhdh'].isin(test_ids)]

In [38]:
target = 'black_flag'
features = [c for c in train.columns if c not in [target, 'zhdh',]]

In [39]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier

In [40]:
import xgboost as xgb

In [41]:
FOLDS = 5
folds = KFold(n_splits=FOLDS, shuffle=True, random_state=2023)

oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(train)):
    
    x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
    y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
    train_matrix = xgb.DMatrix(x_train , label=y_train)
    valid_matrix = xgb.DMatrix(x_val , label=y_val)
    test_matrix = xgb.DMatrix(test[features])
            
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 1,
              'min_child_weight': 1.5,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.05,
              'tree_method': 'exact',
              'seed': 2023,
              'nthread': 8
              }

    watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

    model = xgb.train(params, train_matrix, num_boost_round=10000, evals=watchlist,
                      verbose_eval=1000, early_stopping_rounds=500)
    val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
    
    oof_pred[val_ind] = val_pred
    y_pred += test_pred / FOLDS

[0]	train-auc:0.92013	eval-auc:0.91974
[581]	train-auc:0.99996	eval-auc:0.96242
[0]	train-auc:0.92208	eval-auc:0.88942




[867]	train-auc:0.99999	eval-auc:0.97076
[0]	train-auc:0.92532	eval-auc:0.90362
[652]	train-auc:0.99998	eval-auc:0.97332
[0]	train-auc:0.91133	eval-auc:0.88103
[627]	train-auc:0.99998	eval-auc:0.97005
[0]	train-auc:0.92291	eval-auc:0.90040
[795]	train-auc:0.99999	eval-auc:0.96387


In [42]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.5) + 0)

0.8651488616462347

In [43]:
oof = oof_pred
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.6,0.01):
    preds = (oof.reshape((-1))>threshold).astype('int')
    m = f1_score(train['black_flag'].values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold
    print(f'{threshold:.02f}, {m}')
print(f'{best_threshold:.02f}, {best_score}')

0.40, 0.9158772262220538
0.41, 0.9158772262220538
0.42, 0.9169163444329682
0.43, 0.9144381773050118
0.44, 0.9154788011695907
0.45, 0.9154788011695907
0.46, 0.9142346634901728
0.47, 0.9140295222620551
0.48, 0.9140295222620551
0.49, 0.9127786873957553
0.50, 0.9115246768592027
0.51, 0.9134041687233176
0.52, 0.9134041687233176
0.53, 0.9155072219141087
0.54, 0.9152993866940751
0.55, 0.9112698913056168
0.56, 0.9123277849809702
0.57, 0.9110473009878431
0.58, 0.9058912102390363
0.59, 0.9058912102390363
0.42, 0.9169163444329682


In [44]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > best_threshold)+0)

In [45]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [50]:
for c in [f'jdbj0_most_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)

for c in [f'jdbj1_most_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)

for c in [f'jdbj0_most_jyje_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)
    
for c in [f'jdbj1_most_jyje_{c}' for c in cat_cols]:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [51]:
FOLDS = 5
folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=2023)

oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(train, train[target])):
    x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
    y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    
    params = {
        'task_type': 'CPU',
        'bootstrap_type': 'Bayesian',
        'boosting_type': 'Plain',
        'learning_rate': 0.01,
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'iterations': 10000,
        'random_state': 42,
        'depth': 6,
        'leaf_estimation_iterations': 8,
        'reg_lambda': 5,
        'early_stopping_rounds': 100,
        'class_weights': class_weights,
        'cat_features': ['khjgdh', 'xb', '年龄'] + [f'most_{c}' for c in cat_cols] + \
                        [f'most_jyje_{c}' for c in cat_cols] +\
                        [f'jdbj0_most_{c}' for c in cat_cols] +\
                        [f'jdbj1_most_{c}' for c in cat_cols] +\
                        [f'jdbj0_most_jyje_{c}' for c in cat_cols] +\
                        [f'jdbj1_most_jyje_{c}' for c in cat_cols],
    }
    model = CatBoostClassifier(**params)
    model.fit(x_train,
               y_train,
               eval_set=(x_val, y_val),
               verbose=100)
    oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]
    y_pred += model.predict_proba(test[features])[:, 1] / FOLDS

0:	learn: 0.6880427	test: 0.6879814	best: 0.6879814 (0)	total: 68.9ms	remaining: 11m 28s
100:	learn: 0.3096384	test: 0.3354541	best: 0.3354541 (100)	total: 5.89s	remaining: 9m 37s
200:	learn: 0.2206138	test: 0.2652263	best: 0.2652263 (200)	total: 10.6s	remaining: 8m 37s
300:	learn: 0.1808502	test: 0.2421781	best: 0.2421781 (300)	total: 15.4s	remaining: 8m 17s
400:	learn: 0.1529521	test: 0.2333431	best: 0.2333431 (400)	total: 20.4s	remaining: 8m 7s
500:	learn: 0.1336148	test: 0.2269591	best: 0.2269591 (500)	total: 25.3s	remaining: 8m
600:	learn: 0.1163591	test: 0.2195371	best: 0.2195371 (600)	total: 30.3s	remaining: 7m 53s
700:	learn: 0.1021003	test: 0.2154583	best: 0.2154583 (700)	total: 35.2s	remaining: 7m 47s
800:	learn: 0.0901286	test: 0.2128777	best: 0.2128777 (800)	total: 40.2s	remaining: 7m 41s
900:	learn: 0.0790381	test: 0.2101774	best: 0.2101235 (899)	total: 45.1s	remaining: 7m 35s
1000:	learn: 0.0690173	test: 0.2084473	best: 0.2082868 (996)	total: 50s	remaining: 7m 29s
1100:	l

In [52]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.5) + 0)

0.8647746243739565

In [53]:
oof = oof_pred
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.6,0.01):
    preds = (oof.reshape((-1))>threshold).astype('int')
    m = f1_score(train['black_flag'].values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold
    print(f'{threshold:.02f}, {m}')
print(f'{best_threshold:.02f}, {best_score}')

0.40, 0.903123891662033
0.41, 0.900737289237177
0.42, 0.9025041285119738
0.43, 0.9013028030003948
0.44, 0.9022943446905485
0.45, 0.9032878468724241
0.46, 0.905280780075188
0.47, 0.9062802349609639
0.48, 0.9072816980694725
0.49, 0.9113078762301413
0.50, 0.9098998052463897
0.51, 0.9094950999790499
0.52, 0.9094950999790499
0.53, 0.9088760536206301
0.54, 0.9109215950296505
0.55, 0.9107155470791835
0.56, 0.9094804175970068
0.57, 0.9094804175970068
0.58, 0.9113300492610837
0.59, 0.9100875508247193
0.58, 0.9113300492610837


In [60]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > 0.50)+0)

In [61]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [62]:
pd.DataFrame(model.feature_importances_, features).sort_values(by=0, ascending=False).to_csv('fi.csv')

In [82]:
test['black_flag'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [83]:
new_train = pd.concat([train, test[(test['black_flag'] > 0.98) | (test['black_flag'] < 0.02)].copy()])
new_test = test[~((test['black_flag'] > 0.98) | (test['black_flag'] < 0.02))].copy()

In [84]:
new_train['black_flag'] = (new_train['black_flag'] > 0) + 0

In [85]:
FOLDS = 5
folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=2023)

oof_pred = np.zeros((len(new_train),))
y_pred = np.zeros((len(new_test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(new_train, new_train[target])):
    x_train, x_val = new_train.iloc[tr_ind][features], new_train.iloc[val_ind][features]
    y_train, y_val = new_train.iloc[tr_ind][target], new_train.iloc[val_ind][target]
    
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    
    params = {
        'task_type': 'CPU',
        'bootstrap_type': 'Bayesian',
        'boosting_type': 'Plain',
        'learning_rate': 0.01,
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'iterations': 10000,
        'random_state': 42,
        'depth': 6,
        'leaf_estimation_iterations': 8,
        'reg_lambda': 5,
        'early_stopping_rounds': 100,
        'class_weights': class_weights,
        'cat_features': ['khjgdh', 'xb', '年龄'] + [f'most_{c}' for c in cat_cols] + \
                        [f'most_jyje_{c}' for c in cat_cols],
    }
    model = CatBoostClassifier(**params)
    model.fit(x_train,
               y_train,
               eval_set=(x_val, y_val),
               verbose=100)
    oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]
    y_pred += model.predict_proba(new_test[features])[:, 1] / FOLDS

0:	learn: 0.6878241	test: 0.6885613	best: 0.6885613 (0)	total: 65.1ms	remaining: 10m 50s
100:	learn: 0.4417971	test: 0.4807257	best: 0.4807257 (100)	total: 4.09s	remaining: 6m 40s
200:	learn: 0.3834371	test: 0.4518828	best: 0.4518828 (200)	total: 7.25s	remaining: 5m 53s
300:	learn: 0.3470120	test: 0.4400088	best: 0.4400088 (300)	total: 9.79s	remaining: 5m 15s
400:	learn: 0.3193859	test: 0.4321788	best: 0.4321496 (399)	total: 12.4s	remaining: 4m 56s
500:	learn: 0.2951583	test: 0.4267661	best: 0.4266982 (499)	total: 15.1s	remaining: 4m 45s
600:	learn: 0.2751183	test: 0.4230531	best: 0.4230531 (600)	total: 17.7s	remaining: 4m 36s
700:	learn: 0.2566864	test: 0.4197406	best: 0.4194264 (687)	total: 20.4s	remaining: 4m 30s
800:	learn: 0.2390636	test: 0.4190372	best: 0.4186072 (792)	total: 23s	remaining: 4m 24s
900:	learn: 0.2198738	test: 0.4158454	best: 0.4158359 (898)	total: 25.7s	remaining: 4m 20s
1000:	learn: 0.2019708	test: 0.4146883	best: 0.4144218 (981)	total: 28.5s	remaining: 4m 15s
11

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3764534016
bestIteration = 1616

Shrink model to first 1617 iterations.
0:	learn: 0.6883180	test: 0.6885580	best: 0.6885580 (0)	total: 39.3ms	remaining: 6m 33s
100:	learn: 0.4491532	test: 0.4859101	best: 0.4859101 (100)	total: 2.78s	remaining: 4m 32s
200:	learn: 0.3884458	test: 0.4518818	best: 0.4518818 (200)	total: 5.54s	remaining: 4m 30s
300:	learn: 0.3556593	test: 0.4401891	best: 0.4401619 (299)	total: 8.28s	remaining: 4m 26s
400:	learn: 0.3293234	test: 0.4315159	best: 0.4315159 (400)	total: 11s	remaining: 4m 23s
500:	learn: 0.3064125	test: 0.4239739	best: 0.4239443 (499)	total: 13.8s	remaining: 4m 20s
600:	learn: 0.2852920	test: 0.4163919	best: 0.4163919 (600)	total: 16.5s	remaining: 4m 18s
700:	learn: 0.2668955	test: 0.4128495	best: 0.4127409 (694)	total: 19.3s	remaining: 4m 16s
800:	learn: 0.2485649	test: 0.4072890	best: 0.4072146 (799)	total: 22.2s	remaining: 4m 14s
900:	learn: 0.2297202	test: 0.4023747	best: 0

In [87]:
from sklearn.metrics import f1_score
f1_score(new_train[target].values, (oof_pred > 0.5) + 0)

0.823696682464455

In [88]:
new_test_result = new_test[['zhdh', 'black_flag']].copy()
new_test_result['black_flag'] = y_pred
new_test_result['black_flag'] = new_test_result['black_flag'].apply(lambda x: (x > 0.52)+0)

In [89]:
new_test_result

Unnamed: 0,zhdh,black_flag
0,DDF394282B1E1508,0
2,41E4A8AECE47E5F3,0
4,6FBFEB03252FDB9F,0
5,4DEA40CF785FA423,0
6,8712DEE79BAE5383,1
...,...,...
5995,0228778D98151DEF,0
5996,FF83E6CFE3916793,0
5997,2947A98F10140EE0,0
5998,6EF9CDFEB8C86119,0


In [94]:
old_test_result = test[(test['black_flag'] > 0.98) | (test['black_flag'] < 0.02)][['zhdh', 'black_flag']].copy()

In [95]:
old_test_result['black_flag'] = old_test_result['black_flag'].apply(lambda x: (x > 0.52)+0)

In [97]:
new_test_result = pd.concat([new_test_result, old_test_result])
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(new_test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [98]:
result

Unnamed: 0,zhdh,black_flag
0,B6751CD225DD4886,0
1,8265CA869E4AFF16,0
2,06DD2A17463919FC,0
3,F36469EA4C5BD7FC,0
4,A18F7ACD7A3853D1,0
...,...,...
4795,5E1C4461BC135745,0
4796,0507C0E6649E637C,0
4797,2CB5C631D0BE5241,0
4798,193FE069CEA9AD06,0


In [52]:
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation

In [60]:
FOLDS = 5
folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=2023)

oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(train, train[target])):
    x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
    y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_val, label=y_val)

    parameters = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'tree_learner':'serial',
        'metric': 'auc',
        'min_child_weight': 4,
        'num_leaves': 64,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.02,
        'seed': 2023,
        'nthread': 32,
        'n_jobs':8,
        'silent': True,
        'verbose': -1,
    }

    model = lgb.train(
        parameters,
        dtrain,
        num_boost_round=8000,
        valid_sets=[dval],
        callbacks=[early_stopping(100), log_evaluation(100)],

    )
    oof_pred[val_ind] = model.predict(x_val)
    y_pred += model.predict(test[features]) / FOLDS

Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.977778
[200]	valid_0's auc: 0.976852
Early stopping, best iteration is:
[144]	valid_0's auc: 0.979167
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.959444
[200]	valid_0's auc: 0.965
[300]	valid_0's auc: 0.969444
[400]	valid_0's auc: 0.973148
[500]	valid_0's auc: 0.973796
[600]	valid_0's auc: 0.972315
Early stopping, best iteration is:
[533]	valid_0's auc: 0.974722
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.905741
[200]	valid_0's auc: 0.921667
[300]	valid_0's auc: 0.923889
Early stopping, best iteration is:
[282]	valid_0's auc: 0.925278
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.968519
Early stopping, best iteration is:
[80]	valid_0's auc: 0.97037
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.964259
[200]	valid_0's auc: 0.967315
[300]	valid_0's

In [61]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.5) + 0)

0.8648648648648648

In [62]:
oof = oof_pred
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.6,0.01):
    preds = (oof.reshape((-1))>threshold).astype('int')
    m = f1_score(train['black_flag'].values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold
    print(f'{threshold:.02f}, {m}')
print(f'{best_threshold:.02f}, {best_score}')

0.40, 0.9102985412158333
0.41, 0.9123638153608021
0.42, 0.9133998533874712
0.43, 0.916322068335067
0.44, 0.9150739850958669
0.45, 0.9161207975418415
0.46, 0.9157133908906958
0.47, 0.9165623696287026
0.48, 0.9150898680694286
0.49, 0.9136048941659953
0.50, 0.9121072291804
0.51, 0.9095351339496823
0.52, 0.9082439299830604
0.53, 0.9043494433668996
0.54, 0.9030442528101401
0.55, 0.9041033316199533
0.56, 0.9041033316199533
0.57, 0.8985800208606975
0.58, 0.8969809015369012
0.59, 0.8956449770149066
0.47, 0.9165623696287026


In [63]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > best_threshold)+0)

In [64]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [6]:
fea = account_trade.groupby('zhdh').size().reset_index().rename(columns={0: 'trade_cnt'})

In [7]:
tmp = account_trade.groupby(['zhdh', 'jdbj']).size().reset_index()\
    .pivot_table(index='zhdh', columns=['jdbj'], values=[0])
tmp.columns = ['jdbj_0_cnt', 'jdbj_1_cnt']
tmp = tmp.reset_index()

In [8]:
fea = fea.merge(tmp, how='left', on='zhdh')
fea['jdbj_0_pct'] = fea['jdbj_0_cnt'] / fea['trade_cnt']
fea['jdbj_1_pct'] = fea['jdbj_1_cnt'] / fea['trade_cnt']

In [9]:
tmp = account_trade.groupby('zhdh')['jyje'].mean().reset_index().rename(columns={'jyje': 'jyje_mean'})
fea = fea.merge(tmp, how='left', on='zhdh')

tmp = account_trade.groupby('zhdh')['jyje'].std().reset_index().rename(columns={'jyje': 'jyje_std'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [10]:
tmp = account_trade.groupby('zhdh')['jyqd'].nunique().reset_index().rename(columns={'jyqd': 'jyqd_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [11]:
tmp = account_trade.groupby('zhdh')['dfzh'].nunique().reset_index().rename(columns={'dfzh': 'dfzh_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [12]:
tmp = account_trade.groupby(['zhdh', 'jdbj'])['jyje'].mean().reset_index()\
    .pivot_table(index='zhdh', columns=['jdbj'], values=['jyje'])
tmp.columns = ['jdbj_0_jyje', 'jdbj_1_jyje']
tmp = tmp.reset_index()
fea = fea.merge(tmp, how='left', on='zhdh')

In [13]:
tmp = account_trade.groupby('zhdh')['dfhh'].nunique().reset_index().rename(columns={'dfhh': 'dfhh_unique_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [14]:
tmp = account_trade.groupby('zhdh')['dfmccd'].mean().reset_index().rename(columns={'dfmccd': 'dfmccd_mean'})
fea = fea.merge(tmp, how='left', on='zhdh')

tmp = account_trade.groupby('zhdh')['dfmccd'].std().reset_index().rename(columns={'dfmccd': 'dfmccd_std'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [15]:
account_trade['jyrq'] = pd.to_datetime(account_trade['jyrq'])

In [16]:
account_trade['dayofweek'] = account_trade['jyrq'].dt.dayofweek
account_trade['is_wknd'] = account_trade['jyrq'].dt.dayofweek // 6
account_trade['is_month_start'] = account_trade['jyrq'].dt.is_month_start.astype(int)
account_trade['is_month_end'] = account_trade['jyrq'].dt.is_month_end.astype(int)

In [17]:
for col in ['dayofweek', 'is_wknd', 'is_month_start', 'is_month_end']:
    tmp = account_trade.groupby('zhdh')[col].mean().reset_index().rename(columns={col: f'{col}_std'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [18]:
account_trade['jy_hour'] = account_trade['jysj'].str[:2].astype(int)
tmp = account_trade.groupby('zhdh')['jy_hour'].mean().reset_index().rename(columns={'jy_hour': 'jy_hour_mean'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [19]:
account_trade['jyje_is_int'] = (account_trade['jyje'] == account_trade['jyje'].astype(int)) + 0
tmp = account_trade.groupby('zhdh')['jyje_is_int'].sum() / account_trade.groupby('zhdh').size()
tmp = tmp.reset_index().rename(columns={0: 'jyje_int_pct'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [20]:
tmp = account_trade.groupby('zhdh')['zydh'].nunique().reset_index().rename(columns={'zydh': 'zydh_nunique'})
fea = fea.merge(tmp, how='left', on='zhdh')

In [21]:
for col in ['dfhh', 'jyqd', 'zydh']:
    tmp = account_trade.groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[0][col])\
        .reset_index().rename(columns={0: f'most_{col}'})
    fea = fea.merge(tmp, how='left', on='zhdh')

In [22]:
df = account_static.merge(fea, how='left', on='zhdh')

In [23]:
y_train = pd.read_csv('./data/训练集标签.csv')

In [24]:
df = df.merge(y_train, how='left', on='zhdh')

In [25]:
train = df[df['zhdh'].isin(y_train['zhdh'].values)]

In [26]:
test_ids = pd.read_csv('./data/test_dataset.csv')['zhdh'].values

In [27]:
test = df[df['zhdh'].isin(test_ids)]

In [28]:
target = 'black_flag'
features = [c for c in train.columns if c not in [target, 'zhdh', 'khrq']]

In [33]:
from sklearn.model_selection import KFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier

In [34]:
FOLDS = 5
folds = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros((len(train),))
y_pred = np.zeros((len(test),))

for fold, (tr_ind, val_ind) in enumerate(folds.split(train)):
    x_train, x_val = train.iloc[tr_ind][features], train.iloc[val_ind][features]
    y_train, y_val = train.iloc[tr_ind][target], train.iloc[val_ind][target]
    
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    
    params = {
        'task_type': 'CPU',
        'bootstrap_type': 'Bayesian',
        'boosting_type': 'Plain',
        'learning_rate': 0.01,
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'iterations': 10000,
        'random_state': 42,
        'depth': 6,
        'leaf_estimation_iterations': 8,
        'reg_lambda': 5,
        'early_stopping_rounds': 100,
        'class_weights': class_weights,
        'cat_features': ['khjgdh', 'xb', '年龄', 'most_dfhh', 'most_jyqd', 'most_zydh'],
    }
    model = CatBoostClassifier(**params)
    model.fit(x_train,
               y_train,
               eval_set=(x_val, y_val),
               verbose=100)
    oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]
    y_pred += model.predict_proba(test[features])[:, 1] / FOLDS

0:	learn: 0.6878285	test: 0.6880691	best: 0.6880691 (0)	total: 68.7ms	remaining: 11m 27s
100:	learn: 0.4164203	test: 0.4695984	best: 0.4695984 (100)	total: 443ms	remaining: 43.4s
200:	learn: 0.3238651	test: 0.4059339	best: 0.4059339 (200)	total: 811ms	remaining: 39.5s
300:	learn: 0.2726680	test: 0.3751292	best: 0.3751292 (300)	total: 1.25s	remaining: 40.3s
400:	learn: 0.2361264	test: 0.3582961	best: 0.3582961 (400)	total: 1.6s	remaining: 38.4s
500:	learn: 0.2070088	test: 0.3449625	best: 0.3449625 (500)	total: 1.95s	remaining: 37s
600:	learn: 0.1836315	test: 0.3390327	best: 0.3390327 (600)	total: 2.3s	remaining: 36s
700:	learn: 0.1630827	test: 0.3338649	best: 0.3338649 (700)	total: 2.65s	remaining: 35.2s
800:	learn: 0.1452078	test: 0.3311609	best: 0.3310738 (796)	total: 3.02s	remaining: 34.7s
900:	learn: 0.1272554	test: 0.3299845	best: 0.3293977 (877)	total: 3.39s	remaining: 34.3s
1000:	learn: 0.1140597	test: 0.3289071	best: 0.3286708 (970)	total: 3.75s	remaining: 33.7s
Stopped by overf

In [42]:
from sklearn.metrics import f1_score
f1_score(train[target].values, (oof_pred > 0.5) + 0)

0.7824620573355818

In [43]:
test_result = test[['zhdh', 'black_flag']].copy()
test_result['black_flag'] = y_pred
test_result['black_flag'] = test_result['black_flag'].apply(lambda x: (x > 0.5)+0)

In [44]:
result = pd.read_csv('./data/test_dataset.csv')
result = result.merge(test_result, how='left', on='zhdh')
result.to_csv('result.csv', index=False)

In [45]:
result[result['black_flag'] == 0].shape, result[result['black_flag'] == 1].shape

((3595, 2), (1205, 2))

In [46]:
train[train['black_flag'] == 0].shape, train[train['black_flag'] == 1].shape,

((900, 30), (300, 30))

In [42]:
pd.DataFrame(model.feature_importances_, features).sort_values(by=0, ascending=False).loc['last_day_trade_cnt']

0    0.271546
Name: last_day_trade_cnt, dtype: float64