In [1]:
import warnings
warnings.simplefilter('ignore')

import gc
import re
import time

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
pd.set_option('float_format', lambda x: '%.6f' % x)

from IPython.display import display
pd.options.display.max_rows = None

from tqdm import tqdm
tqdm.pandas()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

In [2]:
# 导入数据

train = pd.read_csv('./ds/train1000.csv')
test = pd.read_csv('./ds/test.csv')
df = pd.concat([train, test])
event = pd.read_csv('./ds/event.csv')
df = pd.merge(df, event, on='event_id', how='left')

del train, test, event
gc.collect()

0

In [3]:
# 重命名 columns 区分 event_id 特征
df.rename(columns={'nhit': 'event_id_nhit',
                   'nhitreal': 'event_id_nhitreal',
                   'energymc': 'event_id_energymc',
                   'thetamc': 'event_id_thetamc',
                   'phimc': 'event_id_phimc',
                   'xcmc': 'event_id_xcmc',
                   'ycmc': 'event_id_ycmc'}, inplace=True)
df.head()

              

Unnamed: 0,x,y,z,t,terror,q,flag,event_id,hit_id,event_id_nhit,event_id_nhitreal,event_id_energymc,event_id_thetamc,event_id_phimc,event_id_xcmc,event_id_ycmc
0,-142.5,-147.5,0,767.879,2.02966,1.05052,0.0,7,1,426,70,48348.9,63.1686,11.0982,-40.83,114.03
1,-137.5,-152.5,0,-70.5552,2.02966,0.999853,0.0,7,2,426,70,48348.9,63.1686,11.0982,-40.83,114.03
2,-137.5,-132.5,0,-837.841,1.85146,2.05254,0.0,7,3,426,70,48348.9,63.1686,11.0982,-40.83,114.03
3,-142.5,-117.5,0,-973.195,1.39994,19.5131,0.0,7,4,426,70,48348.9,63.1686,11.0982,-40.83,114.03
4,-137.5,-117.5,0,-159.14,2.02966,0.800334,0.0,7,5,426,70,48348.9,63.1686,11.0982,-40.83,114.03


In [4]:
# t 统计特征
df['event_id_t_min'] = df.groupby('event_id')['t'].transform('min')
df['event_id_t_max'] = df.groupby('event_id')['t'].transform('max')
df['event_id_t_median'] = df.groupby('event_id')['t'].transform('median')
df['event_id_t_mean'] = df.groupby('event_id')['t'].transform('mean')

# t "偏移"时间
df['t_min_diff'] = df['t'] - df['event_id_t_min']
df['t_max_diff'] = df['event_id_t_max'] - df['t']
df['t_median_diff'] = df['event_id_t_median'] - df['t']
df['t_mean_diff'] = df['event_id_t_mean'] - df['t']


In [5]:
                    
# 重新排序, 为后面 t 的 rolling 或者 diff 特征做准备
df = df.sort_values(by=['event_id', 't_min_diff']).reset_index(drop=True)

    

In [6]:
# 时间变化特征, 强特
# 也可以用 rolling 加不同 window_size .std() 来做, 效果比 diff 稍微差一点
# 试过 rolling + diff 效果比只用一种要差, 我还没搞清楚, 可以多尝试

for i in [4, 6, 8, 10, 12]:
    df[f't_diff_last_{i}'] = df.groupby('event_id')['t'].diff(periods=i).fillna(0)

  

In [7]:
# 修正时间, 没太大作用

df['t_minus_terror'] = df['t'] - df['terror']

      

In [8]:
# 位置与中心位置的比例?

df['x_div_xcmc'] = df['x'] / (df['event_id_xcmc'] + 0.01)
df['y_div_ycmc'] = df['y'] / (df['event_id_ycmc'] + 0.01)

   

In [9]:
                    
# 位置的变化特征, 线上 +3 左右

for i in range(1, 21):
    df[f'x_diff_last_{i}'] = df.groupby(['event_id'])['x'].diff(periods=i).fillna(0)
    df[f'y_diff_last_{i}'] = df.groupby(['event_id'])['y'].diff(periods=i).fillna(0)

  

In [10]:
df['x2'] = df['x'] ** 2
df['y2'] = df['y'] ** 2

# 与中心距离的位置变化特征, 线上 +1 左右
df['dis2c'] = ((df['x'] - df['event_id_xcmc'])**2 + (df['y'] - df['event_id_ycmc'])**2)**0.5
for i in range(1, 10):
    df[f'dis2c_diff_last_{i}'] = df.groupby(['event_id'])['dis2c'].diff(periods=i).fillna(0)

  

In [11]:
# 这个特征是比较有用的 event_id 特征

df['event_id_realhit_ratio'] = df['event_id_nhitreal'] / df['event_id_nhit']

  

In [12]:
# freq encoding
# 没太大作用, 线上 +0.1

def freq_enc(df, col):
    vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
    df[f'{col}_freq'] = df[col].map(vc)
    
    return df

df['x_y'] = df['x'].astype('str') + '_' + df['y'].astype('str')
df = freq_enc(df, 'terror')
df = freq_enc(df, 'x_y')


In [13]:
id_and_label = ['event_id', 'hit_id', 'flag']
useless_features = [
    'z', 'x_y',
    'event_id_t_mean', 'event_id_t_median', 'event_id_t_min', 'event_id_t_max',
    'event_id_nhit',
]
use_features = [col for col in df.columns if col not in id_and_label + useless_features]

     

In [14]:
# 伪标签, 还算有点用, 线上 +0.5
# 这个规则是观察 train 数据得出

# t < -900 ==> 0
# t > 1850 ==> 1
# q < 0    ==> 1

test = df[df.flag.isna()]

df.loc[df.flag.isna()&(df.t<-900), 'flag'] = 0
df.loc[df.flag.isna()&((df.t>1850)|(df.q<0)), 'flag'] = 1

train = df[df.flag.notna()]
train['flag'] = train['flag'].astype('int')


In [15]:
del df
gc.collect()

             

60

In [16]:
def run_lgb(df_train, df_test, use_features):
    
    target = 'flag'
    oof_pred = np.zeros((len(df_train), ))
    y_pred = np.zeros((len(df_test), ))
    
    folds = GroupKFold(n_splits=6)  # 6 折比 5 折好一点, 当然有时间有机器可以试下更多的 folds
    for fold, (tr_ind, val_ind) in enumerate(folds.split(train, train[target], train['event_id'])):
        start_time = time.time()
        print(f'Fold {fold + 1}')
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind]
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)
        
        params = {
            'learning_rate': 0.2,
            'metric': 'auc',
            'objective': 'binary',
            'feature_fraction': 0.80,
            'bagging_fraction': 0.75,
            'bagging_freq': 2,
            'n_jobs': -1,
            'seed': 1029,
            'max_depth': 8,
            'num_leaves': 64,
            'lambda_l1': 0.5,
            'lambda_l2': 0.5
        }
        
        model = lgb.train(params, 
                          train_set, 
                          num_boost_round=5000,
                          early_stopping_rounds=100,
                          valid_sets=[train_set, val_set],
                          verbose_eval=100)
        oof_pred[val_ind] = model.predict(x_val)
        y_pred += model.predict(df_test[use_features]) / folds.n_splits
        
        print("Features importance...")
        gain = model.feature_importance('gain')
        feat_imp = pd.DataFrame({'feature': model.feature_name(), 
                         'split': model.feature_importance('split'), 
                         'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
        
        display(feat_imp)
        
        used_time = (time.time() - start_time) / 3600
        print(f'used_time: {used_time:.2f} hours')
        
        del x_train, x_val, y_train, y_val, train_set, val_set
        gc.collect()
        
    return y_pred, oof_pred

  

In [17]:
y_pred, oof_pred = run_lgb(train, test, use_features)


Fold 1
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 1	valid_1's auc: 0.993861
Early stopping, best iteration is:
[41]	training's auc: 1	valid_1's auc: 0.994102
Features importance...


Unnamed: 0,feature,split,gain
11,t_min_diff,17,63.183817
75,event_id_realhit_ratio,4,25.999419
19,t_diff_last_12,14,3.742637
6,event_id_energymc,8,3.607542
14,t_mean_diff,3,0.959288
13,t_median_diff,15,0.657778
20,t_minus_terror,4,0.593059
2,t,15,0.332494
35,x_diff_last_7,13,0.114469
51,x_diff_last_15,7,0.084941


used_time: 0.01 hours
Fold 2
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 1	valid_1's auc: 0.907376
Early stopping, best iteration is:
[1]	training's auc: 0.999382	valid_1's auc: 0.999457
Features importance...


Unnamed: 0,feature,split,gain
11,t_min_diff,1,83.46571
75,event_id_realhit_ratio,1,13.843233
65,dis2c,1,1.678283
19,t_diff_last_12,1,0.841413
12,t_max_diff,1,0.061985
26,y_diff_last_2,1,0.047369
58,y_diff_last_18,1,0.032115
56,y_diff_last_17,1,0.022926
39,x_diff_last_9,1,0.006966
49,x_diff_last_14,0,0.0


used_time: 0.01 hours
Fold 3
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 1	valid_1's auc: 0.999496
Early stopping, best iteration is:
[37]	training's auc: 0.999998	valid_1's auc: 0.999507
Features importance...


Unnamed: 0,feature,split,gain
14,t_mean_diff,5,57.415948
7,event_id_thetamc,2,29.031202
65,dis2c,14,3.922598
12,t_max_diff,9,2.792266
32,y_diff_last_5,5,1.358837
2,t,29,1.358803
50,y_diff_last_14,15,0.482247
20,t_minus_terror,6,0.26142
11,t_min_diff,28,0.256163
51,x_diff_last_15,14,0.195438


used_time: 0.01 hours
Fold 4
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 1	valid_1's auc: 0.999999
Early stopping, best iteration is:
[29]	training's auc: 1	valid_1's auc: 1
Features importance...


Unnamed: 0,feature,split,gain
11,t_min_diff,16,54.671927
75,event_id_realhit_ratio,2,23.841973
6,event_id_energymc,5,6.921796
12,t_max_diff,10,4.836203
65,dis2c,17,2.930458
14,t_mean_diff,16,1.41386
32,y_diff_last_5,4,0.940261
2,t,17,0.519197
13,t_median_diff,5,0.353897
50,y_diff_last_14,18,0.350035


used_time: 0.01 hours
Fold 5
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 1	valid_1's auc: 1
Early stopping, best iteration is:
[1]	training's auc: 0.999347	valid_1's auc: 1
Features importance...


Unnamed: 0,feature,split,gain
11,t_min_diff,1,66.114842
75,event_id_realhit_ratio,1,28.927486
65,dis2c,1,2.238965
12,t_max_diff,2,2.016941
32,y_diff_last_5,1,0.570091
46,y_diff_last_12,1,0.065728
55,x_diff_last_17,1,0.035353
6,event_id_energymc,1,0.024363
45,x_diff_last_12,1,0.006231
51,x_diff_last_15,0,0.0


used_time: 0.01 hours
Fold 6
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 1	valid_1's auc: 1
Early stopping, best iteration is:
[10]	training's auc: 0.999898	valid_1's auc: 1
Features importance...


Unnamed: 0,feature,split,gain
11,t_min_diff,7,62.082937
75,event_id_realhit_ratio,2,26.90371
65,dis2c,9,2.888157
12,t_max_diff,9,2.283987
16,t_diff_last_6,3,1.133829
21,x_div_xcmc,6,1.008554
30,y_diff_last_4,6,0.279345
6,event_id_energymc,4,0.236385
52,y_diff_last_15,6,0.206301
0,x,2,0.190969


used_time: 0.01 hours


In [18]:
score = roc_auc_score(train['flag'], oof_pred) 
print('auc: ', score)


auc:  0.9844082363281363


In [19]:
np.save(f'lgb_y_pred_{score}', y_pred)
np.save(f'lgb_oof_pred_{score}', oof_pred)

        

In [20]:
best_threshold = 0.35

test['flag_pred'] = y_pred
submission = test[['hit_id', 'flag_pred', 'event_id']]
submission['flag_pred'] = submission['flag_pred'].apply(lambda x: 1 if x > best_threshold else 0)
submission = submission.sort_values(by='hit_id')
submission.to_csv(f'submissions/submission_lgb_{score}_threshold_{best_threshold}.csv', index=False)  # 线上 54.600440509
submission.flag_pred.value_counts()

      

FileNotFoundError: [Errno 2] No such file or directory: 'submissions/submission_lgb_0.9844082363281363_threshold_0.35.csv'