In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [2]:
df1_ = pd.read_pickle('train_feature_10000_1.pkl')
df1 = pd.read_pickle('train_faeture_10000_1_.pkl')
df2_ = pd.read_pickle('train_feature_20000_1.pkl')
df2 = pd.read_pickle('train_faeture_20000_1_.pkl')
df3 = pd.read_pickle('train_faeture_30000_1.pkl')
df4 = pd.read_pickle('train_faeture_40000_1.pkl')
df1 = pd.concat([df1_, df1.drop(['uin','label', 'kill_time'], axis=1), ], axis=1)
df2 = pd.concat([df2_, df2.drop(['uin','label', 'kill_time'], axis=1), ], axis=1)
df_feature1 = pd.concat([df1, df2, df3, df4], axis=0)
df1 = pd.read_pickle('train_feature_10000_2.pkl')
df2 = pd.read_pickle('train_feature_20000_2.pkl')
df3 = pd.read_pickle('train_feature_30000_2.pkl')
df4 = pd.read_pickle('train_feature_40000_2.pkl')
df_feature2 = pd.concat([df1, df2, df3, df4], axis=0)
df = pd.concat([df_feature1, df_feature2.drop(['uin','label',  'kill_time'], axis=1)], axis=1)
df.dropna(axis=0, subset=['kill_time'], inplace=True)

In [4]:
y = df['label']
X_train = df.copy()
X_train['weapon_id'] =  X_train['weapon_id'] .astype('category')
features = X_train.columns
features = features.drop(['uin','label'])
len(features)

100

In [5]:
X_test = pd.read_pickle('test.pkl')
X_test['weapon_id'] =  X_test['weapon_id'] .astype('category')
KF = GroupKFold(n_splits=5)
params = {
          'objective':'binary',
          'metric':'auc', 
          'learning_rate':0.05, 
          'subsample':0.8, 
          'subsample_freq':3, 
          'colsample_btree':0.8,
          'num_iterations': 10000, 
          'silent':True
}

oof_lgb = np.zeros(len(X_train))
predictions_lgb = np.zeros((len(X_test)))

# 五折交叉验证
for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values, X_train['uin'])):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)
    trn_data = lgb.Dataset(X_train.iloc[trn_idx][features],label=y.iloc[trn_idx])    
    val_data = lgb.Dataset(X_train.iloc[val_idx][features],label=y.iloc[val_idx])
    num_round = 10000
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets = [trn_data, val_data],
        verbose_eval=500,
        early_stopping_rounds=200,  
        categorical_feature=['weapon_id'],    
    )       
    oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(X_test[features], num_iteration=clf.best_iteration) 
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))

fold n°0
trn_idx: [     7      8      9 ... 294806 294807 294808]
val_idx: [     0      1      2 ... 294783 294801 294805]
Training until validation scores don't improve for 200 rounds
[500]	training's auc: 0.99892	valid_1's auc: 0.914525
Early stopping, best iteration is:
[561]	training's auc: 0.999251	valid_1's auc: 0.916082
fold n°1
trn_idx: [     0      1      2 ... 294806 294807 294808]
val_idx: [    67     68     69 ... 294791 294797 294802]
Training until validation scores don't improve for 200 rounds
[500]	training's auc: 0.998762	valid_1's auc: 0.920926
[1000]	training's auc: 0.999978	valid_1's auc: 0.925033
Early stopping, best iteration is:
[987]	training's auc: 0.999976	valid_1's auc: 0.925112
fold n°2
trn_idx: [     0      1      2 ... 294805 294806 294808]
val_idx: [    10     11     12 ... 294796 294800 294807]
Training until validation scores don't improve for 200 rounds
[500]	training's auc: 0.999026	valid_1's auc: 0.898349
Early stopping, best iteration is:
[597]	trai

In [6]:
X_test['oof'] = predictions_lgb / 5
X_test[['uin', 'kill_time', 'oof']].to_pickle('test_oof.pkl')
X_train['oof'] = oof_lgb
X_train.sort_values(['uin', 'oof'], inplace=True)
X_train_pos = X_train[X_train['label'] == 1]
X_train_neg = X_train[X_train['label'] == 0]
X_train_pos.drop_duplicates('uin', keep='last', inplace=True)
X_train_neg.drop_duplicates('uin', keep='first', inplace=True)
X_train = pd.concat([X_train_pos, X_train_neg], axis=0)
X_train.to_pickle('train_37313.pkl')