In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [165]:
df = pd.read_pickle('train_total.pkl')
df_ = pd.read_csv('label.csv')
pos_uin = list(df_[df_['label'] == 1]['uin'])
neg_uin = list(df_[df_['label'] == 0]['uin'])
sample_uin = pos_uin + neg_uin[:8728]
df = df[df['uin'].isin(sample_uin)]

In [166]:
f = 'pitch_r'
df_temp = df.groupby(['uin', 'kill_time'])[f'{f}'].agg([
    (f'{f}_range_diff', 'diff'), 
     ])
df[f'{f}_range_diff'] = df_temp[f'{f}_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', f'{f}_range_diff']]
    df_temp = df__.groupby(['kill_time'])[f'{f}_range_diff'].agg([
    (f'{f}_range_max', 'max'),     
    (f'{f}_range_min', 'min'),
    (f'{f}_range_mean', 'mean'),
    (f'{f}_range_std', 'std'),
    (f'{f}_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df[f'{f}_range_diff']
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:17<00:00,  9.64it/s]


In [169]:
f = 'yaw_r'
df_temp = df.groupby(['uin', 'kill_time'])[f'{f}'].agg([
    (f'{f}_range_diff', 'diff'), 
     ])
df[f'{f}_range_diff'] = df_temp[f'{f}_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', f'{f}_range_diff']]
    df_temp = df__.groupby(['kill_time'])[f'{f}_range_diff'].agg([
    (f'{f}_range_max', 'max'),     
    (f'{f}_range_min', 'min'),
    (f'{f}_range_mean', 'mean'),
    (f'{f}_range_std', 'std'),
    (f'{f}_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df[f'{f}_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [16:51<00:00,  9.89it/s]


In [171]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'weapon_id']]
    df_temp = df__.groupby(['kill_time'])['weapon_id'].agg([
        ('weapon_id_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    df_temp = pd.merge(df_temp, df__.drop_duplicates('kill_time'), on=['kill_time'], how='left')
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [16:33<00:00, 10.06it/s]


In [172]:
for f in ['yaw_r', 'pitch_r']:
    agg_list = []
    for uin in tqdm(set(df['uin'])):
        df__ = df[df['uin'] == uin][['uin', 'kill_time', f]]
        df_temp = df__.groupby(['kill_time'])[f].agg([
        (f'{f}_max', 'max'),     
        (f'{f}_min', 'min'),
        (f'{f}_mean', 'mean'),
        (f'{f}_std', 'std'),
        (f'{f}_skew', 'skew'),
            ]).reset_index()
        df_temp['uin'] = uin
        agg_list.append(df_temp)
    df_temp = pd.concat(agg_list, axis=0)
    del agg_list
    df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
    del df_temp

100%|████████████████████████████████████| 10000/10000 [16:19<00:00, 10.21it/s]
100%|████████████████████████████████████| 10000/10000 [17:42<00:00,  9.42it/s]


In [173]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'type', 'index']]
    df__.reset_index(drop=True, inplace=True)
    
    df_temp = df__.groupby(['kill_time'])['type'].agg([
        ('type_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [16:42<00:00,  9.98it/s]


In [174]:
from gensim.models import Word2Vec
import multiprocessing

agg_list = []
target = 'type'
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', target]]
    df__.reset_index(drop=True, inplace=True)
    df__[target] = df__[target].astype('str')
    df__ = df__.groupby('kill_time', as_index=False)[target].agg({'list':(lambda x: list(x))}).reset_index(drop=True)
    agg_list.append(df__)
df_bag = pd.concat(agg_list, axis=0)
doc_list = list(df_bag['list'].values)
w2v = Word2Vec(doc_list, size=10, window=3, min_count=1, workers=multiprocessing.cpu_count())
vocab_keys = list(w2v.wv.vocab.keys())
w2v_array = []
for v in vocab_keys:
    w2v_array.append(list(w2v.wv[v]))
df_w2v = pd.DataFrame()
df_w2v['vocab_keys'] = vocab_keys
df_w2v = pd.concat([df_w2v, pd.DataFrame(w2v_array)], axis=1)
df_w2v.columns = [target] + ['w2v_%s_%d'%(target, x) for x in range(10)]
print('df_w2v:' + str(df_w2v.shape))
del df_bag
df_w2v[target] = df_w2v[target].astype('int8')
df = pd.merge(df, df_w2v, on=target, how='left')

agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time'] +['w2v_%s_%d'%(target, x) for x in range(10)]]
    df__.reset_index(drop=True, inplace=True)
    df_temp = df__.drop_duplicates('kill_time')[['kill_time']]
    for i in range(10):
        d = df__.groupby(['kill_time'])['w2v_%s_%d'%(target, i)].agg([
            (f'{target}_w2v_mean_{i}', 'mean'),     
            ])
        df_temp = pd.merge(df_temp, d, on='kill_time', how='left')
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:25<00:00,  9.56it/s]


df_w2v:(8, 11)


100%|████████████████████████████████████| 10000/10000 [24:10<00:00,  6.89it/s]


In [183]:
df_w2v.to_csv('type_w2v.csv', index=False)

In [180]:
df_temp = df_.groupby('uin')['kill_time'].agg([
        ('kill_time_max', 'max'),
        ('kill_time_min', 'min'),    
        ('kill_time_range_max', lambda x: x.diff().max()), 
        ('kill_time_range_min', lambda x: x.diff().min()),
        ('kill_time_range_mean', lambda x: x.diff().mean()),
        ('kill_time_range_std', lambda x: x.diff().std()),
        ('kill_time_range_skew', lambda x: x.diff().skew()),
        ])
df_ = pd.merge(df_, df_temp, on='uin', how='left')
df_['kill_time_diff'] = df_['kill_time_max'] - df_['kill_time_min']
df_['kill_time_ratio'] = df_['kill_time_diff'] / df_['kill_count']
del df_temp

In [184]:
df['yaw_difference'] = df['yaw'] - df['yaw_r']
f = 'yaw'
df_temp = df.groupby(['uin', 'kill_time'])[f'{f}'].agg([
    (f'{f}_range_diff', 'diff'), 
     ])
df[f'{f}_range_diff'] = df_temp[f'{f}_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', f'{f}_range_diff']]
    df_temp = df__.groupby(['kill_time'])[f'{f}_range_diff'].agg([
    (f'{f}_range_max', 'max'),     
    (f'{f}_range_min', 'min'),
    (f'{f}_range_mean', 'mean'),
    (f'{f}_range_std', 'std'),
    (f'{f}_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df[f'{f}_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [18:03<00:00,  9.23it/s]


In [187]:
df_.columns

Index(['uin', 'label', 'kill_count', 'kill_time', 'pitch_r_range_max',
       'pitch_r_range_min', 'pitch_r_range_mean', 'pitch_r_range_std',
       'pitch_r_range_skew', 'yaw_r_range_max', 'yaw_r_range_min',
       'yaw_r_range_mean', 'yaw_r_range_std', 'yaw_r_range_skew',
       'weapon_id_nunique', 'weapon_id', 'yaw_r_max', 'yaw_r_min',
       'yaw_r_mean', 'yaw_r_std', 'yaw_r_skew', 'pitch_r_max', 'pitch_r_min',
       'pitch_r_mean', 'pitch_r_std', 'pitch_r_skew', 'type_nunique',
       'type_w2v_mean_0', 'type_w2v_mean_1', 'type_w2v_mean_2',
       'type_w2v_mean_3', 'type_w2v_mean_4', 'type_w2v_mean_5',
       'type_w2v_mean_6', 'type_w2v_mean_7', 'type_w2v_mean_8',
       'type_w2v_mean_9', 'kill_time_max', 'kill_time_min',
       'kill_time_range_max', 'kill_time_range_min', 'kill_time_range_mean',
       'kill_time_range_std', 'kill_time_range_skew', 'kill_time_diff',
       'kill_time_ratio', 'yaw_range_max', 'yaw_range_min', 'yaw_range_mean',
       'yaw_range_std', 'yaw_ra

In [186]:
for f in ['yaw', 'yaw_difference']:
    agg_list = []
    for uin in tqdm(set(df['uin'])):
        df__ = df[df['uin'] == uin][['uin', 'kill_time', f]]
        df_temp = df__.groupby(['kill_time'])[f].agg([
        (f'{f}_max', 'max'),     
        (f'{f}_min', 'min'),
        (f'{f}_mean', 'mean'),
        (f'{f}_std', 'std'),
        (f'{f}_skew', 'skew'),
            ]).reset_index()
        df_temp['uin'] = uin
        agg_list.append(df_temp)
    df_temp = pd.concat(agg_list, axis=0)
    del agg_list
    df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
    del df_temp

100%|████████████████████████████████████| 10000/10000 [17:32<00:00,  9.50it/s]
100%|████████████████████████████████████| 10000/10000 [17:48<00:00,  9.36it/s]


In [188]:
df_.to_pickle('train_10000_2.pkl')

In [182]:
import gc
gc.collect()

305