In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [2]:
d = pd.read_pickle('train_total.pkl')
df_ = pd.read_csv('label.csv')
pos_uin = list(df_[df_['label'] == 1]['uin'])
neg_uin = list(df_[df_['label'] == 0]['uin'])
sample_uin = neg_uin[8723:18723]
df = d[d['uin'].isin(sample_uin)]
del d

In [3]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaX'].agg([
    ('deltaX_range_diff', 'diff'), 
     ])
df['deltaX_range_diff'] = df_temp['deltaX_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', 'deltaX_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaX_range_diff'].agg([
    ('deltaX_range_max', 'max'),     
    ('deltaX_range_min', 'min'),
    ('deltaX_range_mean', 'mean'),
    ('deltaX_range_std', 'std'),
    ('deltaX_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaX_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [14:10<00:00, 11.76it/s]


In [4]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaY'].agg([
    ('deltaY_range_diff', 'diff'), 
     ])
df['deltaY_range_diff'] = df_temp['deltaY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaY_range_diff'].agg([
        ('deltaY_range_max', 'max'),     
            ('deltaY_range_min', 'min'),
    ('deltaY_range_mean', 'mean'),
    ('deltaY_range_std', 'std'),
    ('deltaY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaY_range_diff']   
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [14:12<00:00, 11.73it/s]


In [5]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'button']]
    df_temp = df__.groupby(['kill_time'])['button'].agg([
        ('button_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [13:23<00:00, 12.45it/s]


In [6]:
target = 'button'
df_w2v = pd.read_csv('button_w2v.csv')
df = pd.merge(df, df_w2v, on=target, how='left')

agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time'] +['w2v_%s_%d'%(target, x) for x in range(10)]]
    df__.reset_index(drop=True, inplace=True)
    df_temp = df__.drop_duplicates('kill_time')[['kill_time']]
    for i in range(10):
        d = df__.groupby(['kill_time'])['w2v_%s_%d'%(target, i)].agg([
            (f'{target}_w2v_mean_{i}', 'mean'),     
            ])
        df_temp = pd.merge(df_temp, d, on='kill_time', how='left')
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [20:36<00:00,  8.09it/s]


In [7]:
df_.to_pickle('train_feature_20000_1.pkl')

In [3]:
deltaX_list = list(df['deltaX'])
deltaY_list = list(df['deltaY'])
XY = [math.sqrt(i ** 2 + j ** 2) for i, j in zip(deltaX_list, deltaY_list)]
df['deltaXY'] = XY
del XY
del deltaX_list
del deltaY_list

In [4]:
for f in ['deltaX', 'deltaY', 'deltaXY']:
    agg_list = []
    for uin in tqdm(set(df['uin'])):
        df__ = df[df['uin'] == uin][['uin', 'kill_time', f]]
        df_temp = df__.groupby(['kill_time'])[f].agg([
        (f'{f}_max', 'max'),     
        (f'{f}_min', 'min'),
        (f'{f}_mean', 'mean'),
        (f'{f}_std', 'std'),
        (f'{f}_skew', 'skew'),
            ]).reset_index()
        df_temp['uin'] = uin
        agg_list.append(df_temp)
        
    df_temp = pd.concat(agg_list, axis=0)
    del agg_list
    if f == 'deltaX':
        df_ = pd.merge(df_, df_temp, on=['uin'], how='left')
    else:
        df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
    del df_temp

100%|████████████████████████████████████| 10000/10000 [15:34<00:00, 10.70it/s]
100%|████████████████████████████████████| 10000/10000 [15:42<00:00, 10.61it/s]
100%|████████████████████████████████████| 10000/10000 [14:52<00:00, 11.21it/s]


In [5]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaXY'].agg([
    ('deltaXY_range_diff', 'diff'), 
     ])
df['deltaXY_range_diff'] = df_temp['deltaXY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaXY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaXY_range_diff'].agg([
        ('deltaXY_range_max', 'max'),     
            ('deltaXY_range_min', 'min'),
    ('deltaXY_range_mean', 'mean'),
    ('deltaXY_range_std', 'std'),
    ('deltaXY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaXY_range_diff']
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [15:15<00:00, 10.93it/s]


In [6]:
df_.to_pickle('train_faeture_20000_1_.pkl')