In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [2]:
d = pd.read_pickle('train_total.pkl')
df_ = pd.read_csv('label.csv')
pos_uin = list(df_[df_['label'] == 1]['uin'])
neg_uin = list(df_[df_['label'] == 0]['uin'])
sample_uin = pos_uin + neg_uin[:8723]
df = d[d['uin'].isin(sample_uin)]
del d

In [3]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaX'].agg([
    ('deltaX_range_diff', 'diff'), 
     ])
df['deltaX_range_diff'] = df_temp['deltaX_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', 'deltaX_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaX_range_diff'].agg([
    ('deltaX_range_max', 'max'),     
    ('deltaX_range_min', 'min'),
    ('deltaX_range_mean', 'mean'),
    ('deltaX_range_std', 'std'),
    ('deltaX_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaX_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [20:15<00:00,  8.23it/s]


In [7]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaY'].agg([
    ('deltaY_range_diff', 'diff'), 
     ])
df['deltaY_range_diff'] = df_temp['deltaY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaY_range_diff'].agg([
        ('deltaY_range_max', 'max'),     
            ('deltaY_range_min', 'min'),
    ('deltaY_range_mean', 'mean'),
    ('deltaY_range_std', 'std'),
    ('deltaY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaY_range_diff']   
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [18:47<00:00,  8.87it/s]


In [8]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'button']]
    df_temp = df__.groupby(['kill_time'])['button'].agg([
        ('button_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:59<00:00,  9.26it/s]


In [9]:
from gensim.models import Word2Vec
import multiprocessing

agg_list = []
target = 'button'
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', target]]
    df__.reset_index(drop=True, inplace=True)
    df__[target] = df__[target].astype('str')
    df__ = df__.groupby('kill_time', as_index=False)[target].agg({'list':(lambda x: list(x))}).reset_index(drop=True)
    agg_list.append(df__)
df_bag = pd.concat(agg_list, axis=0)
doc_list = list(df_bag['list'].values)
w2v = Word2Vec(doc_list, size=10, window=3, min_count=1, workers=multiprocessing.cpu_count())
vocab_keys = list(w2v.wv.vocab.keys())
w2v_array = []
for v in vocab_keys:
    w2v_array.append(list(w2v.wv[v]))
df_w2v = pd.DataFrame()
df_w2v['vocab_keys'] = vocab_keys
df_w2v = pd.concat([df_w2v, pd.DataFrame(w2v_array)], axis=1)
df_w2v.columns = [target] + ['w2v_%s_%d'%(target, x) for x in range(10)]
print('df_w2v:' + str(df_w2v.shape))
del df_bag
df_w2v[target] = df_w2v[target].astype('uint32')

df_w2v.to_csv('button_w2v.csv', index=False)
df = pd.merge(df, df_w2v, on=target, how='left')

agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time'] +['w2v_%s_%d'%(target, x) for x in range(10)]]
    df__.reset_index(drop=True, inplace=True)
    df_temp = df__.drop_duplicates('kill_time')[['kill_time']]
    for i in range(10):
        d = df__.groupby(['kill_time'])['w2v_%s_%d'%(target, i)].agg([
            (f'{target}_w2v_mean_{i}', 'mean'),     
            ])
        df_temp = pd.merge(df_temp, d, on='kill_time', how='left')
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [18:24<00:00,  9.06it/s]


df_w2v:(126, 11)


100%|████████████████████████████████████| 10000/10000 [25:00<00:00,  6.66it/s]


In [10]:
df_.to_pickle('train_feature_10000_1.pkl')

In [4]:
deltaX_list = list(df['deltaX'])
deltaY_list = list(df['deltaY'])
XY = [math.sqrt(i ** 2 + j ** 2) for i, j in zip(deltaX_list, deltaY_list)]
df['deltaXY'] = XY
del XY
del deltaX_list
del deltaY_list

In [3]:
df = df[['uin', 'kill_time','deltaX', 'deltaY']]
df['deltaXY'] = df.apply(lambda x: math.sqrt(x['deltaX'] ** 2 + x['deltaY'] ** 2), axis=1)
# df.to_pickle('delta.pkl')
# df = pd.read_pickle('delta.pkl')
# df_ = pd.read_csv('label.csv')

In [9]:
for f in ['deltaX', 'deltaY', 'deltaXY']:
    agg_list = []
    for uin in tqdm(set(df['uin'])):
        df__ = df[df['uin'] == uin][['uin', 'kill_time', f]]
        df_temp = df__.groupby(['kill_time'])[f].agg([
        (f'{f}_max', 'max'),     
        (f'{f}_min', 'min'),
        (f'{f}_mean', 'mean'),
        (f'{f}_std', 'std'),
        (f'{f}_skew', 'skew'),
            ]).reset_index()
        df_temp['uin'] = uin
        agg_list.append(df_temp)
        
    df_temp = pd.concat(agg_list, axis=0)
    del agg_list
    if f == 'deltaX':
        df_ = pd.merge(df_, df_temp, on=['uin'], how='left')
    else:
        df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
    del df_temp

100%|████████████████████████████████████| 10000/10000 [19:28<00:00,  8.56it/s]
100%|████████████████████████████████████| 10000/10000 [20:48<00:00,  8.01it/s]
100%|████████████████████████████████████| 10000/10000 [19:34<00:00,  8.51it/s]


In [7]:
# df_ = pd.merge(df_, df_temp, on=['uin'], how='left')

In [10]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaXY'].agg([
    ('deltaXY_range_diff', 'diff'), 
     ])
df['deltaXY_range_diff'] = df_temp['deltaXY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaXY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaXY_range_diff'].agg([
        ('deltaXY_range_max', 'max'),     
            ('deltaXY_range_min', 'min'),
    ('deltaXY_range_mean', 'mean'),
    ('deltaXY_range_std', 'std'),
    ('deltaXY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaXY_range_diff']
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [20:47<00:00,  8.02it/s]


In [11]:
df_

Unnamed: 0,uin,label,kill_time,deltaX_max,deltaX_min,deltaX_mean,deltaX_std,deltaX_skew,deltaY_max,deltaY_min,...,deltaXY_max,deltaXY_min,deltaXY_mean,deltaXY_std,deltaXY_skew,deltaXY_range_max,deltaXY_range_min,deltaXY_range_mean,deltaXY_range_std,deltaXY_range_skew
0,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,16785936.0,8.0,-8.0,0.070000,3.726694,0.194140,2.0,-1.0,...,8.000000,1.0,3.095272,2.159954,0.718872,3.000000,-4.000000,-0.000692,0.997567,-0.355334
1,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,16786000.0,22.0,-3.0,4.915000,6.801584,0.631252,2.0,-1.0,...,22.022716,1.0,6.045591,5.851887,0.819614,12.022716,-11.000000,-0.006678,2.040462,0.089263
2,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,16786132.0,9.0,-19.0,-2.153333,5.773680,-0.870158,2.0,-2.0,...,19.026298,1.0,4.501164,4.306829,1.359319,6.984703,-6.984703,0.000000,1.421998,0.388799
3,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,16786148.0,2.0,-11.0,-1.448333,1.882349,-1.445160,1.0,-1.0,...,11.045361,0.0,1.850479,1.589530,2.189334,3.974293,-4.045361,-0.015025,0.725260,-0.270014
4,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,16786870.0,6.0,-27.0,-5.773333,8.471022,-0.581121,3.0,-3.0,...,27.018512,1.0,7.691726,6.864952,0.871326,11.954639,-11.000000,0.000000,2.486763,0.085357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148597,e21e02b0e4057a83fa287cefc22fa683,1.0,16778162.0,42.0,-30.0,0.271667,9.769012,1.408972,9.0,-7.0,...,42.190046,0.0,6.419203,7.692798,2.273293,14.000000,-7.833630,-0.018980,2.050644,1.059435
148598,e21e02b0e4057a83fa287cefc22fa683,1.0,16778272.0,34.0,-69.0,-4.548333,19.848617,-1.471745,14.0,-11.0,...,69.000000,1.0,13.427241,15.559918,1.922002,32.132096,-32.585786,0.011044,6.289815,-0.020097
148599,e21e02b0e4057a83fa287cefc22fa683,1.0,16778278.0,34.0,-68.0,-1.916667,14.797342,-1.215938,17.0,-16.0,...,68.000000,1.0,10.772141,10.952134,2.132990,32.132096,-32.585786,-0.065109,6.007478,-0.108043
148600,e21e02b0e4057a83fa287cefc22fa683,1.0,16778280.0,22.0,-19.0,-0.068333,3.520083,1.908271,17.0,-16.0,...,26.076810,0.0,3.557309,3.918905,2.702923,23.076810,-23.076810,-0.001940,3.301316,0.423794


In [12]:
df_.to_pickle('train_faeture_10000_1_.pkl')