In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_pickle('train_16159.pkl')
df_ = pd.read_csv('label.csv')
pos_uin = list(df_[df_['label'] == 1]['uin'])
neg_uin = list(df_[df_['label'] == 0]['uin'])
sample_uin = pos_uin + neg_uin[:8728]
# sample_uin = pos_uin[:50] + neg_uin[:50]
df = df[df['uin'].isin(sample_uin)]

In [24]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaX'].agg([
    ('deltaX_range_diff', 'diff'), 
     ])
df['deltaX_range_diff'] = df_temp['deltaX_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', 'deltaX_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaX_range_diff'].agg([
    ('deltaX_range_max', 'max'),     
    ('deltaX_range_min', 'min'),
    ('deltaX_range_mean', 'mean'),
    ('deltaX_range_std', 'std'),
    ('deltaX_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaX_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:53<00:00,  9.32it/s]


In [25]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaY'].agg([
    ('deltaY_range_diff', 'diff'), 
     ])
df['deltaY_range_diff'] = df_temp['deltaY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaY_range_diff'].agg([
        ('deltaY_range_max', 'max'),     
            ('deltaY_range_min', 'min'),
    ('deltaY_range_mean', 'mean'),
    ('deltaY_range_std', 'std'),
    ('deltaY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaY_range_diff']   
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:27<00:00,  9.55it/s]


In [26]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'button']]
    df_temp = df__.groupby(['kill_time'])['button'].agg([
        ('button_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:05<00:00,  9.76it/s]


In [27]:
from gensim.models import Word2Vec
import multiprocessing

agg_list = []
target = 'button'
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', target]]
    df__.reset_index(drop=True, inplace=True)
    df__[target] = df__[target].astype('str')
    df__ = df__.groupby('kill_time', as_index=False)[target].agg({'list':(lambda x: list(x))}).reset_index(drop=True)
    agg_list.append(df__)
df_bag = pd.concat(agg_list, axis=0)
doc_list = list(df_bag['list'].values)
w2v = Word2Vec(doc_list, size=10, window=3, min_count=1, workers=multiprocessing.cpu_count())
vocab_keys = list(w2v.wv.vocab.keys())
w2v_array = []
for v in vocab_keys:
    w2v_array.append(list(w2v.wv[v]))
df_w2v = pd.DataFrame()
df_w2v['vocab_keys'] = vocab_keys
df_w2v = pd.concat([df_w2v, pd.DataFrame(w2v_array)], axis=1)
df_w2v.columns = [target] + ['w2v_%s_%d'%(target, x) for x in range(10)]
print('df_w2v:' + str(df_w2v.shape))
del df_bag
df_w2v[target] = df_w2v[target].astype('uint32')

df_w2v.to_csv('button_w2v.csv', index=False)
df = pd.merge(df, df_w2v, on=target, how='left')

agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time'] +['w2v_%s_%d'%(target, x) for x in range(10)]]
    df__.reset_index(drop=True, inplace=True)
    df_temp = df__.drop_duplicates('kill_time')[['kill_time']]
    for i in range(10):
        d = df__.groupby(['kill_time'])['w2v_%s_%d'%(target, i)].agg([
            (f'{target}_w2v_mean_{i}', 'mean'),     
            ])
        df_temp = pd.merge(df_temp, d, on='kill_time', how='left')
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:45<00:00,  9.39it/s]


df_w2v:(126, 11)


100%|████████████████████████████████████| 10000/10000 [23:19<00:00,  7.15it/s]


In [30]:
import gc
gc.collect()

220

In [33]:
df.columns

Index(['uin', 'kill_time', 'index', 'deltaX', 'deltaY', 'button',
       'w2v_button_0', 'w2v_button_1', 'w2v_button_2', 'w2v_button_3',
       'w2v_button_4', 'w2v_button_5', 'w2v_button_6', 'w2v_button_7',
       'w2v_button_8', 'w2v_button_9'],
      dtype='object')

In [None]:
df_.to_pickle('train_10000_1.pkl')

In [3]:
df = df[['uin', 'kill_time','deltaX', 'deltaY']]
df['deltaXY'] = df.apply(lambda x: math.sqrt(x['deltaX'] ** 2 + x['deltaY'] ** 2), axis=1)
# df.to_pickle('delta.pkl')
# df = pd.read_pickle('delta.pkl')
# df_ = pd.read_csv('label.csv')

In [8]:
for f in ['deltaX', 'deltaY', 'deltaXY']:
    agg_list = []
    for uin in tqdm(set(df['uin'])):
        df__ = df[df['uin'] == uin][['uin', 'kill_time', f]]
        df_temp = df__.groupby(['kill_time'])[f].agg([
        (f'{f}_max', 'max'),     
        (f'{f}_min', 'min'),
        (f'{f}_mean', 'mean'),
        (f'{f}_std', 'std'),
        (f'{f}_skew', 'skew'),
            ]).reset_index()
        df_temp['uin'] = uin
        agg_list.append(df_temp)
        
    df_temp = pd.concat(agg_list, axis=0)
    del agg_list
    
    df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
    del df_temp

100%|████████████████████████████████████| 10000/10000 [15:15<00:00, 10.93it/s]
100%|████████████████████████████████████| 10000/10000 [16:42<00:00,  9.97it/s]


In [7]:
# df_ = pd.merge(df_, df_temp, on=['uin'], how='left')

In [9]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaXY'].agg([
    ('deltaXY_range_diff', 'diff'), 
     ])
df['deltaXY_range_diff'] = df_temp['deltaXY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaXY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaXY_range_diff'].agg([
        ('deltaXY_range_max', 'max'),     
            ('deltaXY_range_min', 'min'),
    ('deltaXY_range_mean', 'mean'),
    ('deltaXY_range_std', 'std'),
    ('deltaXY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaXY_range_diff']
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|████████████████████████████████████| 10000/10000 [17:29<00:00,  9.53it/s]


In [10]:
df_

Unnamed: 0,uin,label,kill_count,kill_time,deltaX_max,deltaX_min,deltaX_mean,deltaX_std,deltaX_skew,deltaY_max,...,deltaXY_max,deltaXY_min,deltaXY_mean,deltaXY_std,deltaXY_skew,deltaXY_range_max,deltaXY_range_min,deltaXY_range_mean,deltaXY_range_std,deltaXY_range_skew
0,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,7,16785936.0,8.0,-8.0,0.070000,3.726694,0.194140,2.0,...,8.000000,1.0,3.095272,2.159954,0.718872,3.000000,-4.000000,-6.915085e-04,0.997567,-0.355334
1,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,7,16786000.0,22.0,-3.0,4.915000,6.801584,0.631252,2.0,...,22.022716,1.0,6.045591,5.851887,0.819614,12.022716,-11.000000,-6.677796e-03,2.040462,0.089263
2,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,7,16786132.0,9.0,-19.0,-2.153333,5.773680,-0.870158,2.0,...,19.026298,1.0,4.501164,4.306829,1.359319,6.984703,-6.984703,0.000000e+00,1.421998,0.388799
3,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,7,16786148.0,2.0,-11.0,-1.448333,1.882349,-1.445160,1.0,...,11.045361,0.0,1.850479,1.589530,2.189334,3.974293,-4.045361,-1.502504e-02,0.725260,-0.270014
4,0d75e2c05c4893bc3c2a97bbc6e34bb1,0.0,7,16786870.0,6.0,-27.0,-5.773333,8.471022,-0.581121,3.0,...,27.018512,1.0,7.691726,6.864952,0.871326,11.954639,-11.000000,0.000000e+00,2.486763,0.085357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149243,902398fde7a82f69b9abb6b1c81faba6,1.0,5,16785814.0,12.0,-13.0,-0.685000,3.177410,1.492823,3.0,...,13.341664,0.0,2.515693,2.337819,2.392816,4.472136,-5.083934,-2.060378e-02,0.856651,-0.182353
149244,902398fde7a82f69b9abb6b1c81faba6,1.0,5,16785818.0,7.0,-38.0,-3.323333,8.819586,-2.223616,4.0,...,38.118237,0.0,5.261360,7.938500,2.485767,7.053788,-6.102616,3.706922e-19,1.191021,0.207444
149245,0862ea50378cfd91d0455ed9281f494a,0.0,1,,,,,,,,...,,,,,,,,,,
149246,7f41484edd51b379bf2c7deb360935ac,0.0,1,,,,,,,,...,,,,,,,,,,


In [11]:
df_.to_pickle('train_10000_1_.pkl')

In [20]:
# agg_list = []
# target = 'button'
# for uin in tqdm(set(df['uin'])):
#     df__ = df[df['uin'] == uin][['kill_time', target]]
#     df__.reset_index(drop=True, inplace=True)
#     df__[target] = df__[target].astype('str')
#     df__ = df__.groupby('kill_time', as_index=False)[target].agg({'list':(lambda x: list(x))}).reset_index(drop=True)
#     agg_list.append(df__)
# df_bag = pd.concat(agg_list, axis=0)
# doc_list = [' '.join(i) for i in df_bag['list']]
# tfidf_vector = TfidfVectorizer(min_df=1).fit_transform(doc_list)
# df_tfidf = tfidf_vector.todense()
# print('df_tfidf:' + str(df_tfidf.shape))
# tfidf_columns = [f'tfidf_{target}_{i + 1}' for i in range(df_tfidf.shape[1])]
# df_bag[tfidf_columns] = pd.DataFrame(df_tfidf, index=df_bag.index)
# del df_bag['list']

# svd = TruncatedSVD(random_state=2020,
#                    n_components=10)
# df_bag[[
#     f'svd_{i + 1}' for i in range(svd.n_components)
# ]] = pd.DataFrame(svd.fit_transform(
#     tfidf_vector),
#     index=df_bag.index)

100%|████████████████████████████████████████| 100/100 [00:02<00:00, 39.88it/s]


df_tfidf:(2819, 12)
