In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [2]:
dtype={
    1:'category',
    2:"float32",
    3:"float32",
    4:"uint16",
    5:"int32",
    6:"int32",
    7:"uint32",
    8:"float32",
    9:"float32",
    10:"float32",
    11:"float32",
    12:"uint8",
    13:"uint64",
}
def get_df(f):
    files = []
    for chunk in tqdm(pd.read_csv(f, sep='\|', header=None, usecols=range(1, 14), chunksize=1000000, dtype=dtype)):
        files.append(chunk)
    return files
chunk = get_df('cp_rawdata_0512_4.txt')

df = pd.concat(chunk,axis=0)
del chunk

14it [01:58,  8.48s/it]


In [3]:
col_names = ['uin', 'kill_time', 'time_new', 'index', 'deltaX', 'deltaY', 
             'button', 'pitch', 'yaw','pitch_r', 'yaw_r', 'type', 'weapon_id']
df.columns = col_names
df['uin'] = df['uin'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13377187 entries, 0 to 13377186
Data columns (total 13 columns):
 #   Column     Dtype   
---  ------     -----   
 0   uin        category
 1   kill_time  float32 
 2   time_new   float32 
 3   index      uint16  
 4   deltaX     int32   
 5   deltaY     int32   
 6   button     uint32  
 7   pitch      float32 
 8   yaw        float32 
 9   pitch_r    float32 
 10  yaw_r      float32 
 11  type       uint8   
 12  weapon_id  uint64  
dtypes: category(1), float32(6), int32(2), uint16(1), uint32(1), uint64(1), uint8(1)
memory usage: 625.2 MB


In [15]:
d = df.copy()
df['uin'] = df['uin'].astype('str')
df['kill_time'] = df['kill_time'].astype('str')
d['uin_time'] = df['uin'] + df['kill_time']
d.sort_values('uin_time', inplace=True)
for f in tqdm(['deltaX', 'deltaY', 'yaw', 'pitch_r', 'yaw_r']):
    t = d.groupby('uin_time')[f].rolling(window=10, center=True)
    d[f'rolling_mean_{f}']= t.mean().values
    d[f'rolling_max_{f}']= t.max().values
    d[f'rolling_min_{f}']= t.min().values
    d[f'rolling_std_{f}']= t.std().values
    t = d.groupby('uin_time')[f].rolling(window=20, center=True)
    d[f'rolling_mean_{f}_20']= t.mean().values
    d[f'rolling_max_{f}_20']= t.max().values
    d[f'rolling_min_{f}_20']= t.min().values
    d[f'rolling_std_{f}_20']= t.std().values
    t = d.groupby('uin_time')[f].rolling(window=30, center=True)
    d[f'rolling_mean_{f}_30']= t.mean().values
    d[f'rolling_max_{f}_30']= t.max().values
    d[f'rolling_min_{f}_30']= t.min().values
    d[f'rolling_std_{f}_30']= t.std().values
idx_max_list = d.groupby('uin_time')['type'].idxmax()
df_temp = d.loc[idx_max_list]

100%|███████████████████████████████████████████| 5/5 [15:29<00:00, 185.95s/it]


In [23]:
df_temp.drop(['time_new', 'index', 'deltaX', 'deltaY', 'button',
       'pitch', 'yaw', 'pitch_r', 'yaw_r', 'type', 'weapon_id', 'uin_time'], axis=1).to_pickle('rolling_feature_test.pkl')

In [5]:
df['yaw_difference'] = df['yaw'] - df['yaw_r']
df_ = pd.read_csv('测试集玩家名单.txt', sep='\|', header=None)
df_.columns = ['uin']
df_.head()

Unnamed: 0,uin
0,55f56b53f31dab28ca80aceb5a926945
1,4bbcb5adc84ba8d74df305253747f0e8
2,4a4d1876ebf7bf7753570410b839d3dd
3,8a740dbed086c66ae2683474535a09a2
4,54b9aafdb368d69665c6faa46d0e1156


In [6]:
f = 'pitch_r'
df_temp = df.groupby(['uin', 'kill_time'])[f'{f}'].agg([
    (f'{f}_range_diff', 'diff'), 
     ])
df[f'{f}_range_diff'] = df_temp[f'{f}_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', f'{f}_range_diff']]
    df_temp = df__.groupby(['kill_time'])[f'{f}_range_diff'].agg([
    (f'{f}_range_max', 'max'),     
    (f'{f}_range_min', 'min'),
    (f'{f}_range_mean', 'mean'),
    (f'{f}_range_std', 'std'),
    (f'{f}_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df[f'{f}_range_diff']
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:51<00:00, 38.84it/s]


In [7]:
f = 'yaw_r'
df_temp = df.groupby(['uin', 'kill_time'])[f'{f}'].agg([
    (f'{f}_range_diff', 'diff'), 
     ])
df[f'{f}_range_diff'] = df_temp[f'{f}_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', f'{f}_range_diff']]
    df_temp = df__.groupby(['kill_time'])[f'{f}_range_diff'].agg([
    (f'{f}_range_max', 'max'),     
    (f'{f}_range_min', 'min'),
    (f'{f}_range_mean', 'mean'),
    (f'{f}_range_std', 'std'),
    (f'{f}_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df[f'{f}_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 38.45it/s]


In [8]:
f = 'yaw'
df_temp = df.groupby(['uin', 'kill_time'])[f'{f}'].agg([
    (f'{f}_range_diff', 'diff'), 
     ])
df[f'{f}_range_diff'] = df_temp[f'{f}_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', f'{f}_range_diff']]
    df_temp = df__.groupby(['kill_time'])[f'{f}_range_diff'].agg([
    (f'{f}_range_max', 'max'),     
    (f'{f}_range_min', 'min'),
    (f'{f}_range_mean', 'mean'),
    (f'{f}_range_std', 'std'),
    (f'{f}_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df[f'{f}_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:53<00:00, 37.43it/s]


In [9]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'weapon_id']]
    df_temp = df__.groupby(['kill_time'])['weapon_id'].agg([
        ('weapon_id_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    df_temp = pd.merge(df_temp, df__.drop_duplicates('kill_time'), on=['kill_time'], how='left')
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:49<00:00, 40.57it/s]


In [10]:
for f in ['yaw_r', 'pitch_r', 'yaw', 'yaw_difference']:
    agg_list = []
    for uin in tqdm(set(df['uin'])):
        df__ = df[df['uin'] == uin][['uin', 'kill_time', f]]
        df_temp = df__.groupby(['kill_time'])[f].agg([
        (f'{f}_max', 'max'),     
        (f'{f}_min', 'min'),
        (f'{f}_mean', 'mean'),
        (f'{f}_std', 'std'),
        (f'{f}_skew', 'skew'),
            ]).reset_index()
        df_temp['uin'] = uin
        agg_list.append(df_temp)
    df_temp = pd.concat(agg_list, axis=0)
    del agg_list
    df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
    del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:51<00:00, 38.86it/s]
100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 38.26it/s]
100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 38.33it/s]
100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 38.06it/s]


In [11]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'type', 'index']]
    df__.reset_index(drop=True, inplace=True)
    
    df_temp = df__.groupby(['kill_time'])['type'].agg([
        ('type_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:41<00:00, 48.37it/s]


In [12]:
target = 'type'
df_w2v = pd.read_csv('type_w2v.csv')
df = pd.merge(df, df_w2v, on=target, how='left')

agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time'] +['w2v_%s_%d'%(target, x) for x in range(10)]]
    df__.reset_index(drop=True, inplace=True)
    df_temp = df__.drop_duplicates('kill_time')[['kill_time']]
    for i in range(10):
        d = df__.groupby(['kill_time'])['w2v_%s_%d'%(target, i)].agg([
            (f'{target}_w2v_mean_{i}', 'mean'),     
            ])
        df_temp = pd.merge(df_temp, d, on='kill_time', how='left')
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [02:10<00:00, 15.31it/s]


In [13]:
df_['kill_count'] = df.groupby('uin')['kill_time'].transform('nunique')
df_temp = df_.groupby('uin')['kill_time'].agg([
    ('kill_time_max', 'max'),
    ('kill_time_min', 'min'),    
    ('kill_time_range_max', lambda x: x.diff().max()), 
    ('kill_time_range_min', lambda x: x.diff().min()),
    ('kill_time_range_mean', lambda x: x.diff().mean()),
    ('kill_time_range_std', lambda x: x.diff().std()),
    ('kill_time_range_skew', lambda x: x.diff().skew()),
        ])
df_ = pd.merge(df_, df_temp, on='uin', how='left')
del df_temp

In [14]:
df_['kill_time_diff'] = df_['kill_time_max'] - df_['kill_time_min']
df_['kill_time_ratio'] = df_['kill_time_diff'] / df_['kill_count']

In [18]:
df_.columns

Index(['uin', 'kill_time', 'pitch_r_range_max', 'pitch_r_range_min',
       'pitch_r_range_mean', 'pitch_r_range_std', 'pitch_r_range_skew',
       'yaw_r_range_max', 'yaw_r_range_min', 'yaw_r_range_mean',
       'yaw_r_range_std', 'yaw_r_range_skew', 'yaw_range_max', 'yaw_range_min',
       'yaw_range_mean', 'yaw_range_std', 'yaw_range_skew',
       'weapon_id_nunique', 'weapon_id', 'yaw_r_max', 'yaw_r_min',
       'yaw_r_mean', 'yaw_r_std', 'yaw_r_skew', 'pitch_r_max', 'pitch_r_min',
       'pitch_r_mean', 'pitch_r_std', 'pitch_r_skew', 'yaw_max', 'yaw_min',
       'yaw_mean', 'yaw_std', 'yaw_skew', 'yaw_difference_max',
       'yaw_difference_min', 'yaw_difference_mean', 'yaw_difference_std',
       'yaw_difference_skew', 'type_nunique', 'type_w2v_mean_0',
       'type_w2v_mean_1', 'type_w2v_mean_2', 'type_w2v_mean_3',
       'type_w2v_mean_4', 'type_w2v_mean_5', 'type_w2v_mean_6',
       'type_w2v_mean_7', 'type_w2v_mean_8', 'type_w2v_mean_9', 'kill_count',
       'kill_time_max', 

# Feature1

In [19]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaX'].agg([
    ('deltaX_range_diff', 'diff'), 
     ])
df['deltaX_range_diff'] = df_temp['deltaX_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['uin', 'kill_time', 'deltaX_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaX_range_diff'].agg([
    ('deltaX_range_max', 'max'),     
    ('deltaX_range_min', 'min'),
    ('deltaX_range_mean', 'mean'),
    ('deltaX_range_std', 'std'),
    ('deltaX_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaX_range_diff']

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 37.81it/s]


In [27]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaY'].agg([
    ('deltaY_range_diff', 'diff'), 
     ])
df['deltaY_range_diff'] = df_temp['deltaY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaY_range_diff'].agg([
        ('deltaY_range_max', 'max'),     
            ('deltaY_range_min', 'min'),
    ('deltaY_range_mean', 'mean'),
    ('deltaY_range_std', 'std'),
    ('deltaY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaY_range_diff']  

df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 38.03it/s]


In [29]:
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'button']]
    df_temp = df__.groupby(['kill_time'])['button'].agg([
        ('button_nunique', 'nunique'),     
        ])
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:41<00:00, 48.48it/s]


In [36]:
target = 'button'
df_w2v = pd.read_csv('button_w2v.csv')
df = pd.merge(df, df_w2v, on=target, how='left')

agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time'] +['w2v_%s_%d'%(target, x) for x in range(10)]]
    df__.reset_index(drop=True, inplace=True)
    df_temp = df__.drop_duplicates('kill_time')[['kill_time']]
    for i in range(10):
        d = df__.groupby(['kill_time'])['w2v_%s_%d'%(target, i)].agg([
            (f'{target}_w2v_mean_{i}', 'mean'),     
            ])
        df_temp = pd.merge(df_temp, d, on='kill_time', how='left')
    df_temp['uin'] = uin
    agg_list.append(df_temp)
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [02:05<00:00, 15.98it/s]


In [30]:
df['deltaXY'] = df.apply(lambda x: math.sqrt(x['deltaX'] ** 2 + x['deltaY'] ** 2), axis=1)

for f in ['deltaX', 'deltaY', 'deltaXY']:
    agg_list = []
    for uin in tqdm(set(df['uin'])):
        df__ = df[df['uin'] == uin][['uin', 'kill_time', f]]
        df_temp = df__.groupby(['kill_time'])[f].agg([
        (f'{f}_max', 'max'),     
        (f'{f}_min', 'min'),
        (f'{f}_mean', 'mean'),
        (f'{f}_std', 'std'),
        (f'{f}_skew', 'skew'),
            ]).reset_index()
        df_temp['uin'] = uin
        agg_list.append(df_temp)
    df_temp = pd.concat(agg_list, axis=0)
    del agg_list
    df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
    del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:53<00:00, 37.39it/s]
100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 38.17it/s]
100%|██████████████████████████████████████| 2000/2000 [00:51<00:00, 38.88it/s]


In [31]:
df_temp = df.groupby(['uin', 'kill_time'])['deltaXY'].agg([
    ('deltaXY_range_diff', 'diff'), 
     ])
df['deltaXY_range_diff'] = df_temp['deltaXY_range_diff']
agg_list = []
for uin in tqdm(set(df['uin'])):
    df__ = df[df['uin'] == uin][['kill_time', 'deltaXY_range_diff']]
    df_temp = df__.groupby(['kill_time'])['deltaXY_range_diff'].agg([
        ('deltaXY_range_max', 'max'),     
            ('deltaXY_range_min', 'min'),
    ('deltaXY_range_mean', 'mean'),
    ('deltaXY_range_std', 'std'),
    ('deltaXY_range_skew', 'skew'),
        ]).reset_index()
    df_temp['uin'] = uin
    agg_list.append(df_temp)
del df['deltaXY_range_diff']
df_temp = pd.concat(agg_list, axis=0)
del agg_list
df_ = pd.merge(df_, df_temp, on=['uin', 'kill_time'], how='left')
del df_temp

100%|██████████████████████████████████████| 2000/2000 [00:52<00:00, 38.31it/s]


In [33]:
len(df_.columns)

91

In [37]:
df_.to_pickle('test.pkl')