In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

# Installs an 'ignore' filter with no category or message restriction, so
# every warning raised after this point (including pandas ones) is hidden.
warnings.filterwarnings('ignore')

In [2]:
# Column position in the raw pipe-delimited dump -> dtype used while parsing.
# The keys are the same positions that get_df passes as `usecols`.
dtype={
    1:'category',
    2:"float32",
    4:"uint16",
    5:"int32",
    6:"int32",
    7:"uint32"
}
def get_df(f, chunksize=1000000):
    """Read a header-less, pipe-delimited file chunk by chunk.

    Parameters
    ----------
    f : str or path-like
        File to read.
    chunksize : int, default 1000000
        Number of rows per chunk.

    Returns
    -------
    list of pandas.DataFrame
        One frame per chunk, in file order. Each frame holds columns
        1, 2, 4, 5, 6 and 7 of the file (labelled by their integer position),
        parsed with the module-level ``dtype`` mapping.
    """
    # sep='|' instead of '\|': the latter is an invalid escape sequence in a
    # normal string literal, and being two characters long it is interpreted by
    # pandas as a regular expression, which forces the much slower Python
    # parsing engine. A single literal '|' splits identically and lets the
    # default C engine do the work.
    reader = pd.read_csv(
        f,
        sep='|',
        header=None,
        usecols=[1, 2, 4, 5, 6, 7],
        chunksize=chunksize,
        dtype=dtype,
    )
    # tqdm only wraps the iterator to show progress; chunks are kept in order.
    return [part for part in tqdm(reader)]
# Raw string: '\c' is not a valid escape sequence, so the backslash of this
# Windows path must not be left to the string-literal parser. The resulting
# path is character-for-character the same as before.
chunk = get_df(r'E:\cp_rawdata_0509.txt')

519it [57:08,  6.61s/it]


In [3]:
# Read the second raw file, then stack its chunks underneath the chunks of the
# first file. `ignore_index` is not passed, so each chunk keeps its own row labels.
chunk2 = get_df('cp_rawdata_0509_2.txt')
df = pd.concat(chunk + chunk2, axis=0)
# The per-chunk lists are not needed once `df` exists.
del chunk, chunk2

1it [00:00,  1.21it/s]


In [4]:
# Descriptive names for the six columns that were loaded by position.
col_names = ['uin', 'kill_time', 'index', 'deltaX', 'deltaY', 'button']
df.columns = col_names
# Re-cast the id column to a single categorical.
# NOTE(review): presumably needed because the chunks were parsed with
# per-chunk categories and concatenation does not keep them as one
# categorical - confirm against the dtype of df['uin'] before this line.
df['uin'] = df['uin'].astype('category')
# Print column dtypes and the memory footprint of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 518489176 entries, 0 to 37799
Data columns (total 6 columns):
 #   Column     Dtype   
---  ------     -----   
 0   uin        category
 1   kill_time  float32 
 2   index      uint16  
 3   deltaX     int32   
 4   deltaY     int32   
 5   button     uint32  
dtypes: category(1), float32(1), int32(2), uint16(1), uint32(1)
memory usage: 14.5 GB


In [2]:
# Load the two id lists. A single column name is assigned to each frame below,
# so each file is expected to contain exactly one column.
# sep='|' instead of '\|': the latter is an invalid escape sequence in a normal
# string literal and, being two characters long, is treated by pandas as a
# regular expression that forces the slower Python parsing engine.
black = pd.read_csv('训练集black_uin.txt', sep='|', header=None)
white = pd.read_csv('训练集white_uin.txt', sep='|', header=None)
black.columns = ['uin']
white.columns = ['uin']
# Inner join on `uin`: the ids that occur in both lists.
black_and_white = pd.merge(black, white, on=['uin'])

In [3]:
# Row count of the frame holding ids found in both lists.
black_and_white.shape[0]

875

In [5]:
# Persist the ids that occur in both lists; the row index is not written.
black_and_white.to_csv(path_or_buf='black_and_white.csv', index=False)

In [5]:
# Label the lists: 1 for `black`, 0 for `white`, 2 for ids present in both.
black['label'] = 1
white['label'] = 0
black_and_white['label'] = 2

# Remove every id that occurs in both lists from each list.
# This replaces `append(...)` + `drop_duplicates(keep=False)`:
#   * DataFrame.append no longer exists in pandas >= 2.0;
#   * keep=False also discarded ids that were merely repeated inside one
#     list, although such ids carry no conflicting label;
#   * re-running the cell appended the label-2 rows to an already filtered
#     list, where they were no longer duplicates and therefore survived.
# Filtering with `isin` gives the same result on a first run over lists
# without repeated ids, and is stable when the cell is executed again.
ambiguous_uin = black_and_white['uin']

black = black[~black['uin'].isin(ambiguous_uin)]
black = black.drop_duplicates(subset=['uin'])

white = white[~white['uin'].isin(ambiguous_uin)]
white = white.drop_duplicates(subset=['uin'])

In [19]:
# Stack both labelled lists, then build a plain uin -> label lookup.
# If a uin occurs more than once, the last occurrence wins.
_labelled = pd.concat((black, white), axis=0)
label_map = {
    uin: label
    for uin, label in zip(_labelled['uin'], _labelled['label'])
}

In [22]:
# One row per distinct uin (first occurrence), keeping only the id column.
df_ = df['uin'].drop_duplicates().to_frame()
# Attach the label looked up from label_map.
df_['label'] = df_['uin'].map(label_map)
# Number of distinct ids per label.
df_['label'].value_counts()

0.0    36036
1.0     1277
Name: label, dtype: int64

In [26]:
# Keep only the ids that received a label and save that id/label table.
df_ = df_[df_['label'].notnull()]
df_.to_csv('label.csv', index=False)

# Ids per class; positives come first in the combined list.
pos_uin = df_.loc[df_['label'] == 1, 'uin'].tolist()
neg_uin = df_.loc[df_['label'] == 0, 'uin'].tolist()
sample_uin = pos_uin + neg_uin

# Restrict the event frame to rows belonging to a labelled id.
df = df.loc[df['uin'].isin(sample_uin)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205183414 entries, 3 to 37799
Data columns (total 6 columns):
 #   Column     Dtype   
---  ------     -----   
 0   uin        category
 1   kill_time  float32 
 2   index      uint16  
 3   deltaX     int32   
 4   deltaY     int32   
 5   button     uint32  
dtypes: category(1), float32(1), int32(2), uint16(1), uint32(1)
memory usage: 5.7 GB


In [28]:
# Order rows by id, then kill_time, then the `index` column (a data column,
# not the frame's row index), and store the result as a pickle.
df = df.sort_values(by=['uin', 'kill_time', 'index'])
df.to_pickle('train_total.pkl')