In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append("D:/AnacondaSpace/JupyterSpace/")

import gc

import datetime
import time
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [3]:
DATA_PATH = "G:/RecommdationDataset/AIStudio/antiFake/"

In [4]:
raw_data = pd.read_csv(DATA_PATH + "train.csv", index_col=0)

In [5]:
def reduce_mem_usage(df, use_uint=True, verbose=True):
    """
    节约内存函数
    :param df: 原始从文件中读取出来的df数据
    :param use_uint: 是否使用无符号整型处理数据
    :param verbose: 是否打印输出处理前后内存占用情况
    :return: 处理后的df
    """
    numerics = ['uint8', 'uint16', 'uint32', 'uint64',
                'int8', 'int16', 'int32', 'int64', 
                'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if 'int' in str(col_type):
                if use_uint and c_min >= 0:  # uint类型
                    if c_max <= np.iinfo(np.uint8).max:
                        df[col] = df[col].astype(np.uint8)
                    elif c_max <= np.iinfo(np.uint16).max:
                        df[col] = df[col].astype(np.uint16)
                    elif c_max <= np.iinfo(np.uint32).max:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:  # signed int类型
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    else:
                        df[col] = df[col].astype(np.int64)  
            else:  # float类型
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('StartMem:{:.2f}Mb, EndMem:{:.2f}Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
raw_data = reduce_mem_usage(raw_data, use_uint=False)

StartMem:80.11Mb, EndMem:42.44Mb (47.0% reduction)


## 数据基本情况查看

In [10]:
raw_data.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46016.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46016.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46016.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [11]:
raw_data.shape

(500000, 20)

### 查看正负样本情况

In [15]:
pos_data = raw_data.loc[raw_data.label == 0]
neg_data = raw_data.loc[raw_data.label == 1]

In [16]:
print(pos_data.shape, neg_data.shape)

(257760, 20) (242240, 20)


In [17]:
257760 / (242240 + 257760)

0.51552

### 查看语言情况

In [19]:
pos_data.lan.unique()

array([nan, 'zh-CN', 'zh', 'cn', 'Zh-CN', 'zh_CN', 'zh-cn', 'ZH', 'tw',
       'CN', 'en-GB', 'TW', 'zh_CN_#Hans', 'en', 'zh-HK', 'en-US',
       'zh-TW', 'mi'], dtype=object)

In [18]:
neg_data.lan.unique()

array([nan, 'zh-CN', 'zh-cn', 'cn', 'zh_CN', 'zh', 'Zh-CN', 'tw', 'en',
       'ZH', 'CN', 'zh_CN_#Hans', 'ko', 'zh-MO', 'it', 'ja', 'en-US',
       'zh-TW'], dtype=object)

In [20]:
# 正负样本中语言缺失值数量
neg_na_num = neg_data.loc[neg_data.lan.isna()].shape[0]
pos_na_num = pos_data.loc[pos_data.lan.isna()].shape[0]

In [21]:
neg_na_num

44661

In [22]:
pos_na_num

138619

In [23]:
44661 / 242240

0.18436674372523118

In [24]:
138619 / 257760

0.5377832091868404

### 负样本中语言设为nan的只占18%，而正样本中语言设为nan的则占比超过50%

In [25]:
# 查看系统分布
pos_data.os.unique()

array(['android', 'Android'], dtype=object)

In [26]:
neg_data.os.unique()

array(['android', 'Android'], dtype=object)

In [28]:
pos_data.loc[pos_data.os == "android"].shape[0] / pos_data.shape[0]

0.7018272811918064

In [29]:
neg_data.loc[neg_data.os == "android"].shape[0] / neg_data.shape[0]

0.5047556142668428

## 合并训练数据和测试数据，之后进行LabelEncoder

In [7]:
init_dtype = {"sid":np.str_, "package":np.str_, "version":np.str_, "android_id":np.str_, 
              "media_id":np.str_, "apptype":np.int,  "location":np.int, 
              "fea_hash":np.str_, "fea1_hash":np.str_, "cus_type":np.str_, "ntt":np.int, 
              "carrier":np.str_, "os":np.str_, "osv":np.str_, "lan":np.str_, 
              "dev_height":np.int, "dev_width":np.int, "dev_ppi":np.int}

In [8]:
raw_train = pd.read_csv(DATA_PATH + "train.csv", index_col=0, dtype=init_dtype)

In [9]:
raw_test = pd.read_csv(DATA_PATH + "test1.csv", index_col=0, dtype=init_dtype)

In [10]:
raw_train = reduce_mem_usage(raw_train, use_uint=False)
raw_test = reduce_mem_usage(raw_test, use_uint=False)

StartMem:68.66Mb, EndMem:57.22Mb (16.7% reduction)
StartMem:19.45Mb, EndMem:17.02Mb (12.5% reduction)


In [11]:
raw_test.shape

(150000, 19)

### 为测试集添加标签，其值全部设为-1

In [12]:
raw_test["label"] = -1

In [13]:
raw_train.shape

(500000, 20)

### 合并训练集和测试集

In [14]:
raw_data = raw_train.append(raw_test)

In [15]:
raw_data.shape

(650000, 20)

### 填充nan数据，全部设为"unk"，作为独立的一类标签

In [16]:
raw_data["lan"] = raw_data.lan.fillna("unk")

In [17]:
raw_data["osv"] = raw_data.osv.fillna("unk")

In [9]:
# 验证是否还存在空值
np.any(raw_data.isna())

False

In [19]:
# 查看各特征列数据类型
raw_data.dtypes

android_id     object
apptype         int16
carrier        object
dev_height      int16
dev_ppi         int16
dev_width       int16
label           int64
lan            object
media_id       object
ntt              int8
os             object
osv            object
package        object
sid            object
timestamp     float32
version        object
fea_hash       object
location        int16
fea1_hash      object
cus_type       object
dtype: object

In [71]:
# 查看各特征列
raw_data.columns

Index(['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
       'dev_width', 'label', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package',
       'sid', 'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash',
       'cus_type'],
      dtype='object')

In [20]:
# 是否使用时间特征进行训练
USE_TIME = True

In [21]:
# 时间戳转换为str格式
raw_data["timestamp"] = raw_data["timestamp"].\
    apply(lambda ts: time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(ts / 1000)))

In [24]:
def transTime(df, column="timestamp"):
    """
        将str类型时间转换为datetime类型
    """
    df[column] = list(map(lambda string: datetime.datetime.strptime(string, '%Y-%m-%d %H:%M:%S'), 
                              df[column]))

In [28]:
def getTimeSplit(df):
    """
        分割出时间的各个维度
    """
    df["day"] = list(map(lambda x: x.day, df["timestamp"]))
    df["day"] = df["day"].astype(np.int8)
    df["weekday"] = list(map(lambda x: x.weekday(), df["timestamp"]))
    df["weekday"] = df["weekday"].astype(np.int8)
    df["hour"] = list(map(lambda x: x.hour, df["timestamp"]))
    df["hour"] = df["hour"].astype(np.int8)
    df["minute"] = list(map(lambda x: x.minute, df["timestamp"]))
    df["minute"] = df["minute"].astype(np.int8)
    df["second"] = list(map(lambda x: x.second, df["timestamp"]))
    df["second"] = df["second"].astype(np.int8)

In [25]:
if USE_TIME:
    transTime(raw_data)  # 转换时间戳格式
    getTimeSplit(raw_data)  # 分割出具体时间维度

In [39]:
feat_list = ['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
       'dev_width', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package',
       'version', 'fea_hash', 'location', 'fea1_hash', 'cus_type']
if USE_TIME:
    feat_list.extend(["day", "weekday", "hour", "minute", "second"])

In [40]:
feat_cardi_dict = {}

In [41]:
for feat in feat_list:
    feat_cardi_dict[feat] = raw_data[feat].nunique()
    print(f"{feat}:{feat_cardi_dict[feat]}")

android_id:467958
apptype:89
carrier:5
dev_height:864
dev_ppi:105
dev_width:382
lan:25
media_id:292
ntt:8
os:2
osv:165
package:2102
version:23
fea_hash:509473
location:332
fea1_hash:6147
cus_type:58
day:8
weekday:7
hour:24
minute:60
second:60


In [42]:
def lbeFeats(df, lbe_columns):
    """
    对原始特征进行编码
    如需将编码转换为原始数据可以使用:lbe.inverse_transform(df[column])
    :param df:需要处理的dataframe
    :param lbe_columns:需要进行编码的列, list
    :return:编码的lbe对象字典, dict
    """
    lbe_dict = {}
    # 对每个特征列进行lbe
    for column in lbe_columns:
        lbe = LabelEncoder()
        df[column] = lbe.fit_transform(df[column]).astype(str)
        # 将当前lbe对象进行记录，以便后续恢复原始编码需要
        lbe_dict[column + "_lbe"] = lbe
    return lbe_dict

In [43]:
lbe_dict = lbeFeats(raw_data, feat_list)

In [44]:
lbe_dict

{'android_id_lbe': LabelEncoder(),
 'apptype_lbe': LabelEncoder(),
 'carrier_lbe': LabelEncoder(),
 'dev_height_lbe': LabelEncoder(),
 'dev_ppi_lbe': LabelEncoder(),
 'dev_width_lbe': LabelEncoder(),
 'lan_lbe': LabelEncoder(),
 'media_id_lbe': LabelEncoder(),
 'ntt_lbe': LabelEncoder(),
 'os_lbe': LabelEncoder(),
 'osv_lbe': LabelEncoder(),
 'package_lbe': LabelEncoder(),
 'version_lbe': LabelEncoder(),
 'fea_hash_lbe': LabelEncoder(),
 'location_lbe': LabelEncoder(),
 'fea1_hash_lbe': LabelEncoder(),
 'cus_type_lbe': LabelEncoder(),
 'day_lbe': LabelEncoder(),
 'weekday_lbe': LabelEncoder(),
 'hour_lbe': LabelEncoder(),
 'minute_lbe': LabelEncoder(),
 'second_lbe': LabelEncoder()}

In [45]:
raw_data.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,...,version,fea_hash,location,fea1_hash,cus_type,day,weekday,hour,minute,second
0,159090,81,2,0,0,0,1,15,7,6,...,14,151517,1,2129,37,5,4,7,32,1
1,26571,31,1,0,0,0,1,15,112,6,...,9,236607,2,2952,0,6,5,11,40,40
2,219746,22,1,226,0,22,1,15,194,0,...,1,52632,3,5580,42,4,3,15,59,13
3,432118,49,2,729,0,121,0,15,61,2,...,1,338555,4,473,45,7,6,1,0,12
4,139122,62,2,763,0,121,1,17,212,2,...,10,181787,5,840,36,5,4,0,28,13


In [46]:
columns = [
            'sid',  # 样本id
    
            'android_id',  # 媒体信息，广告位id
            'media_id',  # 媒体信息，媒体id
            'package',  # 媒体信息，包名
            'apptype',  # 媒体信息，app所属分类
            'version',  # 媒体信息，app版本
            
            'fea_hash',  # 用户特征编码
            'fea1_hash',  # 用户特征编码
            'cus_type',  # 用户特征编码
            'location',  # 用户地理位置编码
    
            'carrier',  # 设备信息，运营商
            'dev_height',  # 设备信息，高
            'dev_width',  # 设备信息，宽
            'dev_ppi',  # 设备信息，分辨率
            'lan',  # 设备信息，语言(默认中文)
            'ntt',  # 设备信息，网络类型
            'os',  # 设备信息，操作系统(默认安卓)
            'osv',  # 设备信息，操作系统版本
            
            'timestamp',  # 交互信息，请求到达服务时间，单位ms
            
            'day',
            'weekday',
            'hour',
            'minute', 
            'second',
            
            'label'
        ]

In [47]:
raw_data = raw_data[columns]

In [48]:
raw_data.head()

Unnamed: 0,sid,android_id,media_id,package,apptype,version,fea_hash,fea1_hash,cus_type,location,...,ntt,os,osv,timestamp,day,weekday,hour,minute,second,label
0,1438873,159090,7,817,81,14,151517,2129,37,1,...,6,1,127,2019-06-07 07:32:01,5,4,7,32,1,1
1,1185582,26571,112,0,31,9,236607,2952,0,2,...,6,1,125,2019-06-08 11:40:40,6,5,11,40,40,1
2,1555716,219746,194,0,22,1,52632,5580,42,3,...,0,1,126,2019-06-06 15:59:13,4,3,15,59,13,1
3,1093419,432118,61,0,49,1,338555,473,45,4,...,2,1,126,2019-06-09 01:00:12,7,6,1,0,12,0
4,1400089,139122,212,0,62,10,181787,840,36,5,...,2,1,122,2019-06-07 00:28:13,5,4,0,28,13,1


In [49]:
raw_train = raw_data.loc[raw_data.label >= 0]
raw_test = raw_data.loc[raw_data.label == -1]

In [50]:
raw_train.shape

(500000, 25)

In [51]:
raw_test.shape

(150000, 25)

In [52]:
# 保存数据
if USE_TIME:
    raw_train.to_csv(DATA_PATH + "processData/lbe_raw_train_with_time.csv", index=False)
    raw_test.to_csv(DATA_PATH + "processData/lbe_raw_test_with_time.csv", index=False)
else:
    raw_train.to_csv(DATA_PATH + "processData/lbe_raw_train.csv", index=False)
    raw_test.to_csv(DATA_PATH + "processData/lbe_raw_test.csv", index=False)