In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import time
import datetime
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
from scipy import sparse
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings("ignore")



In [3]:
# Read the two tab-separated competition files.
train = pd.read_table('../data/round1_iflyad_train.txt')
test = pd.read_table('../data/round1_iflyad_test_feature.txt')
# Stack both frames row-wise under a fresh 0..n-1 index, then mark every row
# that has no `click` value with the sentinel -1.
data = pd.concat([train, test], axis=0, ignore_index=True)
click = data['click']
data['click'] = click.where(click.notnull(), -1)

### 特征工程-数据清洗、特征构建

In [17]:
import re

# Manufacturer keywords, tried in this order; the first keyword found wins.
# Short keywords therefore shadow longer ones listed after them (anything
# containing 'blephone', 'leeco' or 'lephone' already matches 'le').
_BRAND_KEYWORDS = (
    'apple', 'oppo', 'vivo', 'huawei', 'lenovo', 'zte', 'xiaomi', 'meizu', 'gionee',
    'samsung', 'honor', '360', 'lemobile', 'letv', 'cmdc', 'hisense', 'oneplus',
    'nubia', 'yulong', 'smartisan', 'coolpad', 'doov', 'bbk', 'xiaolajiao', 'le',
    'koobee', 'blephone', 'meitu', 'sprd', 'alps', 'konka', 'leeco', 'sugar',
    'lephone', 'zuk', 'pa', 'htc', 'yufly', 'tcl', 'ipad', 'changhong', 'sony',
    'android', 'sm', 'mha', 'motorola', 'bln', 'vtr', 'asus', '4g', 'ivvi', 'lge',
    'qingcheng', 'qiku', 'kopo', 'saga',
)

# Model families whose names are collapsed to "<family><number>".
_NUMBERED_FAMILIES = ('vivoy', 'vivox', 'vivoxplay', 'oppoa', 'oppor', 'huaweip', 'iphone')

# One CJK ideograph.
_CJK_CHAR = re.compile(u'[\u4E00-\u9FA5]')
# A URL-encoded space or plus sign, including multiply-encoded forms such as
# '%2520' or '%25252b'.
_ENCODED_SPACE_OR_PLUS = re.compile(r'%(?:25)*(?:20|2b)')
# The separators '+', ',', '-', '_' and the token 'mtk'.
# NOTE(review): 'mtk' is assumed to be meant as a token, not as the three
# separate letters m/t/k - confirm with the author.
_SEPARATORS_OR_MTK = re.compile(r'[+,\-_]|mtk')


def _special_symbol(model):
    """Return the first special marker found in a lower-cased model string."""
    for symbol in ('+', '-', '_', ','):
        if symbol in model:
            return symbol
    if _CJK_CHAR.search(model):
        return 'chinese'
    for symbol in ('%2b', '%20', '%2522', '%25'):
        if symbol in model:
            return symbol
    return 'other'


def _canonical_make(make_model):
    """Map a 'make:::model' string to a manufacturer keyword when one matches."""
    parts = make_model.split(':::')
    if len(parts) > 1:
        for brand in _BRAND_KEYWORDS:
            if brand in parts[0] or brand in parts[1]:
                make_model = brand
                break
    # Catch-all substring rules, applied in sequence to whatever is left.
    if make_model[:6] == 'iphone':
        make_model = 'apple'
    if 'mi' in make_model:
        make_model = 'xiaomi'
    if 'phone' in make_model:
        make_model = 'apple'
    if 'ios' in make_model:
        make_model = 'apple'
    if 'mx' in make_model:
        make_model = 'meizu'
    return make_model


def _normalise_model(model):
    """Strip URL-encoding residue, spaces and separators from a model string."""
    model = _ENCODED_SPACE_OR_PLUS.sub('', model)
    model = _CJK_CHAR.sub('chinese', model)
    model = model.replace(' ', '')
    return _SEPARATORS_OR_MTK.sub('', model)


def _collapse_numbered_family(model):
    """Collapse e.g. 'oppoa57' to 'oppoa5' for the known numbered families."""
    # NOTE(review): 'vivoy1' is a substring of 'vivoy10' and 'vivoy11', so the
    # two-digit keys are shadowed by i == 1 and never survive.
    for i in range(12):
        for family in _NUMBERED_FAMILIES:
            key = family + str(i)
            if key in model:
                model = key
    if 'iphone8' in model:
        model = 'iphone8'
    return model


def _digit_osv(osv):
    """Extract the leading version number ('x.y.z', else 'x.y' / 'x') from osv."""
    osv = str(osv).lower()
    match = re.search(r"\d+\.\d+\.\d*", osv) or re.search(r"\d+\.?\d*", osv)
    return match.group(0) if match else 'other'


def process(data):
    """Clean the raw columns, derive cleaned device features and label-encode.

    The frame is modified in place and also returned.

    Steps:
      * fill missing values ('-1' / -1) and lower-case `make` / `model`;
      * `sim_ip`: first special marker found in `model`;
      * `clear_make`: manufacturer keyword from `make` / `model`; values seen
        at most 200 times become 'other';
      * `clear_model`: normalised model name; values seen fewer than 300 times
        lose their last character and are collapsed to a manufacturer keyword
        where possible, then values seen at most 300 times become 'other';
      * `clear_osv`: `os_name` concatenated with the numeric part of `osv`;
      * boolean creative / app flags become 0/1;
      * categorical columns are label-encoded.

    Fixes relative to the previous version:
      * Chinese characters in `model` are detected with a regex; the old
        check looked for the literal pattern text and never matched;
      * URL-encoded sequences and separators are removed as sequences; the
        old character classes deleted every '%', '0', '2', '5', 'b', 'm',
        't', 'k' and, via the accidental range ',-_', every digit, so the
        numbered-family collapsing could never trigger;
      * no more writes into slices of `data`, and the null-fills that could
        never fire (the columns were already filled) are gone.

    Args:
        data: pandas DataFrame holding the columns read below (`make`,
            `model`, `osv`, `os_name`, `app_cate_id`, `app_id`, `user_tags`,
            `f_channel`, the boolean flags and every column that is encoded).

    Returns:
        The same DataFrame, with the derived columns added and the encoded
        columns replaced by integer codes.
    """
    # Missing-value filling.
    for col in ('make', 'model', 'osv', 'user_tags', 'f_channel'):
        data[col] = data[col].fillna(str(-1))
    for col in ('app_cate_id', 'app_id'):
        data[col] = data[col].fillna(-1)

    # Normalise case before any matching.
    data['model'] = data['model'].apply(lambda x: str(x).lower())
    data['make'] = data['make'].apply(lambda x: str(x).lower())

    # Original author's rationale: the strings are URL-encoded, so rows that
    # share a special symbol may be related.
    data['sim_ip'] = data['model'].apply(_special_symbol)

    # Manufacturer clean-up.
    make_model = (data['make'] + ':::' + data['model']).apply(_canonical_make)
    make_counts = make_model.value_counts()
    rare_makes = make_counts[make_counts <= 200].index  # low-frequency values
    data['clear_make'] = make_model.where(~make_model.isin(rare_makes), 'other')
    print('make变量处理完毕')

    # Model clean-up.
    data['clear_model'] = data['model'].apply(_normalise_model).apply(_collapse_numbered_family)
    model_counts = data['clear_model'].value_counts()
    rare = data['clear_model'].isin(model_counts[model_counts < 300].index)
    # Rare names lose their last character so close variants can merge.
    data.loc[rare, 'clear_model'] = data.loc[rare, 'clear_model'].str[:-1]
    for brand in _BRAND_KEYWORDS:
        # Frequencies change as names are merged, so recount on every pass.
        model_counts = data['clear_model'].value_counts()
        rare = data['clear_model'].isin(model_counts[model_counts < 300].index)
        hit = rare & data['clear_model'].str.contains(brand, regex=False)
        data.loc[hit, 'clear_model'] = brand
    model_counts = data['clear_model'].value_counts()
    still_rare = data['clear_model'].isin(model_counts[model_counts <= 300].index)
    data.loc[still_rare, 'clear_model'] = 'other'
    print('model 变量处理完毕')

    # Operating-system version.
    data['clear_osv'] = data['os_name'] + data['osv'].apply(_digit_osv)
    print('操作系统osv处理完毕')

    # Boolean flags -> 0/1.
    for feat in ('creative_is_jump', 'creative_is_download', 'creative_is_js',
                 'creative_is_voicead', 'creative_has_deeplink', 'app_paid'):
        data[feat] = data[feat].replace([False, True], [0, 1])

    # Label-encode the categorical columns (overwrites the original values).
    encoder = ['city', 'province', 'make', 'model', 'osv', 'os_name', 'adid', 'advert_id', 'orderid',
               'advert_industry_inner', 'campaign_id', 'creative_id', 'app_cate_id', 'sim_ip',
               'app_id', 'inner_slot_id', 'advert_name', 'f_channel', 'creative_tp_dnf',
               'clear_model', 'clear_make', 'clear_osv']
    lbl = LabelEncoder()
    for feat in encoder:
        data[feat] = lbl.fit_transform(data[feat])

    return data


### 后续组合特征，历史转化率，广告曝光率特征

In [29]:
def combine_feature_time_feature(data):
    """Add crossed, time-based, historical click-rate and ad-exposure features.

    Columns are first added to the frame that is passed in; the merges further
    down then rebind to new frames, so callers must use the returned frame.

    Features produced:
      * label-encoded crosses of id columns (`meiti_con_inner`, ...);
      * `day`, `hour` (local time of the epoch-seconds `time` column) and
        `period`, a day counter that keeps increasing across the month end;
      * `<feature>_rate`: click rate of each feature value over the periods
        strictly before the row's period (the first period uses itself,
        because no earlier history exists);
      * `adid_<col>_nuq_num` / `<col>_adid_nuq_num`: number of distinct `col`
        values per `adid` and of distinct `adid` values per `col` value.

    Fixes relative to the previous version:
      * crossed keys are joined with '_' so that e.g. (1, 23) and (12, 3) no
        longer collapse to the same string;
      * `period` is built without chained assignment, which is a no-op under
        pandas copy-on-write;
      * rows whose `click` equals the -1 "no label" sentinel are excluded
        from the historical rate instead of being summed as -1;
      * the per-period tables are gathered in a list and concatenated once
        (`DataFrame.append` was removed in pandas 2.0), and the statistics
        use built-in group-by reductions instead of a Python lambda per group.

    Args:
        data: pandas DataFrame holding the columns read below.

    Returns:
        A DataFrame with all feature columns added.
    """
    lbl = LabelEncoder()

    def crossed(*columns):
        # Join the stringified columns with a separator, then encode the result.
        joined = data[columns[0]].astype(str)
        for column in columns[1:]:
            joined = joined + '_' + data[column].astype(str)
        return lbl.fit_transform(joined)

    # Crossed features.
    data['meiti_con_inner'] = crossed('app_id', 'inner_slot_id')
    data['meiti_con_inner_con_channel'] = crossed('app_id', 'inner_slot_id', 'f_channel')
    data['app_cate_con_adid'] = crossed('app_cate_id', 'adid')
    data['app_cate_con_meiti'] = crossed('app_cate_id', 'app_id')
    data['model_con_osv'] = crossed('model', 'osv')
    data['model_con_city'] = crossed('model', 'city')

    # NOTE(review): time.localtime uses the machine's timezone, so `day` and
    # `hour` depend on where the notebook runs - confirm the intended zone.
    data['day'] = data['time'].apply(lambda x: time.localtime(x).tm_mday)
    data['hour'] = data['time'].apply(lambda x: time.localtime(x).tm_hour)

    # Days before the 27th are shifted by 31 so they sort after days 27..31.
    # Presumably the log runs from the 27th of a 31-day month into the start
    # of the next month - TODO confirm against the data.
    first_period, last_period, month_length = 27, 34, 31
    data['period'] = data['day'].where(data['day'] >= first_period, data['day'] + month_length)

    # Historical click rate.
    for feat_1 in ['advert_id', 'advert_industry_inner', 'advert_name', 'campaign_id', 'creative_height',
                   'creative_tp_dnf', 'creative_width', 'province', 'f_channel']:
        gc.collect()
        rate_col = feat_1 + '_rate'
        # Every non-null feature value gets a rate for every period, 0 when
        # there is no history.
        keys = data[feat_1].dropna().unique()
        # Rows carrying the -1 sentinel have no label and must not be counted.
        labelled = data.loc[data['click'] != -1, [feat_1, 'period', 'click']]
        frames = []
        for period in range(first_period, last_period + 1):
            if period == first_period:
                in_history = labelled['period'] <= period
            else:
                in_history = labelled['period'] < period
            clicks = labelled.loc[in_history].groupby(feat_1)['click']
            shown = clicks.count().reindex(keys, fill_value=0)
            clicked = clicks.sum().reindex(keys, fill_value=0)
            # 0/0 for values without history yields NaN, reported as 0.
            rate = (clicked / shown).round(5).fillna(0)
            frames.append(pd.DataFrame({feat_1: keys, rate_col: rate.values, 'period': period}))
        res = pd.concat(frames, ignore_index=True)
        print(feat_1, ' over')
        data = pd.merge(data, res, how='left', on=[feat_1, 'period'])

    # Ad exposure (original author's note: improved the score by about 5e-4).
    adid_nuq = ['model', 'make', 'os', 'city', 'province', 'f_channel', 'app_id', 'carrier', 'nnt',
                'devtype', 'app_cate_id', 'inner_slot_id']
    for fea in tqdm_notebook(adid_nuq):
        per_adid = (data.groupby('adid')[fea].nunique().reset_index()
                    .rename(columns={fea: "adid_%s_nuq_num" % fea}))
        per_value = (data.groupby(fea)['adid'].nunique().reset_index()
                     .rename(columns={'adid': "%s_adid_nuq_num" % fea}))
        data = pd.merge(data, per_adid, how='left', on=['adid'])
        data = pd.merge(data, per_value, how='left', on=[fea])
        gc.collect()

    return data


In [18]:
# Initial pass: clean the raw columns and label-encode the categorical ones.
data = process(data)

make变量处理完毕
model 变量处理完毕
操作系统osv处理完毕


In [30]:
# Follow-up pass: add the crossed, historical click-rate and ad-exposure features.
data = combine_feature_time_feature(data)

advert_id  over
advert_industry_inner  over
advert_name  over
campaign_id  over
creative_height  over
creative_tp_dnf  over
creative_width  over
province  over
f_channel  over


HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [32]:
import pickle
# Persist the intermediate feature matrix so it can be reloaded later
# without recomputing the features.
with open('../data/temp.pkl', 'wb') as file:
    pickle.dump(data, file)

In [33]:
# Reload the feature matrix written by the previous cell.
# NOTE: relies on `pickle` having been imported by that cell. pickle.load can
# execute arbitrary code, so only load files produced by this notebook.
with open('../data/temp.pkl', 'rb') as file:
    data = pickle.load(file)

print(data.head(10))

   adid  advert_id  advert_industry_inner  advert_name  app_cate_id  app_id  \
0   553          1                     17           25            8     233   
1    98          1                     17           25            9      17   
2   190          2                     11           32            1     255   
3  1072         33                     16            0            1     157   
4    10          1                     17           25            4       5   
5   522         12                     21           20            4     162   
6  1536         32                      6            6            1     138   
7   553          1                     17           25            8     233   
8  1072         33                     16            0            1     157   
9  1126          1                     17           25            1     255   

   app_paid  campaign_id  carrier  city             ...              \
0         0            1        1    78             ...    

In [34]:
# Show the first rows of the reloaded feature matrix.
data.head()

Unnamed: 0,adid,advert_id,advert_industry_inner,advert_name,app_cate_id,app_id,app_paid,campaign_id,carrier,city,...,adid_carrier_nuq_num,carrier_adid_nuq_num,adid_nnt_nuq_num,nnt_adid_nuq_num,adid_devtype_nuq_num,devtype_adid_nuq_num,adid_app_cate_id_nuq_num,app_cate_id_adid_nuq_num,adid_inner_slot_id_nuq_num,inner_slot_id_adid_nuq_num
0,553,1,17,25,8,233,0,1,1,78,...,1,2054,3,2032,1,2091,1,674,1,89
1,98,1,17,25,9,17,0,1,3,234,...,3,1248,5,2032,2,2091,1,697,1,55
2,190,2,11,32,1,255,0,0,3,108,...,3,1248,2,2032,1,2091,1,1010,1,65
3,1072,33,16,0,1,157,0,38,0,86,...,1,188,4,2032,1,2091,1,1010,10,5
4,10,1,17,25,4,5,0,1,1,82,...,3,2054,2,812,1,2091,1,822,1,30
