In [5]:
import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
import lightgbm as lgb   # boosting的升级版 xgboostd 的运行速度会慢一点
import time
import pandas as pd
import numpy as np
import sys

ModuleNotFoundError: No module named 'pandas'

#### 1、导入数据

In [2]:
# Read the tab-separated training and test files from the working directory.
train_path = './round1_iflyad_train.txt'
test_path = './round1_iflyad_test_feature.txt'
train = pd.read_csv(train_path, sep='\t')
test = pd.read_csv(test_path, sep='\t')

#### 2、数据探索

In [3]:
# train.head(5)
# train.info()
train.isnull().sum()

instance_id                   0
time                          0
city                          0
province                      0
user_tags                309770
carrier                       0
devtype                       0
make                      98917
model                      7402
nnt                           0
os                            0
osv                        7772
os_name                       0
adid                          0
advert_id                     0
orderid                       0
advert_industry_inner         0
campaign_id                   0
creative_id                   0
creative_tp_dnf               0
app_cate_id                2267
f_channel                925260
app_id                     2267
inner_slot_id                 0
creative_type                 0
creative_width                0
creative_height               0
creative_is_jump              0
creative_is_download          0
creative_is_js                0
creative_is_voicead           0
creative

* 不平衡处理

In [4]:
train['click'].value_counts()

0    802863
1    198787
Name: click, dtype: int64

In [5]:
# Separate the target column from the predictors for the oversampling demo.
y = train['click']
x = train.drop(columns='click')

* --------学习过采样，无需运行-------------------

In [6]:
from imblearn.over_sampling import RandomOverSampler  # 过采样

`random_state=0`：随机种子，用来固定随机数。固定之后，每次运行代码都能复现同样的采样结果和整体效果；如果不固定，每次的随机结果不同，模型效果可能出现波动（例如不如之前）。

In [7]:
# sampling_strategy=0.4: oversample the minority class until minority/majority is 0.4
# (see the resampled counts printed below); random_state=0 fixes the random draw.
ros = RandomOverSampler(sampling_strategy=0.4,random_state=0)

In [8]:
X_resampled, y_resampled = ros.fit_resample(x, y)  # "resample": draw the samples again

In [36]:
X_resampled.shape

(1124008, 34)

In [38]:
y.value_counts()

0    802863
1    198787
Name: click, dtype: int64

In [40]:
y_resampled.value_counts()

0    802863
1    321145
Name: click, dtype: int64

In [None]:
321145/802863

* --------学习过采样，无需运行-------------------

* 合并

In [None]:
# Set the target aside, then stack train on top of test so that every
# preprocessing step below is applied to both parts in one pass.
click = train['click']
train = train.drop(columns='click')
df = pd.concat([train, test], ignore_index=True)

#### 3、数据预处理

* 缺失值处理

In [None]:
# Common competition shortcut for missing values: replace every missing
# entry, in every column, with the sentinel value -1.
df = df.fillna(-1)

* 时间提取

In [None]:
# Derive day-of-month and hour-of-day from the Unix timestamp in `time`.
# time.localtime converts using the timezone of the machine running the notebook.
df['day'] = df['time'].apply(lambda ts: time.localtime(ts).tm_mday)
df['hour'] = df['time'].apply(lambda ts: time.localtime(ts).tm_hour)

* 布尔型数据转换

In [None]:
# Collect the names of all boolean-typed columns.
bool_feature = [col for col in df.columns if df[col].dtype == 'bool']

# Cast each boolean column to integer 0/1.
# Plain `int` resolves to NumPy's default integer type, whose width depends
# on the platform, so the result is not guaranteed to be int32.
for col in bool_feature:
    df[col] = df[col].astype(int)

* advert_industry_inner特征提取

In [None]:
# Keep only the part of `advert_industry_inner` before the first underscore.
df['advert_industry_inner_1'] = df['advert_industry_inner'].map(lambda value: value.partition('_')[0])

* 将广告相关特征放到一个列表

In [None]:
ad_cate_feature = ['adid', 'advert_id', 'orderid', 'advert_industry_inner_1', 'advert_industry_inner', 'advert_name',
                   'campaign_id', 'creative_id', 'creative_type', 'creative_tp_dnf', 'creative_has_deeplink',
                   'creative_is_jump', 'creative_is_download']

In [None]:
# 此特征仅有一个元素，方差为0。无效特征
df['creative_is_js'].value_counts()

In [None]:
# creative_height  ，creative_width为数值型特征，不放入列表中。

* 将媒体相关特征放入列表

In [None]:
media_cate_feature = ['app_cate_id', 'f_channel', 'app_id', 'inner_slot_id']

In [None]:
# 此特征仅有一个元素，方差为0。无效特征
df['app_paid'].value_counts()

* 上下文特征（用户信息） 

In [None]:
content_cate_feature = ['city', 'carrier', 'province', 'nnt', 'devtype', 'osv', 'os', 'make', 'model']

In [None]:
# 与os完全重合了，所以不纳入列表中
df['os_name'].value_counts()

In [None]:
df['os'].value_counts()

* 列表合并

In [None]:
origin_cate_list = ad_cate_feature + media_cate_feature + content_cate_feature

* 将分类特征labelencode：

In [None]:
# Label-encode every categorical column: its distinct values are numbered
# 0, 1, 2, ... in order of first appearance.
for col in origin_cate_list:
    distinct_values = df[col].unique()
    code_of = dict(zip(distinct_values, range(df[col].nunique())))
    df[col] = df[col].map(code_of)

In [None]:
df['os_name'].map(dict(zip(df['os_name'].unique(), range(0, df['os_name'].nunique())))).value_counts()

In [None]:
pd.DataFrame(df['os_name'].value_counts()).reset_index()

In [None]:
# 查看目前特征数量
df.head()

#### 4、特征工程

In [None]:
count_feature_list = []
def feature_count(data, features=[], is_feature=True):
    """Add a frequency-count column for one feature or a feature combination.

    The new column is named 'count_<f1>_<f2>...' (an 'add_' fragment inside a
    feature name is dropped from the column name) and holds, for every row,
    how many rows of `data` share the same values in `features`.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a merged copy is returned.
    features : list of str
        Columns to group by. The default list is never mutated.
    is_feature : bool
        When True, the new column name is recorded in the module-level
        `count_feature_list`.

    Returns
    -------
    pandas.DataFrame
        `data` with the count column appended, or `data` unchanged when the
        request is rejected: empty or repeated `features`, a column of that
        name already present, or a cross feature that has no more distinct
        combinations than its most fine-grained single column.
    """
    features = list(features)

    ### Block 1: reject empty or repeated feature lists ###
    if not features:
        print('empty feature list !!!!')
        return data
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data

    ### Block 2: build the name of the new column ###
    new_feature = 'count_' + '_'.join(f.replace('add_', '') for f in features)

    # Calling twice with the same features would otherwise merge a second
    # copy of the column (pandas would rename them with _x/_y suffixes) and
    # register the name twice.
    if new_feature in data.columns:
        print(new_feature, 'already exists, skipped')
        return data

    ### Block 3: reject cross features that add no new categories ###
    nunique = [data[f].nunique() for f in features]
    if len(features) > 1 and len(data[features].drop_duplicates()) <= max(nunique):
        print(new_feature, 'is unvalid cross feature:')
        return data

    ### Block 4 (core): count rows per group and attach the count to each row ###
    temp = data.groupby(features).size().reset_index(name=new_feature)
    data = data.merge(temp, 'left', on=features)

    ### Block 5: register the column and rescale the day-3 counts ###
    if is_feature and new_feature not in count_feature_list:
        count_feature_list.append(new_feature)

    if 'day_' in new_feature:
        print('fix:', new_feature)
        # Rows with day == 3 get their count multiplied by 4.
        day3_rows = data.day == 3
        data.loc[day3_rows, new_feature] = data.loc[day3_rows, new_feature] * 4

    return data

In [None]:
# For every categorical column with more than 5 distinct values, add its
# overall frequency count and its frequency count within each (day, hour).
for col in origin_cate_list:
    if df[col].nunique() <= 5:
        continue
    df = feature_count(df, [col])
    df = feature_count(df, ['day', 'hour', col])

* ----------------------理解以上代码，不执行----------------------------

In [None]:
origin_cate_list

* 拆解1：把函数参数列好

In [None]:
# Stand-ins for the arguments of feature_count, so that its body can be
# stepped through one cell at a time.
features = ['day', 'hour', 'creative_id']   # cross-feature example
# Single-feature example read by the later walkthrough cells, which refer to
# `features2` without ever defining it.
# NOTE(review): presumably ['creative_id'] was intended - confirm.
features2 = ['creative_id']
data = df.copy()
is_feature = True
n = df[features].nunique()


* （1）查看第一个函数逻辑块

In [None]:
# n相当于拿该特征的独立元素个数。
n = data[features[0]].nunique()

# 如果n>5，并且features里有重复值，则不去操作
if n > 5:
    if len(set(features)) != len(features):  
        print('equal feature !!!!')

* （2）查看第二个函数逻辑块

In [None]:
# 改变特征名
new_feature = 'count' 
nunique = []
for i in features:
    nunique.append(data[i].nunique())
    new_feature += '_' + i.replace('add_', '')

In [None]:
new_feature

In [None]:
# 改变特征名
new_feature = 'count' 
nunique = []
for i in features2:
    nunique.append(data[i].nunique())
    new_feature += '_' + i.replace('add_', '')

In [None]:
new_feature

* （3）查看第三个函数逻辑块

In [None]:
# The cross feature is treated as invalid (and skipped) when BOTH conditions hold:
#   1. `features` contains more than one column, and
#   2. the number of distinct value combinations of those columns does not
#      exceed the largest distinct-value count of any single one of them.
if len(features) > 1 and len(data[features].drop_duplicates()) <= np.max(nunique):
    print(new_feature, 'is unvalid cross feature:')

* （4）【核心】查看第四个函数逻辑块

In [None]:
temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})

data = data.merge(temp, 'left', on=features)

In [None]:
data.groupby(features2).size().reset_index().rename(columns={0: new_feature})

In [None]:
data['creative_id'].value_counts()

* **其实这个函数的核心就是：对传入的特征（或特征组合）做一个新的计数统计特征，仅此而已；外层循环再用它遍历所有特征。**
* **要注意，如果输入的是 ['day', 'hour', 'creative_id']，相当于是把这三列组合（交叉）起来之后再计算计数特征。**

* （5）查看第五个函数逻辑块

In [None]:
# Walkthrough of block 5. A separate demo list is used here so that running
# this cell does not wipe the real `count_feature_list` filled by feature_count.
demo_count_feature_list = []
if is_feature:
    demo_count_feature_list.append(new_feature)

# For a column name such as 'count_day_hour_app_id' (it contains 'day_'):
# print it, then multiply the counts of the rows with day == 3 by 4.
if 'day_' in new_feature:
    print('fix:', new_feature)
    day3_rows = data.day == 3
    data.loc[day3_rows, new_feature] = data.loc[day3_rows, new_feature] * 4

* 可以用 `df['day'].value_counts()` 查看 day 为 3 的样本数：它的量级太少了，如果不把它的计数放大到与其他 day 相同的量级，那么不同 day 之间的计数特征就会有量纲上的差异。

In [None]:
df['day'].value_counts()

* ------------理解代码，不执行----------------------------

* **4.2、比例特征**

In [None]:
# Took about 51 minutes in the original run.
# Cross every media feature with every context/ad feature, count the
# combination, and express that count as a share of each single-feature count.
ratio_feature_list = []
for i in media_cate_feature:
    for j in content_cate_feature + ad_cate_feature:
        df = feature_count(df, [i, j])
        # Ratios are only built when both features have more than 5 distinct values.
        if df[i].nunique() <= 5 or df[j].nunique() <= 5:
            continue
        cross_count = 'count_' + i + '_' + j
        # feature_count returns the frame unchanged when it rejects a cross
        # feature, so the count columns may be absent; skip instead of
        # failing with a KeyError.
        missing = [c for c in (cross_count, 'count_' + i, 'count_' + j) if c not in df.columns]
        if missing:
            print('skip ratio for', i, '&', j, '- missing columns:', missing)
            continue
        ratio_j_of_i = 'ratio_' + j + '_of_' + i
        ratio_i_of_j = 'ratio_' + i + '_of_' + j
        df[ratio_j_of_i] = df[cross_count] / df['count_' + i]
        df[ratio_i_of_j] = df[cross_count] / df['count_' + j]
        ratio_feature_list.append(ratio_j_of_i)
        ratio_feature_list.append(ratio_i_of_j)
        print(i,'&',j)

In [None]:
# 导出数据
# df.to_pickle('df_ratioed.pkl')

In [None]:
#读入数据
df = pd.read_pickle('df_ratioed.pkl')

* ---------------如果跑了比例特征，这些就不需要执行----------------

In [None]:
# Rebuild the list of ratio columns from the column names of the reloaded frame.
ratio_feature_list = [column for column in df.columns if 'ratio' in column]

In [None]:
count_feature_list = list(pd.read_csv('count_list.csv')['0'])

In [None]:
drop_l = ['count_adid_province', 'count_adid_model', 'count_adid_carrier', 'count_adid_devtype', 'count_adid_os', 'count_adid_nnt', 'count_adid_make', 'count_adid_city', 'count_adid_osv'] 

In [None]:
# Remove the names listed in drop_l from the count-feature list.
count_feature_list = [name for name in count_feature_list if name not in drop_l]

* ---------------如果跑了比例特征，这些就不需要执行----------------

* -------“媒体特征”与“广告+上下文特征”进行交叉，并计算比例特征，理解，不需要运行----------------

In [None]:
# 媒体特征
i = 'app_cate_id'

In [None]:
# 上下文特征
j = 'city'

In [None]:
new_feature = 'inf_' + i + '_' + j

In [None]:
# # 【不运行】做了个计数
# df = feature_count(df, [i, j])

In [None]:
df['ratio_' + j + '_of_' + i] = df[ 'count_' + i + '_' + j] / df['count_' + i]
df['ratio_' + i + '_of_' + j] = df['count_' + i + '_' + j] / df['count_' + j]

In [None]:
df.groupby([i,j],as_index=False)['instance_id'].count()

In [None]:
# df[ 'count_' + i + '_' + j] / df['count_' + i], 1986/303758
df.groupby(i,as_index=False)['instance_id'].count()

In [None]:
# df['count_' + i + '_' + j] / df['count_' + j],1986/8663
df.groupby(j,as_index=False)['instance_id'].count()

* -------“媒体特征”与“广告+上下文特征”进行交叉，并计算比例特征，理解，不需要运行----------------

* **4.3特征列表组合，新增统计特征和比例特征的字段**

In [None]:
# Categorical features are the label-encoded originals; numeric features are
# the raw numeric columns plus every count and ratio column built above.
cate_feature = origin_cate_list
num_feature = ['creative_width', 'creative_height', 'hour', *count_feature_list, *ratio_feature_list]
feature = cate_feature + num_feature
print(len(feature), feature)

* **4.4低频过滤**

In [None]:
# Low-frequency filter for every categorical feature that has a count column.
# (`df` is an ordinary DataFrame here, so membership is checked on df.columns.)
for feature in cate_feature:
    count_col = 'count_' + feature
    if count_col not in df.columns:
        continue
    print(feature)
    # Values whose count is below 2 become -1 ...
    rare_rows = df[count_col] < 2
    df.loc[rare_rows, feature] = -1
    # ... and the whole column is shifted by one, so the rare bucket ends up as 0.
    df[feature] = df[feature] + 1

* 对每个分类特征：只要某一行的 `'count_' + feature` 小于 2（也就是这个取值在全部数据里只出现过 1 次），就把这一行的该特征值赋为 -1，因为只出现一次的取值几乎没有可学习的信息；
* 最后把这个特征整体加 1：原值 → 原值 + 1，-1 → 0，这样所有低频取值都被归到类别 0。

* **4.5构建训练和测试集**

In [None]:
label = list(click) + [-1]* (len(df) - len(click))

In [None]:
df['label'] = label

In [None]:
# 测试集数据
predict = df[df.label == -1]

In [None]:
# Result frame holding only the instance_id of the test rows. The explicit
# copy makes it an independent frame, so adding a column below does not
# write into a slice of `predict` (which triggers SettingWithCopyWarning).
predict_result = predict[['instance_id']].copy()

# Accumulator for the fold predictions; starts as a float because
# probabilities are added to it later.
predict_result['predicted_score'] = 0.0

In [None]:
# Test-set feature frame: the unlabelled rows without the placeholder label column.
predict_x = predict.drop(columns='label')
# Training frame: every labelled row (label != -1), clicks and non-clicks alike.
# drop=True discards the old index instead of keeping it as a new column.
train_x = df.loc[df.label != -1].reset_index(drop=True)
# pop removes 'label' from train_x and returns it; keep its values as the target array.
train_y = train_x.pop('label').values

In [None]:
train_x.shape

In [None]:
predict_x.shape

In [None]:
# t = []
# for i in cate_feature:
#     t.append(len(set(train_x[i])))

* **4.6稀疏矩阵建立**

![企业微信截图_16125154901209.png](https://i.loli.net/2021/02/05/Qvl1KOGtRW3hqLo.png)

* **稀疏矩阵的好处**
* 1. 压缩矩阵对象的内存空间
* 2. 加速多数机器学习程序

In [None]:
#训练集_x压缩稀疏矩阵基
base_train_csr = sparse.csr_matrix((len(train_x), 0))

In [None]:
#预测集_x压缩稀疏矩阵基
base_predict_csr = sparse.csr_matrix((len(predict_x), 0))

* **4.7 one-hotencode**

In [None]:
# One-hot encode every categorical column and append the result to the
# sparse base matrices. The stacked matrix is stored with dtype 'bool',
# since one-hot entries are only 0/1; it is cast to float32 later, when the
# numeric features are stacked in front of it.
# The per-column pieces are collected first and stacked once at the end,
# instead of re-stacking the ever-growing matrix on every iteration.
enc = OneHotEncoder()
train_blocks = [base_train_csr]
predict_blocks = [base_predict_csr]
for feature in cate_feature:
    # Fit on train + test together so both share the same category set.
    enc.fit(df[feature].values.reshape(-1, 1))
    train_blocks.append(enc.transform(train_x[feature].values.reshape(-1, 1)))
    predict_blocks.append(enc.transform(predict_x[feature].values.reshape(-1, 1)))
base_train_csr = sparse.hstack(train_blocks, 'csr', 'bool')
base_predict_csr = sparse.hstack(predict_blocks, 'csr', 'bool')
print('one-hot prepared !')

In [None]:
base_train_csr.shape

In [None]:
base_predict_csr.shape

* ------------讲解hstack，不执行-----------------
* 将矩阵按照列进行拼接

In [None]:
from scipy.sparse import coo_matrix, hstack
A = coo_matrix([[1, 2], [3, 4]])
B = coo_matrix([[5,7], [6,8]])

In [None]:
A.toarray()

In [None]:
B.toarray()

In [None]:
hstack([A,B]).toarray()

* ------------讲解hstack，不执行-----------------

* **4.8、user_tags特征**

In [None]:
# Bag-of-words encoding of the `user_tags` text column.
# min_df=20 drops terms that occur in fewer than 20 rows.
cv = CountVectorizer(min_df=20)
for feature in ['user_tags']:
    # Force string type first: the column may hold non-string values.
    df[feature] = df[feature].astype(str)
    cv.fit(df[feature])
    train_tags = cv.transform(train_x[feature].astype(str))
    predict_tags = cv.transform(predict_x[feature].astype(str))
    # Append the tag matrices to the right of the one-hot matrices.
    base_train_csr = sparse.hstack((base_train_csr, train_tags), 'csr', 'bool')
    base_predict_csr = sparse.hstack((base_predict_csr, predict_tags), 'csr', 'bool')
print('cv prepared !')

In [None]:
# min_df = 20表示“忽略少于20个文档中出现的术语”.
cv = CountVectorizer(min_df=20)

In [None]:
base_train_csr.shape

In [None]:
base_predict_csr.shape

In [None]:
# 单个对象的内存大小，单位是Byte,转换为MB
sys.getsizeof(train_x)/1024/1024

In [None]:
# 单个对象的内存大小，单位是Byte
sys.getsizeof(base_train_csr)

* **特征选择**

In [None]:
from sklearn.feature_selection import SelectKBest,SelectPercentile
from sklearn.feature_selection import chi2

In [None]:
# 运行太久了
# SelectPercentile(chi2, percentile=10).fit_transform(train_x, train_y)

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
sel_var = VarianceThreshold(threshold=0.001)

In [None]:
sel_var.fit(base_train_csr)

In [None]:
base_train_csr = sel_var.transform(base_train_csr)

In [None]:
base_predict_csr = sel_var.transform(base_predict_csr)

* **4.9、稀疏矩阵从bool转换成float**

In [None]:
# Final design matrices: the numeric feature columns go first, followed by
# the sparse one-hot / bag-of-words block; the result is cast to float32.
train_num_csr = sparse.csr_matrix(train_x[num_feature])
predict_num_csr = sparse.csr_matrix(predict_x[num_feature])
train_csr = sparse.hstack((train_num_csr, base_train_csr), 'csr').astype('float32')
predict_csr = sparse.hstack((predict_num_csr, base_predict_csr), 'csr').astype('float32')

In [None]:
sys.getsizeof(train_csr)

#### 5、构建模型及交叉验证

In [None]:
# LightGBM binary classifier; the hyper-parameters are gathered in one dict.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=5000,
    learning_rate=0.035,
    num_leaves=61,
    max_depth=-1,
    reg_alpha=3,
    reg_lambda=1,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=2018,
    n_jobs=10,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)
# Stratified 5-fold splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)

* **5.1、模型训练及评估**  
verbose=10 表示训练过程中每 10 轮迭代打印一次 eval_set 上的评估指标

In [None]:
# Took about 27 minutes in the original run.
# Fit the model once per fold, record the best validation log-loss, and
# average the test-set probabilities over all folds.
# NOTE(review): newer LightGBM releases expect early stopping and logging to be
# passed through `callbacks` instead of these fit() keywords - confirm
# against the installed version.
best_score = []
n_folds = skf.get_n_splits()  # read from the splitter instead of hard-coding 5
for index, (train_index, test_index) in enumerate(skf.split(train_csr, train_y)):
    lgb_model.fit(train_csr[train_index], train_y[train_index],
                  eval_set=[(train_csr[train_index], train_y[train_index]),
                            (train_csr[test_index], train_y[test_index])], early_stopping_rounds=200, verbose=10)
    # 'valid_1' is the second eval_set entry, i.e. the held-out fold.
    best_score.append(lgb_model.best_score_['valid_1']['binary_logloss'])
    print(best_score)
    # With early stopping enabled, predict with the best iteration found.
    test_pred = lgb_model.predict_proba(predict_csr, num_iteration=lgb_model.best_iteration_)[:, 1]
    predict_result['predicted_score'] = predict_result['predicted_score'] + test_pred
predict_result['predicted_score'] = predict_result['predicted_score'] / n_folds
mean = predict_result['predicted_score'].mean()
print('mean:', mean)

* **5.2、模型保存**

In [None]:
import pickle

In [None]:
lgb_model

In [None]:
# Save the model to lgb.pickle in the current working directory.
# (If the path is changed to point into a folder such as save/, that folder
# must already exist, otherwise open() raises an error.)
with open('lgb.pickle', 'wb') as f:
    pickle.dump(lgb_model, f)

In [None]:
# Load the model back from lgb.pickle.
# Caution: pickle.load can execute arbitrary code, so only load pickle
# files that you created yourself.
with open('lgb.pickle', 'rb') as f:
    lgb_model2 = pickle.load(f)

In [None]:
# 预测
y_p = lgb_model2.predict(train_csr)

In [None]:
from sklearn.metrics import log_loss

In [None]:
# 预测概率
y_pro = lgb_model2.predict_proba(train_csr)

In [None]:
# Log loss of the reloaded model on the full training matrix. Most of these
# rows were used to fit the model, so this is not a held-out score.
log_loss(train_y,y_pro)