### Stacking及增量学习

In [1]:
import pandas as pd
df = pd.read_csv('./ccf_offline_stage1_train-Copy1.csv', nrows = 200000)
df.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,
2,1439408,2632,8591.0,20:1,0.0,20160217.0,
3,1439408,2632,1078.0,20:1,0.0,20160319.0,
4,1439408,2632,8591.0,20:1,0.0,20160613.0,


#### 搭建两层Stacking模型，第一层模型由逻辑回归/随机森林/GBDT组成，第二层模型使用逻辑回归组合第一层模型。评价该Stacking模型，并与单模型质量对比

In [2]:
df['Discount_rate']=df['Discount_rate'].fillna(0)
df['Discount_rate_new']=df['Discount_rate'].apply(lambda x:x if ':' not in str(x) else ((float(x.split(':')[0])-float(x.split(':')[1]))/float(x.split(':')[0])))
df['Discount_rate_new']=df['Discount_rate_new'].apply(lambda x:round(float(x),2))
from sklearn.model_selection import train_test_split
df['label']=df['Date'].apply(lambda x:1 if x==x else 0)
y=df['label']
X=df
df['label']=df['Date'].apply(lambda x:1 if x==x else 0)
y=df['label']
selected_features=['Merchant_id','Coupon_id','Discount_rate_new','Distance']
x_train,x_test,y_train,y_test=train_test_split(X.loc[:,selected_features].fillna(0),y,test_size=0.3,random_state=0)




In [4]:
#组合模型
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

clfs = [LogisticRegression(),
       RandomForestClassifier(n_estimators=50, max_depth = 15, criterion='gini'),
       GradientBoostingClassifier(n_estimators=500, max_depth=8, learning_rate=0.1)]

skf = StratifiedKFold(10)

In [5]:
dataset_stacking_train = np.zeros((x_train.shape[0], len(clfs)))   
dataset_stacking_test = np.zeros((x_test.shape[0], len(clfs)))

In [6]:
np.zeros((x_test.shape[0], len(clfs))) #(几行几列)

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [7]:
np.zeros((x_test.shape[0],)) #(n列，1行)

array([0., 0., 0., ..., 0., 0., 0.])

In [8]:
#每个模型依次训练

for i, clf in enumerate(clfs):
    dataset_stacking_test_i = np.zeros((x_test.shape[0],))  
    for j, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
        X_train, Y_train, X_test, Y_test = x_train.iloc[train_index], y_train.iloc[train_index], x_train.iloc[test_index], y_train.iloc[test_index]
        clf.fit(X_train, Y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]  #预测概率有两项，前一项为所有为0的概率，后一项为所有为1的概率
        dataset_stacking_train[test_index, i] = y_submission #找出每个模型下对应的训练集下交叉选择的测试集
        dataset_stacking_test_i = np.c_[dataset_stacking_test_i, clf.predict_proba(x_test)[:, 1]] #将np.c_行合并
    dataset_stacking_test_i = np.delete(dataset_stacking_test_i, 0, axis=1) #删除掉0索引列
    dataset_stacking_test[:, i] = dataset_stacking_test_i.mean(1) #得出每一列的平均值，返回一行多列
    print("val auc Score: %f" % roc_auc_score(y_test, dataset_stacking_test[:, i]))

val auc Score: 0.952832
val auc Score: 0.985474
val auc Score: 0.985126


In [9]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(dataset_stacking_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
y_submission = clf.predict_proba(dataset_stacking_test)[:,1]
print("Linear stretch of predictions to [0,1]")
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
print("blend result")
print("val auc Score: %f" % (roc_auc_score(y_test, y_submission)))

Linear stretch of predictions to [0,1]
blend result
val auc Score: 0.984639


In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve, f1_score
import os
os.system('pip install lightgbm')
import lightgbm as lgb


In [12]:
#模型融合中使用到的单个模型
rf_params = {'n_estimators':5,
           'n_jobs' :-1,
           'criterion': 'entropy'}

lgb_params = {
    'learning_rate':0.1,
    'max_depth':4,
    'objective':'binary',
    'n_estimators':300
}

clfs = { LogisticRegression(),
        RandomForestClassifier(**rf_params),
        lgb.LGBMClassifier(**lgb_params)}

In [13]:
'''切分一部分数据作为测试集'''
dataset_stacking_train = np.zeros((x_train.shape[0], len(clfs)))
dataset_stacking_test = np.zeros((x_test.shape[0], len(clfs)))
'''5折stacking'''
n_folds = 5
skf = StratifiedKFold(n_folds)

In [15]:
for j, clf in enumerate(clfs):
    '''依次训练各个单模型'''
    print(j, clf)
    dataset_stacking_test_j = np.zeros((x_test.shape[0], len(x_train)), dtype='float16')#默认data type float64总是遇到空间不足的问题
    for i, (train_index, test_index) in enumerate(skf.split(x_train,y_train)):
        '''使用第i个部分作为预测，剩余的部分来训练模型，获得其预测的输出作为第i部分的新特征。'''
        print("Fold",i)
        fX_train, fy_train, fX_test, fy_test = x_train.iloc[train_index,:], y_train.iloc[train_index], x_train.iloc[test_index,:], y_train.iloc[test_index]
        clf.fit(fX_train, fy_train)
        y_submission = clf.predict_proba(fX_test)[:,1]
        dataset_stacking_train[test_index,j] = y_submission
        dataset_stacking_test_j[:,i] = clf.predict_proba(x_test)[:,1]
    '''对于测试集，直接用这k个模型的预测值均值作为新的特征。'''
    dataset_stacking_test[:, j] = dataset_stacking_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(y_test, dataset_stacking_test[:, j]))
    #问题出在cup内存不足，

0 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
val auc Score: 0.952236
1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
val auc Score: 0.982133
2 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=4,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        

In [16]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(dataset_stacking_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
y_submission = clf.predict_proba(dataset_stacking_test)[:, 1]
print("Linear stretch of predictions to [0,1]")
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
print("blend result")
print("val auc Score: %f" % (roc_auc_score(y_test, y_submission)))

Linear stretch of predictions to [0,1]
blend result
val auc Score: 0.985206


#### 顺次使用每天的折扣券使用情况作为模型输入，使用逻辑回归增量学习每天的数据

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_test_scaled = scaler.transform(x_test)
reg = SGDRegressor()

#没有用时间增量，只是选取了X_train的部分数据试了一下增量学习
for _ in range(40):
    idx_list = list(range(len(X_train_scaled)))
    reg.partial_fit(X_train_scaled[idx_list[:100],:],y_train.values[idx_list[:100]])
    print(_,reg.score(X_train_scaled, y_train), reg.score(X_test_scaled, y_test))
    print('\t\t用SGD 进行预测得到的auc：',roc_auc_score(y_train,reg.predict(X_train_scaled)))

0 0.1997173361737552 0.19270561634491312
		用SGD 进行预测得到的auc： 0.9616141622882236
1 0.4703791915574348 0.4640943598970658
		用SGD 进行预测得到的auc： 0.9615008926086365
2 0.5936525076545913 0.5878315759159289
		用SGD 进行预测得到的auc： 0.9614662023058832
3 0.6602660172866648 0.6547581579663577
		用SGD 进行预测得到的auc： 0.961388518708937
4 0.7001941464072234 0.6949222204037162
		用SGD 进行预测得到的auc： 0.9613346915915212
5 0.7263010760014483 0.7212228372374424
		用SGD 进行预测得到的auc： 0.9613080212824937
6 0.7440615505526917 0.7391307155312152
		用SGD 进行预测得到的auc： 0.9612775554503644
7 0.7568537948259063 0.7520423892363454
		用SGD 进行预测得到的auc： 0.9612242338782422
8 0.7664444181312011 0.7617443885700022
		用SGD 进行预测得到的auc： 0.9612115490802152
9 0.7738576969779756 0.7692477574872236
		用SGD 进行预测得到的auc： 0.9612344447752619
10 0.7796642493550723 0.7751208830468006
		用SGD 进行预测得到的auc： 0.961187824473065
11 0.7843730689548587 0.7798891246985277
		用SGD 进行预测得到的auc： 0.9611887823592543
12 0.7883730850951647 0.7839481776279451
		用SGD 进行预测得到的auc： 0.9