In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold, train_test_split
from DataReader import FeatureDictionary, DataParser
from matplotlib import pyplot as plt
import config
import pickle
from DeepFM import DeepFM
from metrics import Logloss

from sklearn.preprocessing import MinMaxScaler

def load_data(data_path='../data/temp.pkl', train_max_period=32, holdout_period=33, random_state=1024):
    """Load the pickled feature matrix and build the train / validation / test splits.

    Rows with ``click == -1`` are dropped. The remaining rows are split by the
    ``period`` column: rows with ``period <= train_max_period`` form the
    training part, and rows with ``period == holdout_period`` are randomly
    split 50/50 into a validation part and a test part. Rows whose period
    matches neither condition are not used.

    The validation rows are appended *after* the training rows in ``dfTrain``,
    so the last ``len(val)`` rows of ``dfTrain`` are the validation set.

    Args:
        data_path: Path of the pickled DataFrame to read.
        train_max_period: Largest ``period`` value assigned to the training part.
        holdout_period: ``period`` value that is split into validation + test.
        random_state: Seed passed to ``train_test_split`` for the 50/50 split.

    Returns:
        Tuple ``(dfTrain, dfTest, X_train, y_train, val, X_test, ids_test,
        cat_features_indices)`` where ``dfTrain`` / ``dfTest`` hold features,
        ``instance_id`` and ``click``; ``X_train`` / ``X_test`` are the feature
        values as arrays; ``y_train`` is the ``click`` array of ``dfTrain``;
        ``val`` is the validation feature frame; ``ids_test`` is the
        ``instance_id`` array of ``dfTest``; and ``cat_features_indices`` lists
        the positions in the feature columns that appear in
        ``config.CATEGORICAL_COLS``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only read trusted files.
    with open(data_path, 'rb') as file:
        data = pickle.load(file)

    # Feature columns: everything except the id, the label and the ignored columns.
    non_feature_cols = ['instance_id', 'click']
    cols = [c for c in data.columns
            if c not in non_feature_cols and c not in config.IGNORE_COLS]

    # Take an explicit copy: the scaled columns are assigned back below, and
    # assigning into a filtered slice of `data` triggers SettingWithCopyWarning.
    total_train = data[data.click != -1].copy()

    # Min-max scale the numeric features.
    # NOTE(review): the scaler is fit on all labelled rows, including the
    # holdout period, so validation/test statistics influence the scaling.
    if len(config.NUMERIC_COLS) > 0:
        mms = MinMaxScaler()
        total_train[config.NUMERIC_COLS] = mms.fit_transform(total_train[config.NUMERIC_COLS])

    # Time-based split: build each mask once and reuse it for features and labels.
    is_train = total_train.period <= train_max_period
    is_holdout = total_train.period == holdout_period
    feature_cols = cols + ['instance_id']

    train = total_train.loc[is_train, feature_cols]
    train_y = total_train.loc[is_train, 'click']
    val_and_test = total_train.loc[is_holdout, feature_cols]
    val_and_test_y = total_train.loc[is_holdout, 'click']
    val, test, val_y, test_y = train_test_split(
        val_and_test, val_and_test_y, test_size=0.5, random_state=random_state)

    # Validation rows go after the training rows; downstream code slices them
    # back off the tail of dfTrain.
    dfTrain_fea = pd.concat((train, val), axis=0)
    dfTrain_y = pd.concat((train_y, val_y), axis=0)
    dfTrain = pd.concat((dfTrain_fea, dfTrain_y), axis=1)
    dfTest = pd.concat((test, test_y), axis=1)

    X_train = dfTrain[cols].values
    y_train = dfTrain['click'].values

    X_test = dfTest[cols].values
    ids_test = dfTest['instance_id'].values

    cat_features_indices = [i for i, c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain, dfTest, X_train, y_train, val, X_test, ids_test, cat_features_indices

# Load the splits once at notebook level. `val` is kept as a global because
# run_base_model_dfm reads its row count to separate validation rows from dfTrain.
dfTrain, dfTest, X_train, y_train, val, X_test, ids_test, cat_features_indices = load_data()

  from ._conv import register_converters as _register_converters


In [2]:
# Keyword arguments forwarded to DeepFM(**params). The remaining required
# entries, feature_size and field_size, are filled in by run_base_model_dfm
# once the feature dictionary has been built.
dfm_params = dict(
    # Which model components are enabled.
    use_fm=True,
    use_deep=True,
    # Model structure.
    embedding_size=8,
    dropout_fm=[1.0, 1.0],
    deep_layers=[32, 32],
    dropout_deep=[0.5, 0.5, 0.5],
    deep_layer_activation=tf.nn.relu,
    # Optimisation.
    epoch=30,
    batch_size=256,
    learning_rate=0.001,
    optimizer="adam",
    batch_norm=1,
    batch_norm_decay=0.995,
    l2_reg=0.01,
    # Logging and evaluation: Logloss is the metric, so lower is better.
    verbose=True,
    greater_is_better=False,
    eval_metric=Logloss,
    random_seed=config.RANDOM_SEED,
)

In [3]:
def run_base_model_dfm(dfTrain, dfTest, dfm_params, n_valid=None):
    """Train a DeepFM-family model, report validation log loss and write test predictions.

    The last ``n_valid`` rows of ``dfTrain`` are used as the validation set and
    the rows before them as the training set. Test-set predictions are written
    to ``config.SUB_DIR`` via ``_make_submission`` in a file named
    ``<model>_loss<validation loss>.csv``.

    Args:
        dfTrain: Training frame (features, ``instance_id`` and label) with the
            validation rows at the end.
        dfTest: Test frame to predict.
        dfm_params: Keyword arguments for ``DeepFM``. The dict is not modified;
            ``feature_size`` and ``field_size`` are added to a private copy.
        n_valid: Number of trailing ``dfTrain`` rows that form the validation
            set. Defaults to the row count of the module-level ``val`` frame.

    Returns:
        Array of shape ``(len(dfTest), 1)`` with the test-set predictions.

    Raises:
        ValueError: If neither ``use_fm`` nor ``use_deep`` is enabled, or if
            ``n_valid`` is outside ``[0, number of parsed training rows]``.
    """
    # Resolve the model name first so a bad configuration fails before training
    # rather than as an unbound-variable error afterwards.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        raise ValueError("At least one of dfm_params['use_fm'] / dfm_params['use_deep'] must be True")

    if n_valid is None:
        # Backward-compatible default: the global validation frame from load_data().
        n_valid = val.shape[0]

    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Xi_*: feature indices per row; Xv_*: the corresponding feature values.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    print(dfTrain.dtypes)

    n_rows = len(Xi_train)
    if not 0 <= n_valid <= n_rows:
        raise ValueError("n_valid must be between 0 and %d, got %r" % (n_rows, n_valid))

    # Split off the validation tail with an explicit index; negative slicing
    # (`[:-n]` / `[-n:]`) gives the wrong halves when n == 0.
    split = n_rows - n_valid
    Xi_train_, Xv_train_, y_train_ = Xi_train[:split], Xv_train[:split], y_train[:split]
    Xi_valid_, Xv_valid_, y_valid_ = Xi_train[split:], Xv_train[split:], y_train[split:]

    # Work on a copy so the caller's dict can be reused for other model variants.
    model_params = dict(dfm_params)
    model_params['feature_size'] = fd.feat_dim
    model_params['field_size'] = len(Xi_train[0])

    y_val_meta = np.zeros((n_valid, 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    # Train with early stopping on the validation split.
    dfm = DeepFM(**model_params)
    dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_, early_stopping=True)

    # Validation predictions and loss.
    y_val_meta[:, 0] += dfm.predict(Xi_valid_, Xv_valid_)
    losses = Logloss(y_valid_, y_val_meta[:, 0])
    print('验证集loss为: %.4f' %losses)

    # Test predictions.
    y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

    # Save the test predictions; the file name records the validation loss.
    filename = "%s_loss%.4f.csv"%(clf_str, losses)
    _make_submission(ids_test, y_test_meta, filename)

    return y_test_meta

def _make_submission(ids, y_pred, filename="submission.csv"):
    """Write predictions to ``config.SUB_DIR/filename`` as a two-column CSV.

    Args:
        ids: Values for the ``instance_id`` column.
        y_pred: Predictions for the ``click`` column; flattened to 1-D, so an
            ``(n, 1)`` array is accepted.
        filename: Name of the CSV file created inside ``config.SUB_DIR``.
    """
    # Create the output directory if needed so to_csv does not fail on a fresh checkout.
    if config.SUB_DIR:
        os.makedirs(config.SUB_DIR, exist_ok=True)

    submission = pd.DataFrame({"instance_id": ids, "click": np.asarray(y_pred).ravel()})
    submission.to_csv(
        os.path.join(config.SUB_DIR, filename), index=False, float_format="%.5f")

In [4]:
# ------------------ DeepFM Model ------------------
# Train with both the FM and deep components enabled; the result holds the
# test-set predictions returned by run_base_model_dfm.
y_test_dfm = run_base_model_dfm(dfTrain, dfTest, dfm_params)


# ------------------ FM Model ------------------
# fm_params = dfm_params.copy()
# fm_params["use_deep"] = False
# y_test_fm = run_base_model_dfm(dfTrain, dfTest, fm_params)


# ------------------ DNN Model ------------------
# dnn_params = dfm_params.copy()
# dnn_params["use_fm"] = False
# y_test_dnn = run_base_model_dfm(dfTrain, dfTest, dnn_params)

[ 553   98  190 ... 2065 2053 1907]
[17 11 16 21  6  0 18 22 19  5 12 14  8 13 10  3 20  4  1  7  9 24 23 15]
[25 32  0 20  6 14 11 12  8 10 19 30 13 18 33 27  9 26 16 17 23 28  3  7
 15  4 22 29  5 34 31 24  2 21]
[ 8  9  1  4 12  6 20 10 16  3 22 11 14  2 21  5 18  0 19  7 13 17 15]
[233  17 255 157   5 162 138 117  67 131   7 240  28  46 427 122  14 197
 158 305 103 165 151 208  29  45   4  68 262 166 153 234 124 121  30 227
 426 421  91 407  12 155 111 133 125  95  47 132  66  76 126 436 437 107
  65 242  42  73  23  96 120  22 221 239 429   8   6  88 139  31 236  52
   3 163 209  54  74 241   0 251   2 285 146  59 154 167 250 164  79 195
 318 142 152  86 246 266 137 264 196 245  75  15 235 418 150 269 410 311
 223 237  18 413  84 114 271 123 253 189 263  93 175 433 260   9  56 261
 259 160 247 193  38 109 243 102 287  19  77 187 130  11  63 423 301 135
  39  21  80 159  25 129 118 113 330 293 136  44 238  72  13  55  41 145
  82 415 416 291  24  43 289 156 304 254 336  99 203  26 

adid                       int64
advert_industry_inner      int64
advert_name                int64
app_cate_id                int64
app_id                     int64
campaign_id                int64
carrier                    int64
city                       int64
creative_has_deeplink      int64
creative_height            int64
creative_id                int64
creative_is_download       int64
creative_is_jump           int64
creative_tp_dnf            int64
creative_type              int64
creative_width             int64
devtype                    int64
f_channel                  int64
inner_slot_id              int64
nnt                        int64
orderid                    int64
os                         int64
province                   int64
sim_ip                     int64
clear_make                 int64
clear_model                int64
clear_osv                  int64
instance_id                int64
click                    float64
dtype: object
#params: 68827
[1] train-resu

In [5]:
# Evaluate the log loss of the DeepFM predictions on the held-out test split.
# Alternative: reload a saved submission instead of using the in-memory predictions.
# test_pred = pd.read_csv('./output/DeepFM_loss0.4311.csv', index_col = False)
print(' 测试集loss为： %.4f' %Logloss(dfTest['click'].values, y_test_dfm[:,0]))

 测试集loss为： 0.4276
