In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold, train_test_split
from matplotlib import pyplot as plt
import config
import pickle
from DataLoader import FeatureDictionary, DataParser
from DCN import DCN
from metrics import Logloss

from sklearn.preprocessing import MinMaxScaler

def load_data():
    ## 读取特征矩阵
    with open('../data/temp.pkl', 'rb') as file:
        data = pickle.load(file)
    
    cols = [c for c in data.columns if c not in ['instance_id','click']]#提取特征集合
    cols = [c for c in cols if (not c in config.IGNORE_COLS)]
    
    ### 为了测量结果，使用Logloss衡量，且将训练集划分为：训练集+测试集+验证集(按照时间划分)
    ### period小于33的为训练集，period=33随即划分为验证集+测试集
    total_train = data[data.click != -1]
    
    #将数值特征归一化
    if config.NUMERIC_COLS != []:
        mms = MinMaxScaler()
        total_train[config.NUMERIC_COLS] = mms.fit_transform(total_train[config.NUMERIC_COLS])
    
    train = total_train[total_train.period <= 32][cols+['instance_id']]
    train_y = total_train[total_train.period <= 32]['click'] ##标签
    val_and_test = total_train[total_train.period == 33][cols+['instance_id']]
    val_and_test_y = total_train[total_train.period == 33]['click']
    val, test, val_y, test_y = train_test_split(val_and_test, val_and_test_y, test_size=0.5, random_state=1024)

    dfTrain_fea = pd.concat((train, val), axis = 0)
    dfTrain_y = pd.concat((train_y, val_y), axis = 0)
    dfTrain = pd.concat((dfTrain_fea, dfTrain_y), axis = 1)
    dfTest = pd.concat((test, test_y), axis = 1)



    X_train = dfTrain[cols].values
    y_train = dfTrain['click'].values

    X_test = dfTest[cols].values
    ids_test = dfTest['instance_id'].values

    cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain,dfTest,X_train,y_train,val, X_test,ids_test,cat_features_indices

# load data
dfTrain, dfTest, X_train, y_train, val, X_test, ids_test, cat_features_indices = load_data()

  from ._conv import register_converters as _register_converters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
dcn_params = {

    "embedding_size": 8,
    "deep_layers": [32, 32],
    "dropout_deep": [0.5, 0.5, 0.5],
    "deep_layers_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 256,
    "learning_rate": 0.001,
    "optimizer_type": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "random_seed": config.RANDOM_SEED,
    "cross_layer_num":3,
    "eval_metric":Logloss,
    'greater_is_better':False,
}

In [3]:
def run_base_model_dcn(dfTrain,dfTest,dcn_params):
    fd = FeatureDictionary(dfTrain,dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols = config.IGNORE_COLS)
    data_parser = DataParser(feat_dict= fd)
    # Xi_train ：列的序号
    # Xv_train ：列的对应的值
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, numeric_Xv_train,y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, numeric_Xv_test,ids_test = data_parser.parse(df=dfTest)

    dcn_params["cate_feature_size"] = fd.feat_dim
    dcn_params["field_size"] = len(Xi_train[0])
    dcn_params['numeric_feature_size'] = len(config.NUMERIC_COLS)

    print(dfTrain.dtypes)
    #将Xi_train分为训练集+验证集
    Xi_train_, Xv_train_, numeric_Xv_train_, y_train_ =  Xi_train[:-val.shape[0]], Xv_train[:-val.shape[0]],numeric_Xv_train[:-val.shape[0]], y_train[:-val.shape[0]]
    Xi_valid_, Xv_valid_, numeric_Xv_valid_, y_valid_ =  Xi_train[-val.shape[0]:], Xv_train[-val.shape[0]:], numeric_Xv_train[-val.shape[0]:], y_train[-val.shape[0]:]

    
    y_val_meta = np.zeros((val.shape[0],1),dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0],1),dtype=float)
    #开始训练
    dcn = DCN(**dcn_params)
    dcn.fit(Xi_train_, Xv_train_, numeric_Xv_train_,y_train_, Xi_valid_, Xv_valid_, numeric_Xv_valid_,y_valid_, early_stopping=True)
    y_val_meta[:,0] += dcn.predict(Xi_valid_, Xv_valid_, numeric_Xv_valid_)  #预测验证集
    losses = Logloss(y_valid_, y_val_meta[:,0])##验证集loss
    print('验证集loss为: %.4f' %losses)
    
    y_test_meta[:,0] += dcn.predict(Xi_test, Xv_test, numeric_Xv_test)  #预测测试集

    filename = "%s_loss%.4f.csv"%('DCN', losses)
    _make_submission(ids_test, y_test_meta, filename)
#     _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_test_meta

def _make_submission(ids, y_pred, filename="submission.csv"):
    pd.DataFrame({"instance_id": ids, "click": y_pred.flatten()}).to_csv(
        os.path.join(config.SUB_DIR, filename), index=False, float_format="%.5f")

In [9]:
# # ------------------ DCN Model ------------------
##y_train_dfm,y_test_dfm = run_base_model_dfm(dfTrain,dfTest,folds,dfm_params)
print('开始训练')
y_test_dfm = run_base_model_dcn(dfTrain, dfTest, dcn_params)


开始训练
adid                            int64
advert_industry_inner           int64
advert_name                     int64
app_cate_id                     int64
app_id                          int64
campaign_id                     int64
carrier                         int64
city                            int64
creative_has_deeplink           int64
creative_height                 int64
creative_id                     int64
creative_is_download            int64
creative_is_jump                int64
creative_tp_dnf                 int64
creative_type                   int64
creative_width                  int64
devtype                         int64
f_channel                       int64
inner_slot_id                   int64
nnt                             int64
orderid                         int64
os                              int64
province                        int64
sim_ip                          int64
clear_make                      int64
clear_model                     int64
clear_o

In [10]:
### 评判测试集的logloss
print(' 测试集loss为： %.4f' %Logloss(dfTest['click'].values, y_test_dfm[:,0]))

 测试集loss为： 0.4284
