In [1]:
import datetime
import os
import time
from concurrent.futures import ProcessPoolExecutor
from math import ceil

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve




In [2]:
def get_processed_data(data_dir='data_preprocessed_2'):
    """Load the three preprocessed CSV files and return the frames used downstream.

    Each file is de-duplicated on its own, ``ProcessDataSet1.csv`` and
    ``ProcessDataSet2.csv`` are stacked row-wise, and every missing value is
    replaced with 0.

    Args:
        data_dir: Directory holding ``ProcessDataSet1.csv``,
            ``ProcessDataSet2.csv`` and ``ProcessDataSet3.csv``. Defaults to
            the location that was previously hard-coded.

    Returns:
        tuple: ``(dataset12, dataset3)`` where ``dataset12`` is the
        concatenation of the first two files and ``dataset3`` is the third.

    Notes:
        * De-duplication happens per file, before concatenation, so a row that
          appears in both file 1 and file 2 is kept twice.
        * The concatenation keeps each file's own index labels, so
          ``dataset12.index`` may contain repeated labels.
    """
    file_names = ('ProcessDataSet1.csv', 'ProcessDataSet2.csv', 'ProcessDataSet3.csv')
    dataset1, dataset2, dataset3 = (
        pd.read_csv(os.path.join(data_dir, file_name)).drop_duplicates()
        for file_name in file_names
    )
    # Fill missing values with 0 (non-inplace so no intermediate frame is mutated).
    dataset12 = pd.concat([dataset1, dataset2], axis=0).fillna(0)
    return dataset12, dataset3.fillna(0)
# Load once at notebook level: dataset12 (files 1+2 stacked) and dataset3 are reused by the cells below.
dataset12, dataset3 = get_processed_data()

In [3]:
# Worker-thread count passed to XGBoost as `nthread`: all logical CPUs but one.
# os.cpu_count() returns None when the count cannot be determined, and a
# single-core host would yield 0, so fall back to 1 and clamp to at least 1.
cpu_jobs = max(1, (os.cpu_count() or 1) - 1)

In [20]:
# Evaluation metric: ROC AUC averaged over coupons.
def myauc(test):
    """Return the unweighted mean of the per-coupon ROC AUC values.

    Args:
        test: DataFrame with the columns ``Coupon_id`` (grouping key),
            ``label`` (ground truth, positive class is 1) and ``pred``
            (predicted score).

    Returns:
        The average of the AUCs computed separately for every ``Coupon_id``
        group. Groups whose ``label`` column does not hold exactly two
        distinct values are skipped because no ROC curve is computed for them.
        If no group qualifies, NaN is returned.
    """
    aucs = []
    for _, coupon_rows in test.groupby('Coupon_id'):
        if len(coupon_rows['label'].unique()) != 2:
            continue
        fpr, tpr, _ = roc_curve(coupon_rows['label'], coupon_rows['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    if not aucs:
        # Averaging an empty list would also give NaN, but only after emitting
        # "mean of empty slice" RuntimeWarnings; return it explicitly instead.
        return float('nan')
    return np.average(aucs)


def train_xgb(dataset12, dataset3, num_boost_round=9999,
              model_path='train_dir_2/xgbmodel',
              preds_path='train_dir_2/xgb_preds.csv'):
    """Train an XGBoost binary classifier on ``dataset12`` and score ``dataset3``.

    Side effects:
        * saves the trained booster to ``model_path`` and reloads it from there;
        * writes ``User_id, Coupon_id, Date_received, label`` rows for
          ``dataset3`` to ``preds_path`` (no header, no index), where ``label``
          is the min-max scaled prediction and rows are sorted by
          ``Coupon_id`` then ``label``;
        * prints the round count, XGBoost's per-round training AUC and the
          per-coupon AUC (``myauc``) measured on the training rows.

    Args:
        dataset12: Labelled training frame. Must contain ``label`` plus the
            non-feature columns dropped below.
        dataset3: Frame to score. Must contain the same non-feature columns
            except ``Date`` and ``label``.
        num_boost_round: Number of boosting rounds. The default keeps the
            value that was previously hard-coded.
        model_path: Where the booster is saved.
        preds_path: Where the prediction CSV is written.

    Returns:
        None.
    """
    # Create the output directories up front so a missing folder is not
    # discovered only after the (long) training run, when save_model is called.
    for out_path in (model_path, preds_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Key columns of the submission; the receive date is reformatted to YYYYMMDD.
    predict_dataset = dataset3[['User_id', 'Coupon_id', 'Date_received']].copy()
    predict_dataset['Date_received'] = pd.to_datetime(
        predict_dataset['Date_received'], format='%Y-%m-%d').dt.strftime('%Y%m%d')

    # Drop the non-feature columns, then convert to DMatrix format.
    non_feature_columns = ['User_id', 'Merchant_id', 'Discount_rate', 'Date_received',
                           'discount_rate_x', 'discount_rate_y', 'Coupon_id']
    dataset12_x = dataset12.drop(columns=non_feature_columns + ['Date', 'label'])
    dataset3_x = dataset3.drop(columns=non_feature_columns)

    train_dmatrix = xgb.DMatrix(dataset12_x, label=dataset12.label)
    predict_dmatrix = xgb.DMatrix(dataset3_x)

    # XGBoost training parameters.
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 0.1,
              'min_child_weight': 3.1,
              'max_depth': 8,
              'lambda': 10,
              'subsample': 0.9,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.01,
              'tree_method': 'gpu_hist',
              'seed': 1024,
              'nthread': cpu_jobs,
              'predictor': 'gpu_predictor'
              }

    # The round count can be tuned with xgb.cv (early stopping) and passed in
    # through `num_boost_round`.
    num_round_best = num_boost_round
    print('Best round num: ', num_round_best)

    # Train while reporting the metric on the training matrix each round.
    watchlist = [(train_dmatrix, 'train')]
    model = xgb.train(params, train_dmatrix, num_boost_round=num_round_best, evals=watchlist)

    # Persist the booster and continue with the copy loaded back from disk.
    model.save_model(model_path)
    model = xgb.Booster(params)
    model.load_model(model_path)

    # Predict the frame to score.
    dataset3_predict = predict_dataset.copy()
    dataset3_predict['label'] = model.predict(predict_dmatrix)

    # Min-max scale the predictions to [0, 1]; ravel() keeps the column 1-D.
    dataset3_predict['label'] = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
        dataset3_predict['label'].values.reshape(-1, 1)).ravel()
    dataset3_predict = dataset3_predict.sort_values(by=['Coupon_id', 'label'])
    dataset3_predict.to_csv(preds_path, index=False, header=False)

    # NOTE: this AUC is computed on the rows the model was trained on, so it is
    # an optimistic figure rather than a held-out estimate.
    temp = dataset12[['Coupon_id', 'label']].copy()
    temp['pred'] = model.predict(train_dmatrix)
    temp['pred'] = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
        temp['pred'].values.reshape(-1, 1)).ravel()
    print(myauc(temp))

# Run the full train / save / predict / evaluate pipeline on the frames loaded above.
train_xgb(dataset12, dataset3)

Best round num:  9999
[0]	train-auc:0.85386
[1]	train-auc:0.86489
[2]	train-auc:0.87238
[3]	train-auc:0.87432
[4]	train-auc:0.87540
[5]	train-auc:0.87738
[6]	train-auc:0.87756
[7]	train-auc:0.87829
[8]	train-auc:0.87863
[9]	train-auc:0.87875
[10]	train-auc:0.87877
[11]	train-auc:0.87985
[12]	train-auc:0.88033
[13]	train-auc:0.88040
[14]	train-auc:0.88055
[15]	train-auc:0.88057
[16]	train-auc:0.88099
[17]	train-auc:0.88170
[18]	train-auc:0.88203
[19]	train-auc:0.88221
[20]	train-auc:0.88217
[21]	train-auc:0.88270
[22]	train-auc:0.88274
[23]	train-auc:0.88294
[24]	train-auc:0.88298
[25]	train-auc:0.88351
[26]	train-auc:0.88360
[27]	train-auc:0.88362
[28]	train-auc:0.88377
[29]	train-auc:0.88380
[30]	train-auc:0.88383
[31]	train-auc:0.88380
[32]	train-auc:0.88389
[33]	train-auc:0.88416
[34]	train-auc:0.88435
[35]	train-auc:0.88436
[36]	train-auc:0.88465
[37]	train-auc:0.88463
[38]	train-auc:0.88475
[39]	train-auc:0.88498
[40]	train-auc:0.88504
[41]	train-auc:0.88507
[42]	train-auc:0.88524

In [5]:
# Identifier columns of the rows to score, with the receive date rewritten
# from YYYY-MM-DD to YYYYMMDD.
predict_dataset = dataset3[['User_id', 'Coupon_id', 'Date_received']].copy()
predict_dataset['Date_received'] = pd.to_datetime(
    predict_dataset['Date_received'], format='%Y-%m-%d'
).dt.strftime('%Y%m%d')

# Feature frames: everything except the columns listed here.
dataset12_x = dataset12.drop(columns=[
    'User_id', 'Merchant_id', 'Discount_rate', 'Date_received',
    'discount_rate_x', 'discount_rate_y', 'Date', 'Coupon_id', 'label',
])
dataset3_x = dataset3.drop(columns=[
    'User_id', 'Merchant_id', 'Discount_rate', 'Date_received',
    'discount_rate_x', 'discount_rate_y', 'Coupon_id',
])

# Wrap the training features and labels in XGBoost's DMatrix format.
train_dmatrix = xgb.DMatrix(dataset12_x, label=dataset12['label'])

In [None]:
# Baseline XGBoost parameter set (max_depth=5, min_child_weight=1.1).
# NOTE(review): this cell used to end with a bare `90.7` expression, which only
# echoed the number when the cell ran. Presumably it records the score obtained
# with these settings -- TODO confirm what it measures.
params = {'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 0.1,
        'min_child_weight': 1.1,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.9,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.01,
        'tree_method': 'gpu_hist',
        'seed': 1024,
        'nthread': cpu_jobs,
        'predictor': 'gpu_predictor'
        }

In [18]:

# Starting parameter set; the search below overwrites subsample, max_depth and
# min_child_weight in place for each trial.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'gamma': 0.1,
    'min_child_weight': 1.1,
    'max_depth': 5,
    'lambda': 10,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'eta': 0.01,
    'tree_method': 'gpu_hist',
    'seed': 1024,
    'nthread': cpu_jobs,
    'predictor': 'gpu_predictor',
}

# Exhaustive search: every (subsample, max_depth, min_child_weight) combination
# is scored with 5-fold CV and early stopping, and the last row of the CV table
# is printed for each trial.
for subsample, max_depth, min_child_weight in (
        (s, d, w) for s in [0.9] for d in [6, 7, 8, 9] for w in [3.1, 4.1, 5.5]):
    params.update(min_child_weight=min_child_weight, max_depth=max_depth)
    print(subsample, max_depth, min_child_weight)
    params["subsample"] = subsample
    cvresult = xgb.cv(
        params,
        train_dmatrix,
        num_boost_round=10000,
        nfold=5,
        metrics='auc',
        seed=123123,
        early_stopping_rounds=50,
    )
    print(cvresult.values[-1])

0.9 6 3.1
[9.38607460e-01 4.88377697e-04 9.08253977e-01 1.66692136e-03]
0.9 6 4.1
[9.38061527e-01 4.59441101e-04 9.08149167e-01 1.65773179e-03]
0.9 6 5.5
[9.37387561e-01 4.82101801e-04 9.08133654e-01 1.60140332e-03]
0.9 7 3.1
[9.41349185e-01 5.39836223e-04 9.08737884e-01 1.59772713e-03]
0.9 7 4.1
[9.40570868e-01 5.54597452e-04 9.08688473e-01 1.58903224e-03]
0.9 7 5.5
[9.40702100e-01 5.15370829e-04 9.08689784e-01 1.61655988e-03]
0.9 8 3.1
[9.45161565e-01 4.05866622e-04 9.09079327e-01 1.54544143e-03]
0.9 8 4.1
[9.44078950e-01 4.48849124e-04 9.09063018e-01 1.54182146e-03]
0.9 8 5.5


KeyboardInterrupt: 

In [9]:
# Display the per-round CV table currently held in `cvresult` (assigned by xgb.cv).
# NOTE(review): the saved output carries execution count 9, lower than the
# grid-search cell's 18, so it presumably comes from an earlier run -- re-run to refresh.
cvresult

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.826908,0.001787,0.826187,0.004729
1,0.845996,0.000988,0.845106,0.004305
2,0.865333,0.001987,0.864594,0.002584
3,0.867544,0.001753,0.866437,0.002743
4,0.868978,0.001421,0.867868,0.002387
...,...,...,...,...
348,0.931631,0.000511,0.904971,0.001837
349,0.931679,0.000521,0.904973,0.001837
350,0.931725,0.000521,0.904983,0.001832
351,0.931769,0.000509,0.904992,0.001830


In [None]:
# Identifier columns of the rows to score; Date_received is parsed as YYYY-MM-DD
# and re-emitted as a YYYYMMDD string.
predict_dataset = dataset3[['User_id', 'Coupon_id', 'Date_received']].copy()
predict_dataset.Date_received = pd.to_datetime(predict_dataset.Date_received, format='%Y-%m-%d')
predict_dataset.Date_received = predict_dataset.Date_received.dt.strftime('%Y%m%d')

# Prepare the data for conversion to DMatrix format: drop the columns listed below
# (including the label) from the training frame.
dataset12_x = dataset12.drop(
    columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                'Date', 'Coupon_id', 'label'], axis=1)