# 以最优参数训练所有数据

In [1]:
from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd 
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

## 读取数据

In [2]:
# Directory that holds the feature-engineered CSV files
dpath = './data/'
# Full path of the training set, built from the data directory above
train_csv_path = dpath + "RentListingInquries_FE_train.csv"
train = pd.read_csv(train_csv_path)

## 准备数据

In [3]:
# Separate the target column from the feature columns.
# `train` itself is left untouched, so this cell can be re-run on the same
# kernel without raising KeyError for an already-dropped 'interest_level'.
y_train = train['interest_level']
X_train = train.drop(['interest_level'], axis=1)

In [4]:
# Use xgboost's built-in cross-validation (xgb.cv) to search the number of
# boosting rounds quickly; GridSearchCV can only try a finite list of values.
def modelfit(alg, X_train, y_train, cv_folds=3, early_stopping_rounds=10):
    """Pick n_estimators by cross-validation, then refit `alg` on all the data.

    Steps:
      1. Run ``xgb.cv`` with the estimator's parameters, using at most the
         estimator's current ``n_estimators`` boosting rounds and early
         stopping on the 'mlogloss' metric.
      2. Write the cross-validation result to '6_nestimators.csv'.
      3. Set ``n_estimators`` on `alg` to the number of rows in the CV result
         and fit `alg` on the full training data.
      4. Print the log loss of the fitted model on the training data.

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict_proba`` (e.g. an ``XGBClassifier``). It is
        modified in place.
    X_train : training features.
    y_train : training labels.
    cv_folds : int or splitter / list of index pairs, default 3
        An integer is forwarded to ``xgb.cv`` as ``nfold``; any other value is
        forwarded as ``folds``.
    early_stopping_rounds : int, default 10
        Forwarded to ``xgb.cv``.

    Returns
    -------
    None
    """
    xgb_param = alg.get_xgb_params()
    # The number of target classes is fixed at 3 here.
    xgb_param['num_class'] = 3

    # Call xgboost directly instead of going through the sklearn wrapper class.
    xgtrain = xgb.DMatrix(X_train, label=y_train)

    # xgb.cv takes a fold *count* through `nfold`; its `folds` argument is for
    # a splitter object or explicit (train_idx, test_idx) pairs, so an integer
    # must not be passed there.
    if isinstance(cv_folds, (int, np.integer)):
        fold_kwargs = {'nfold': int(cv_folds)}
    else:
        fold_kwargs = {'folds': cv_folds}

    cvresult = xgb.cv(xgb_param, xgtrain,
                      num_boost_round=alg.get_params()['n_estimators'],
                      metrics='mlogloss',
                      early_stopping_rounds=early_stopping_rounds,
                      **fold_kwargs)

    cvresult.to_csv('6_nestimators.csv', index_label='n_estimators')

    # Best n_estimators: the number of rounds kept in the CV result.
    n_estimators = cvresult.shape[0]

    # Train the model with the n_estimators selected by cross-validation.
    alg.set_params(n_estimators=n_estimators)
    # NOTE(review): newer xgboost releases appear to deprecate passing
    # eval_metric to fit() - confirm against the installed version.
    alg.fit(X_train, y_train, eval_metric='mlogloss')

    # Predict on the training set.
    train_predprob = alg.predict_proba(X_train)
    logloss = log_loss(y_train, train_predprob)

    # Print the model report.
    print ('logloss of train is:', logloss)

In [5]:
# Hyper-parameters for the final model, collected in one place.
xgb6_params = dict(
    learning_rate=0.02,
    # A large value is fine here: the CV step inside modelfit replaces it
    # with the number of rounds it selects.
    n_estimators=103,
    max_depth=6,
    min_child_weight=3,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    reg_alpha=2,
    reg_lambda=2,
    objective='multi:softprob',
    seed=3,
)
xgb6 = XGBClassifier(**xgb6_params)

# Cross-validate the number of rounds, refit on all training data and report
# the training log loss.
modelfit(xgb6, X_train, y_train)

logloss of train is: 0.6610158368468666


## 保存模型，供测试使用

In [7]:
# Save the trained model to disk.
import pickle

# The context manager closes (and flushes) the file before any later cell
# tries to read it back.
with open("xgb_model.pkl", 'wb') as model_file:
    pickle.dump(xgb6, model_file)

In [8]:
# Reload the saved model and check that it reproduces the training log loss.
import pickle

# The loaded model is bound to `xgb_loaded`, not `xgb`: `xgb` is the alias of
# the xgboost module, and rebinding it would break xgb.DMatrix / xgb.cv in
# modelfit on any later or repeated call.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("xgb_model.pkl", 'rb') as model_file:
    xgb_loaded = pickle.load(model_file)

train_predprob = xgb_loaded.predict_proba(X_train)
logloss = log_loss(y_train, train_predprob)

#Print model report:
print ('logloss of train is:', logloss)

logloss of train is: 0.6610158368468666
