# XGBoost Parameter Tuning for Rent Listing Inquiries

# 第六步：再次cv寻找最佳的参数n_estimators

首先 import 必要的模块

In [2]:
from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd 
import numpy as np

import math

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

## 读取数据

In [3]:
# Directory that holds the feature-engineered CSV files
dpath = './data/'
train_csv = dpath + "RentListingInquries_FE_train.csv"
test_csv = dpath + "RentListingInquries_FE_test.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [4]:
# Features: every column except the target, as a plain ndarray
X = np.array(train.drop(["interest_level"], axis=1))
# Labels: the target column, kept as a pandas Series
Y = train['interest_level']

In [5]:
# The whole training set is used for cross validation; no hold-out split is made here
X_train, y_train = X, Y

In [8]:
# Inspect the row index of the test frame (shows how many test rows there are)
test.index

RangeIndex(start=0, stop=74659, step=1)

各类样本不均衡，交叉验证是采用StratifiedKFold，在每折采样时各类样本按比例采样

In [9]:
# Stratified 5-fold splitter; shuffling uses a fixed seed so the folds are repeatable
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=3,
)

使用已经调整好的参数，再次调整弱学习器的数目

In [10]:
# Use xgboost's built-in cross validation (xgb.cv) to search n_estimators quickly:
# it evaluates every boosting round up to the upper bound in a single run,
# whereas GridSearchCV can only cross-validate a finite list of candidate values.
def modelfit(alg, X_train,y_train, cv_folds=None, early_stopping_rounds=10):
    """Choose n_estimators with xgb.cv, refit ``alg`` with it and report the train log loss.

    Parameters
    ----------
    alg : XGBClassifier-like estimator
        Must provide get_xgb_params(), get_params(), set_params(), fit() and
        predict_proba(). Its current ``n_estimators`` is used as the upper bound
        on boosting rounds; it is overwritten with the selected value.
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels (integer class ids).
    cv_folds : optional
        Passed to ``xgb.cv`` as ``folds`` (e.g. a StratifiedKFold instance).
    early_stopping_rounds : int
        Passed through to ``xgb.cv``.

    Side effects
    ------------
    Writes the cross-validation history to '1_nestimators.csv', mutates ``alg``
    (sets n_estimators and fits it), and prints the selected n_estimators and
    the log loss on the training data. Returns None.
    """
    xgb_param = alg.get_xgb_params()
    # Derive the number of classes from the labels instead of hard-coding 3,
    # so the helper keeps working when the label set changes.
    xgb_param['num_class'] = len(np.unique(y_train))

    # Call the native xgboost API directly rather than the sklearn wrapper
    xgtrain = xgb.DMatrix(X_train, label = y_train)

    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], folds =cv_folds,
             metrics='mlogloss', early_stopping_rounds=early_stopping_rounds,)

    cvresult.to_csv('1_nestimators.csv', index_label = 'n_estimators')

    # Selected n_estimators = number of rows in the cv history
    # (xgb.cv is expected to truncate the history at the best round when early stopping fires)
    n_estimators = cvresult.shape[0]

    # Refit the estimator on the full training data with the selected n_estimators
    alg.set_params(n_estimators = n_estimators)
    alg.fit(X_train,y_train,eval_metric='mlogloss')
    print(n_estimators)

    # Predict on the training set (this is a training-set score, not a validation score)
    train_predprob = alg.predict_proba(X_train)
    logloss = log_loss(y_train, train_predprob)

    # Print model report
    print ("logloss of train :" )
    print(logloss)

In [12]:
# Hyper-parameters carried over from the earlier tuning steps.
# n_estimators is only an upper bound here: a large value is fine because
# modelfit() overwrites it with the round count selected by cross validation.
xgb1_params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=6,
    min_child_weight=7,
    # gamma is intentionally not set (library default is used)
    reg_alpha=0.1,
    reg_lambda=0.5,
    subsample=0.8,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='multi:softprob',
    nthread=-1,
    seed=3,
)
xgb1 = XGBClassifier(**xgb1_params)

modelfit(xgb1, X_train, y_train, cv_folds=kfold)

250
logloss of train :
0.47813485771025255


### 选择最佳 n_estimators=250

In [14]:
# Show the per-feature importance scores of the fitted classifier
xgb1.feature_importances_

array([9.7049130e-03, 9.8839710e-03, 7.4273027e-02, 6.9259420e-02,
       9.0352386e-02, 1.1925226e-02, 1.1459677e-02, 0.0000000e+00,
       5.8730841e-03, 4.3117031e-02, 2.1307835e-02, 5.4827388e-02,
       4.5731269e-02, 8.4156999e-03, 3.2230339e-03, 6.1595761e-03,
       1.8263859e-03, 7.7710929e-03, 6.2670102e-03, 6.0521415e-03,
       3.3662799e-03, 3.9034523e-03, 4.6053573e-02, 1.1312849e-01,
       4.4513680e-02, 4.3940697e-02, 5.0171897e-02, 1.0743446e-04,
       3.5811488e-05, 1.4324595e-04, 7.1622977e-05, 7.1622977e-05,
       2.1128778e-03, 8.5947569e-04, 3.5811488e-05, 9.9197822e-03,
       3.9392637e-04, 2.5068043e-04, 1.1459676e-03, 1.7905745e-04,
       0.0000000e+00, 7.1622977e-05, 7.1622977e-05, 2.5426156e-03,
       0.0000000e+00, 1.0743446e-04, 0.0000000e+00, 1.0743446e-04,
       0.0000000e+00, 3.5811489e-04, 2.1486892e-04, 0.0000000e+00,
       3.9392637e-04, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 8.1650196e-03, 0.0000000e+00, 0.0000000e

In [15]:
# Sanity check: confirm the container type of the training features
type(X_train)

numpy.ndarray

In [4]:
# Convert the test data to a plain ndarray (presumably to match X_train, which is also an ndarray).
# NOTE: this rebinds `test`; DataFrame-only attributes such as `test.index` are unavailable afterwards.
test=np.array(test)
type(test)

numpy.ndarray

In [17]:
# Preview the row-number range that is later written out as the "id" column
range(0,test.shape[0])

range(0, 74659)

### 在测试集上预测，并保存预测结果为：概率+id

In [18]:
# Class probabilities for every test row; one column per class, in the order high / medium / low
preds = xgb1.predict_proba(test)
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
# The id written out is simply the row position in the test set
out_df["id"] = range(0, test.shape[0])
out_df.to_csv("xgb_RentListingInqueries.csv", index=False)

### 将训练好的模型导出，方便后面使用

In [20]:
import pickle

# Save the fitted classifier to disk so it can be reused later without retraining
filename = 'finalized_model.sav'
# A context manager guarantees the file is flushed and closed even if dump() raises
with open(filename, 'wb') as model_file:
    pickle.dump(xgb1, model_file)

# To restore the model later, open the same file in 'rb' mode and pass the handle to pickle.load()


In [5]:
import pickle

# Load the previously saved classifier from disk
filename = 'finalized_model.sav'
# NOTE: pickle.load can execute arbitrary code; only load files that this notebook produced or that you trust
# A context manager closes the file handle as soon as the model has been read
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [13]:
# Translate the integer class predicted by the model into its interest-level name
y_map = {0: 'high', 1: 'medium', 2: 'low'}
preds = [y_map[label] for label in loaded_model.predict(test)]

out_df = pd.DataFrame(preds, columns=["preds"])
# The id written out is simply the row position in the test set
out_df["id"] = range(0, test.shape[0])
out_df.to_csv("xgb_result_RentListingInqueries.csv", index=False)

  if diff:
