In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.tree import DecisionTreeClassifier
try:
    # train_test_split lives in sklearn.model_selection on current scikit-learn;
    # the legacy sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Record the start time so the total run time can be reported at the end
import time 
start_time = time.time()



In [2]:
# Switch to the project root so the relative './Credit/...' paths resolve.
# The location can be overridden with the KAGGLE_DIR environment variable;
# the original hard-coded path remains the default.
os.chdir(os.environ.get('KAGGLE_DIR', '/Users/Evan/Kaggle'))

In [3]:
# Load the training and test tables (paths are relative to the working directory).
train_csv_path = './Credit/data_train.csv'
test_csv_path = './Credit/data_test.csv'
data_train = pd.read_csv(train_csv_path)
data_test = pd.read_csv(test_csv_path)

In [4]:
# The same eleven columns are removed from both the training and the test table.
dropped_columns = [
    'Unnamed: 0', 'timestamp_money',
    'sum_total', 'mean_total', 'freq_total',
    'sum_income', 'mean_income', 'freq_income',
    'sum_spend', 'mean_spend', 'freq_spend',
]
data_select_train = data_train.drop(dropped_columns, axis=1)
data_select_trial = data_test.drop(dropped_columns, axis=1)

In [5]:
# Fraction of missing values per retained training column
# (the mean of a boolean mask equals null count / row count).
data_select_train.isnull().mean()

ID                     0.000000
gender                 0.000000
career                 0.000000
education              0.000000
marriage               0.000000
hukou                  0.000000
Label                  0.000000
bill_credit            0.119811
credit_line            0.043564
cash_advance           0.043564
amount_transactions    0.043564
browser1               0.148680
browser2               0.148680
browser3               0.148680
browser4               0.148680
browser5               0.148680
browser6               0.148680
browser7               0.148680
browser8               0.148680
browser9               0.148680
browser10              0.148680
browser11              0.148680
dtype: float64

In [6]:
# Feature matrix: every retained column except the target and the row identifier.
non_feature_columns = ['Label', 'ID']
X = data_select_train.drop(non_feature_columns, axis=1)

In [7]:
# Target vector: the 'Label' column of the training table.
y = data_select_train.loc[:, 'Label']

In [10]:
# Show which columns remain in the test table after the column drop.
data_select_trial.columns

Index(['ID', 'gender', 'career', 'education', 'marriage', 'hukou', 'Label',
       'bill_credit', 'credit_line', 'cash_advance', 'amount_transactions',
       'browser1', 'browser2', 'browser3', 'browser4', 'browser5', 'browser6',
       'browser7', 'browser8', 'browser9', 'browser10', 'browser11'],
      dtype='object')

In [11]:
# Test-set features: strip the identifier and the label column so the layout
# matches the training feature matrix.
excluded_from_features = ['ID', 'Label']
data_select_test = data_select_trial.drop(excluded_from_features, axis=1)

## 缺失值处理

In [12]:
# Encode missing feature values with the -999 sentinel; the DMatrix objects are
# built with missing=-999 so XGBoost treats those cells as missing.
X = X.fillna(-999)
data_select_test = data_select_test.fillna(-999)

# The target must not be sentinel-filled: -999 is not a valid class for the
# binary objective, and DMatrix's `missing` argument only applies to features.
# Fail loudly instead of silently training on corrupted labels.
if y.isnull().any():
    raise ValueError('Label contains missing values; drop or fix those rows before training')

In [13]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split reproducible.
split_seed = 161214
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=split_seed,
)

In [14]:
# Inspect the training labels produced by the split.
y_train

23479    0
38341    0
30204    0
35899    1
41507    0
48672    0
17916    0
6955     0
611      0
1555     0
14519    0
8193     0
14629    0
32161    0
17986    0
29196    0
34300    0
18402    0
30874    0
10978    0
34989    0
38228    0
29343    0
38579    0
14457    0
38575    0
23877    0
13703    0
27941    0
47729    0
        ..
25718    0
30415    0
14016    0
20987    1
50426    1
40166    0
22849    0
17348    0
15771    0
23241    0
23783    0
16563    0
24446    1
55022    0
11861    0
15083    0
4079     0
10734    0
47193    0
35714    0
18054    0
705      0
51155    0
5196     0
43564    0
33385    0
21372    0
18435    0
28786    1
34858    0
Name: Label, dtype: int64

In [15]:
# Wrap each split in a DMatrix, declaring the -999 sentinel as the missing marker.
sentinel = -999
xgb_train = xgb.DMatrix(data=X_train, label=y_train, missing=sentinel)
xgb_val = xgb.DMatrix(data=X_test, label=y_test, missing=sentinel)
xgb_test = xgb.DMatrix(data=data_select_test, missing=sentinel)

In [16]:
# Booster configuration.
params = {
    'max_depth': 4,
    'eta': 0.5,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'subsample': 0.7,          # random row subsampling of the training instances
    'gamma': 0.1,              # controls post-pruning; larger is more conservative (typically 0.1-0.2)
    'lambda': 2,               # L2 regularisation weight; larger values make overfitting less likely
    'colsample_bytree': 0.7,   # column subsampling when building each tree
    'min_child_weight': 3,
    'silent': 1,
    'seed': 1000,
    'nthread': 7,              # number of CPU threads
}

# xgb.train is handed the parameters as a list of (name, value) pairs.
plst = [(key, value) for key, value in params.items()]

# Upper bound on the number of boosting iterations.
num_rounds = 5000

# Datasets whose metric is reported after every boosting round.
watchlist = [(xgb_train, 'train'), (xgb_val, 'eval')]

# Train the model and save it.
# With a large round budget, early_stopping_rounds ends training once the
# monitored metric has not improved for that many consecutive rounds.
model = xgb.train(
    params=plst,
    dtrain=xgb_train,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=100,
)
model.save_model('./xgb.model')  # persist the trained booster
print("best best_ntree_limit",model.best_ntree_limit)

[0]	train-auc:0.631462	eval-auc:0.615938
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 100 rounds.
[1]	train-auc:0.660177	eval-auc:0.642262
[2]	train-auc:0.666315	eval-auc:0.647232
[3]	train-auc:0.672387	eval-auc:0.656745
[4]	train-auc:0.679589	eval-auc:0.660701
[5]	train-auc:0.683187	eval-auc:0.664407
[6]	train-auc:0.688923	eval-auc:0.666429
[7]	train-auc:0.691904	eval-auc:0.667933
[8]	train-auc:0.699098	eval-auc:0.675054
[9]	train-auc:0.708072	eval-auc:0.680851
[10]	train-auc:0.710892	eval-auc:0.679471
[11]	train-auc:0.71304	eval-auc:0.677868
[12]	train-auc:0.716623	eval-auc:0.677524
[13]	train-auc:0.722207	eval-auc:0.681345
[14]	train-auc:0.724161	eval-auc:0.681602
[15]	train-auc:0.726012	eval-auc:0.681239
[16]	train-auc:0.728269	eval-auc:0.681663
[17]	train-auc:0.730708	eval-auc:0.683752
[18]	train-auc:0.733924	eval-auc:0.682959
[19]	train-auc:0.734907	eval-auc:0.683601
[20]	train-auc:0.737499	eval-

In [21]:
# Score the test set using only the trees up to the best early-stopping iteration.
best_limit = model.best_ntree_limit
preds = model.predict(xgb_test, ntree_limit=best_limit)

In [None]:
preds = model.predict(xgb_test,ntree_limit=model.best_ntree_limit)

# Write (row number, probability) pairs. A single '%d' format is applied to
# every column and would truncate the predicted probabilities to integers, so
# the row number and the probability get separate formats.
# NOTE(review): the 'ImageId,Label' header and the 1..N row numbers are kept
# as in the original; confirm they match the expected submission layout.
row_numbers = np.arange(1, len(data_select_test) + 1)
np.savetxt(
    'xgb_submission.csv',
    np.c_[row_numbers, preds],
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt=['%d', '%.6f'],
)

# Report the elapsed run time
cost_time = time.time()-start_time
print("xgboost success!",'\n',"cost time:",cost_time,"(s)......")

In [76]:
# Stack the test IDs and the predictions as two rows, then transpose so each
# becomes a column of the submission frame.
id_row = data_select_trial['ID']
prediction_row = pd.Series(preds)
data_submit = pd.DataFrame([id_row, prediction_row]).transpose()

In [81]:
# Cast the ID column to integers.
data_submit = data_submit.assign(ID=data_submit['ID'].astype('int'))

In [84]:
# Rename the two columns to the names used in the submission file.
submission_columns = ['userid', 'probability']
data_submit.columns = submission_columns

In [88]:
# Write the final submission without the index column. The destination can be
# overridden with the SUBMIT_PATH environment variable; the original hard-coded
# path remains the default.
submit_path = os.environ.get('SUBMIT_PATH', '/Users/Evan/Desktop/submit.csv')
data_submit.to_csv(submit_path, index=False)