In [58]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from xgboost import plot_importance
%matplotlib inline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# 1.训练
## 1.1数据预处理
删除fnlwgt以及训练集中的`?`项, 并进行one-hot-encode, 对于test中的nan中默认忽略（get_dummies中dummy_na参数设置为False）

In [5]:
def one_hot_encoding(df, column_name):
    select = df[column_name] 
    one_hot_encode = pd.get_dummies(select)
    df = pd.concat([df, one_hot_encode], axis=1)
    df = df.drop(column_name, axis=1)
    return df

In [6]:
def score(yhat, y):
    return (yhat == y).sum() / len(yhat)

In [7]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
train_and_test = [train, test]

In [8]:
continuous = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

In [9]:
for dframe in train_and_test:
    dframe = dframe.replace(' ?', np.NAN)
    dframe = dframe.drop('fnlwgt', axis=1)
    if (dframe.shape[0] > 30000):
        dframe = dframe.dropna()    
    discrete_feature = list(dframe.columns)
    for i in continuous:
        discrete_feature.remove(i)
        
    for i in discrete_feature:
        dframe = one_hot_encoding(dframe, i)
    if (dframe.shape[0] > 30000):
        train = dframe
    else:
        test = dframe

## 1.2 找出训练集
找出train_x, train_y, test_x并对其正规化

In [10]:
train_x = train.iloc[:, :-2]
train_y = train.iloc[:, -1] ## >50K
test_x = test

In [11]:
for i in continuous:
    temp_train = train_x[i]
    temp_test = test_x[i]
    train_x[i] = (temp_train - np.mean(temp_train)) / np.std(temp_train)
    test_x[i] = (temp_test - np.mean(temp_test)) / np.std(temp_test)

在train_x中选取test_x中出现的项

In [12]:
train_X = train_x.loc[:,list(test_x.columns)]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [13]:
XGB = XGBClassifier()
XGB.fit(train_X, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

# 2.测试

In [73]:
yhat = XGB.predict(test_x)
ans = pd.read_csv('correct_answer.csv')
y = ans.label
score(yhat, y)

  if diff:


0.865118850193477

# 3.改进

根据importance我们发现只有那些continous数据跟最后的收入关系较大

In [46]:
train_columns = np.array(train_x.columns)
select_columns = train_columns[XGB.feature_importances_ > 0.1]
select_columns

array(['age', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'], dtype=object)

In [52]:
train_X = train_x.loc[:, select_columns]
train_X.head()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
0,0.042796,1.128918,0.146092,-0.218586,-0.077734
1,0.880288,1.128918,-0.147445,-0.218586,-2.331531
2,-0.03334,-0.439738,-0.147445,-0.218586,-0.077734
3,1.108695,-1.224066,-0.147445,-0.218586,-0.077734
4,-0.794697,1.128918,-0.147445,-0.218586,-0.077734


In [56]:
test_X = test_x.loc[:, select_columns]
test_X.head()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
0,-0.994129,-1.196864,-0.142662,-0.218062,-0.031432
1,-0.055417,-0.417886,-0.142662,-0.218062,0.769918
2,-0.777503,0.750582,-0.142662,-0.218062,-0.031432
3,0.377835,-0.028397,0.871091,-0.218062,-0.031432
4,-1.49959,-0.028397,-0.142662,-0.218062,-0.832781


In [57]:
XGB = XGBClassifier()
XGB.fit(train_X, train_y)
yhat = XGB.predict(test_X)
score(yhat, y)

  if diff:


0.831275720164609

这里所得到的结果反而变差了， 但是计算速度得到了提升， 所以我们采用一开始的模型，当然我们可以考虑去掉那些ｉｍｐｏｒｔａｎｃｅ＝０的项，这样得到的score应该会稍微高一点，同时计算效率也会有一定保证。

# 4.调参

In [63]:
train_x = train.iloc[:, :-2]
train_y = train.iloc[:, -1] ## >50K
test_x = test

for i in continuous:
    temp_train = train_x[i]
    temp_test = test_x[i]
    train_x[i] = (temp_train - np.mean(temp_train)) / np.std(temp_train)
    test_x[i] = (temp_test - np.mean(temp_test)) / np.std(temp_test)
    
train_X = train_x.loc[:,np.array(test_x.columns)]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [67]:
model = XGBClassifier()
learning_rate = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2]
param_dict = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_dict, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(train_X, train_y)

In [None]:
XGB = XGBClassifier(gamma=0.1, max_depth=5, min_child_weight=1, subsample=0.8, n_estimators=1000)
XGB.fit(train_X, train_y)
yhat = XGB.predict(test_x)
score(yhat, y)