In [0]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

## 学習

In [4]:
# Load the training data from the working directory; the trailing expression
# previews the first rows as the cell output.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,52,services,married,secondary,no,108,yes,no,unknown,15,may,543,12,-1,0,unknown,no
1,35,admin.,married,unknown,no,1055,no,no,cellular,2,mar,59,2,-1,0,unknown,no
2,37,blue-collar,married,secondary,no,32,yes,no,unknown,20,may,146,2,-1,0,unknown,no
3,57,unemployed,married,primary,no,2743,no,no,cellular,29,jan,89,1,-1,0,unknown,no
4,42,self-employed,married,tertiary,no,6383,no,no,cellular,27,aug,111,5,-1,0,unknown,no


## 前処理

In [0]:
import datetime

In [0]:
def get_features(df):
    """Build the feature matrix used by the models from a raw data frame.

    The frame must provide the columns ``age``, ``balance``, ``duration``,
    ``campaign``, ``pdays``, ``previous``, ``day``, ``month`` and the
    categorical columns ``job``, ``marital``, ``education``, ``default``,
    ``housing``, ``loan``, ``contact`` and ``poutcome``.

    ``df`` is not modified: the month number and day-of-year are computed
    locally, so the function can be called repeatedly on the same frame.

    Args:
        df: pandas DataFrame with the columns listed above. ``month`` may
            hold three-letter lowercase abbreviations (``'jan'`` .. ``'dec'``)
            or values that are already month numbers.

    Returns:
        A 2-D numpy array with one row per input row. The first seven columns
        are age, balance, duration, campaign, pdays, previous and the
        day-of-year; the remaining columns are the one-hot encoding of the
        categorical columns.
    """
    month_numbers = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12,
    }
    # Values that are not abbreviations (e.g. already numeric) pass through.
    months = [month_numbers.get(month, month) for month in df['month'].values]

    # The reference year 2000 is a leap year, so 29 Feb is a valid date.
    # Day-of-year is kept as an integer so the matrix stays numeric.
    yday = [
        datetime.datetime(2000, month, day).timetuple().tm_yday
        for month, day in zip(months, df['day'].values)
    ]

    # Continuous features
    features_ana = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
    # Categorical features
    features_dig = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']

    # `assign` returns a new frame with yday appended as the last numeric column.
    numeric = df[features_ana].assign(yday=yday)
    # NOTE(review): get_dummies derives its columns from the categories present
    # in `df`; frames with different category sets yield different layouts, so
    # confirm train and test share the same categories.
    X = np.concatenate(
        [numeric.values, pd.get_dummies(df[features_dig]).values],
        axis=1)
    return X

def get_label(df):
    """Return the binary target vector taken from the ``y`` column.

    ``'yes'`` is encoded as 1 and ``'no'`` as 0.

    Args:
        df: pandas DataFrame with a ``y`` column holding ``'yes'`` / ``'no'``.

    Returns:
        1-D numpy integer array with one entry per row of ``df``.

    Raises:
        ValueError: if ``y`` contains anything other than ``'yes'`` or ``'no'``
            (including missing values).
    """
    label_codes = {'yes': 1, 'no': 0}
    # `map` leaves NaN for unknown labels, which makes bad input detectable
    # instead of letting it flow into the model as a mixed-type array.
    labels = df['y'].map(label_codes)
    invalid = labels.isna()
    if invalid.any():
        unexpected = sorted(set(df['y'][invalid].astype(str)))
        raise ValueError(
            "unexpected label value(s) in column 'y': {}".format(unexpected))
    # Explicit cast guarantees an integer target regardless of pandas version.
    return labels.astype(int).values

In [0]:
# Build the training feature matrix and label vector from the loaded frame.
X, y = get_features(df), get_label(df)

### グリッドサーチ

In [0]:
# Tune n_estimators and learning_rate with 5-fold cross-validation.
# The search uses the same estimator family (classifier) and the same metric
# (accuracy) as the final model, so the selected parameters are meaningful
# for it.
mod1J1 = GradientBoostingClassifier(
    min_samples_split=5,
    min_samples_leaf=50,
    max_depth=5,
    max_features='sqrt',
    subsample=0.8,
    random_state=0)  # fixed seed: subsample < 1 makes fitting stochastic
paramJ1_1 = {
    'n_estimators': list(range(20, 101, 10)),
    # 0.05, 0.06, ..., 0.19 built from integer steps; a float-step arange
    # gives inexact values and can overshoot the stop value.
    'learning_rate': [round(0.05 + 0.01 * step, 2) for step in range(15)],
}

gsearch1 = GridSearchCV(estimator=mod1J1,
                        param_grid=paramJ1_1,
                        cv=5,
                        n_jobs=4,
                        scoring='accuracy')
gsearch1.fit(X, y)
# Print the combination with the highest mean cross-validated accuracy.
print(gsearch1.best_params_)

In [8]:
# Final classifier: hyperparameters are fixed here and the seed is pinned so
# that refitting gives the same model.
clf_params = {
    'n_estimators': 90,
    'learning_rate': 0.13,
    'min_samples_split': 6,
    'max_depth': 6,
    'min_samples_leaf': 61,
    'max_features': 9,
    'subsample': 0.8,
    'random_state': 0,
}
clf = GradientBoostingClassifier(**clf_params)
clf.fit(X, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.13, loss='deviance', max_depth=6,
                           max_features=9, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=61, min_samples_split=6,
                           min_weight_fraction_leaf=0.0, n_estimators=90,
                           n_iter_no_change=None, presort='auto',
                           random_state=0, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [9]:
# Score on the same data the model was fitted on (not a held-out estimate).
clf.score(X, y)

0.9226572292265723

## テスト

In [0]:
# Load the held-out test data from the working directory.
test_df = pd.read_csv('test.csv')

In [0]:
# Apply the same feature and label extraction to the test frame.
test_X, test_y = get_features(test_df), get_label(test_df)

In [12]:
# Test-set score from clf.score (for scikit-learn classifiers this is mean
# accuracy, not ARI as originally noted) -- please report this value.
clf.score(test_X, test_y) 

0.9124281291463954