In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Load the inputs: the labelled training features, the rows to score
# (first CSV column becomes the index) and the training targets
# (file without a header row; its first column is the label).
features = pd.read_csv('train_data.csv')
testfeatures = pd.read_csv('test_data.csv', index_col=0)
targets = pd.read_csv('train_target.csv', header=None).iloc[:, 0]
features.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,26,student,single,high.school,no,no,no,telephone,jun,mon,901,1,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1
1,46,admin.,married,university.degree,no,yes,no,cellular,aug,tue,208,2,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1
2,49,blue-collar,married,basic.4y,unknown,yes,yes,telephone,jun,tue,131,5,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1
3,31,technician,married,university.degree,no,no,no,cellular,jul,tue,404,1,999,0,nonexistent,-2.9,92.469,-33.6,1.044,5076.2
4,42,housemaid,married,university.degree,no,yes,no,telephone,nov,mon,85,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8


In [3]:
#function for one-hot encoding the categorical features

def onehotencode(features, columns=None):
    """Return a new frame with the chosen columns replaced by indicator columns.

    Columns that are not encoded keep their original order and come first.
    The indicator columns follow, grouped per encoded column in selection
    order and named ``<column>_<value>``. ``features`` itself is not modified.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the columns to encode.
    columns : label or iterable of labels, optional
        Columns to encode. When omitted, the columns at positions 1-9 and 14
        are encoded (the layout this notebook was originally written for).

    Returns
    -------
    pandas.DataFrame
        The encoded frame, sharing the index of ``features``.

    Raises
    ------
    IndexError
        If ``columns`` is omitted and ``features`` has fewer than 15 columns.
    KeyError
        If a requested label is not a column of ``features``.
    """
    if columns is None:
        # Backward-compatible default: the historical hard-coded positions.
        default_positions = list(range(1, 10)) + [14]
        names = list(features)
        if len(names) <= max(default_positions):
            raise IndexError(
                'default encoding needs at least %d columns, got %d; '
                'pass columns=[...] explicitly'
                % (max(default_positions) + 1, len(names)))
        columns = [names[i] for i in default_positions]
    elif isinstance(columns, str):
        # A bare label would otherwise be split into single characters.
        columns = [columns]
    else:
        columns = list(columns)

    missing = [name for name in columns if name not in features.columns]
    if missing:
        raise KeyError('columns not found in features: %r' % (missing,))

    if not columns:
        # Nothing to encode: still hand back an independent frame.
        return features.copy()

    # A single call drops the selected columns and appends all indicator
    # blocks in one concatenation, instead of rebuilding the frame per column.
    return pd.get_dummies(features, columns=columns)

In [4]:
# Stack the training and test frames into one frame (tagged by an outer index
# key) so both are encoded in a single pass and transformed uniformly.
alldata = onehotencode(
    pd.concat([features, testfeatures], keys=['features', 'testfeatures'])
)

# Split the encoded frame back into its training and test parts by outer key.
fnum, ftest = alldata.loc['features'], alldata.loc['testfeatures']
ftest.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,49,126,1,999,0,1.4,93.444,-36.1,4.968,5228.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,31,1099,2,999,0,1.4,93.444,-36.1,4.965,5228.1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,36,407,1,999,0,1.4,93.918,-42.7,4.96,5228.1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,26,109,1,999,0,1.4,93.918,-42.7,4.962,5228.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,41,147,1,999,0,1.4,94.465,-41.8,4.961,5228.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# Split the labelled data into a fitting part and a held-out 20% part,
# stratified on the target and with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    fnum,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=0,
)

In [6]:
# Gradient boosting fitted on the training split; the displayed value is the
# ROC AUC of its positive-class probabilities on the held-out split.
m = GradientBoostingClassifier(
    n_estimators=100, loss='exponential', max_depth=4, random_state=0
)
m.fit(X_train, y_train)
roc_auc_score(y_test, m.predict_proba(X_test)[:, 1])

0.9486098980790818

In [8]:
# Refit gradient boosting on all labelled rows and write the predicted
# positive-class probabilities for the test rows.
m = GradientBoostingClassifier(
    n_estimators=100, loss='exponential', max_depth=4, random_state=0
)
m.fit(fnum, targets)

# Scored on the same rows the model was fitted on, i.e. an in-sample figure.
print(roc_auc_score(targets, m.predict_proba(fnum)[:, 1]))

# NOTE(review): the 'Id' column written below is the default positional index
# of this frame, not ftest.index - confirm that matches the expected ids.
ypred = pd.DataFrame({'Prediction': m.predict_proba(ftest)[:, 1]})
ypred.to_csv('out_gbc.csv', index_label='Id')

0.958242392741
