In [57]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

In [58]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [59]:
from scipy.stats import randint as randint
from scipy.stats import uniform

try:
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import StratifiedKFold
except ImportError:
    from sklearn.cross_validation import GridSearchCV
    from sklearn.cross_validation import RandomizedSearchCV
    from sklearn.cross_validation import StratifiedKFold


RND_SEED = 123

In [60]:
from sklearn.metrics import roc_auc_score, roc_curve

In [61]:
#"Считаем" данные из файла в dataFrame:
data_frame_train = pd.read_csv('train.csv')

In [62]:
#Размер таблицы:
data_frame_train.shape

(24712, 22)

In [63]:
#Информация о таблице
data_frame_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24712 entries, 0 to 24711
Data columns (total 22 columns):
_id               24712 non-null object
age               24712 non-null int64
job               24712 non-null object
marital           24712 non-null object
education         24712 non-null object
default           24712 non-null object
housing           24712 non-null object
loan              24712 non-null object
contact           24712 non-null object
month             24712 non-null object
day_of_week       24712 non-null object
duration          24712 non-null int64
campaign          24712 non-null int64
pdays             24712 non-null int64
previous          24712 non-null int64
poutcome          24712 non-null object
emp.var.rate      24712 non-null float64
cons.price.idx    24712 non-null float64
cons.conf.idx     24712 non-null float64
euribor3m         24712 non-null float64
nr.employed       24712 non-null float64
target            24712 non-null int64
dtypes: floa

In [64]:
#Содержание таблицы
data_frame_train.head()

Unnamed: 0,_id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,target
0,df7489733b004bbe40d3d37b34f82419,54,technician,married,professional.course,unknown,no,no,cellular,aug,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,905a0b9a5456ee962223033473666be3,36,services,single,basic.4y,no,no,no,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,02d69c07ec6d68e31d641edd45ce61cd,53,blue-collar,married,basic.4y,no,yes,no,cellular,aug,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1,0
3,e492fbe0cccaf67cdb5c0944dbc8a167,34,blue-collar,married,basic.9y,no,yes,yes,telephone,may,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,ac2dc6f5835bdea8e71dd99b65208d07,29,services,single,university.degree,no,yes,no,telephone,aug,...,1,999,0,nonexistent,-1.7,94.027,-38.3,0.89,4991.6,0


In [65]:
#Категориальные признаки:
categorical_columns = [c for c in data_frame_train.columns if data_frame_train[c].dtype.name == 'object']
categorical_columns

['_id',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']

In [66]:
#информация по категориальным признакам:
data_frame_train[categorical_columns].describe()

Unnamed: 0,_id,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
count,24712,24712,24712,24712,24712,24712,24712,24712,24712,24712,24712
unique,24712,12,4,8,3,3,3,2,10,5,3
top,cf21aaa06c046549ceeac62b33f59392,admin.,married,university.degree,no,yes,no,cellular,may,thu,nonexistent
freq,1,6254,14871,7309,19567,12934,20334,15729,8199,5144,21346


In [67]:
#Некоторые статистические данные таблицы:
data_frame_train.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,target
count,24712.0,24712.0,24712.0,24712.0,24712.0,24712.0,24712.0,24712.0,24712.0,24712.0,24712.0
mean,40.030188,258.345298,2.580163,963.558797,0.1719,0.082713,93.576386,-40.516077,3.619397,5166.988111,0.112779
std,10.462301,261.573344,2.776757,184.224885,0.491511,1.5723,0.579548,4.639359,1.737099,72.516221,0.316329
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,0.0
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,0.0
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0,0.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1,0.0
max,98.0,4918.0,43.0,999.0,6.0,1.4,94.767,-26.9,5.045,5228.1,1.0


In [68]:
#Создадим массив целевой функции
df_target = data_frame_train['target']
#Удалим столбец с целевой фукцией из датафрейма:
data_frame_train = data_frame_train.drop(['target'],axis=1)

In [69]:
#Для преобразования категориальных признаков напишем специальную функцию:
def data_to_numbers (inlet_df):
    outlet_df = inlet_df
    #Удалим столбик "_id"
    outlet_df = outlet_df.drop(['_id'],axis=1)
    
    #Для остальных категориальных признаков применяется OneHotEncoding
    outlet_df = pd.get_dummies(outlet_df, columns=['contact','month','education','job','day_of_week','marital',
                                                   'loan','poutcome','default','housing'])
    return outlet_df

In [70]:
#Применим функцию преобразования
data_frame_train = data_to_numbers (data_frame_train)

In [71]:
 
data_frame_train

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,loan_yes,poutcome_failure,poutcome_nonexistent,poutcome_success,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes
0,54,87,1,999,0,1.4,93.444,-36.1,4.963,5228.1,...,0,0,1,0,0,1,0,1,0,0
1,36,291,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,1,0,1,0,0,1,0,0
2,53,182,1,999,0,1.4,93.444,-36.1,4.965,5228.1,...,0,0,1,0,1,0,0,0,0,1
3,34,180,2,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,1,0,1,0,0,0,0,1
4,29,6,1,999,0,-1.7,94.027,-38.3,0.890,4991.6,...,0,0,1,0,1,0,0,0,0,1
5,56,64,2,999,0,1.4,94.465,-41.8,4.961,5228.1,...,0,0,1,0,0,1,0,1,0,0
6,33,312,2,999,0,-1.8,93.876,-40.0,0.685,5008.7,...,1,0,1,0,1,0,0,0,0,1
7,36,20,1,999,1,-0.1,93.200,-42.0,4.120,5195.8,...,0,1,0,0,1,0,0,0,0,1
8,70,585,1,6,3,-1.1,94.601,-49.5,0.987,4963.6,...,0,0,0,1,1,0,0,1,0,0
9,32,196,2,999,1,-1.8,92.893,-46.2,1.327,5099.1,...,0,1,0,0,0,1,0,0,0,1


In [72]:
#Разделяем выборку на тренировочную и тестовую
data_train, data_test, target_train, target_test = train_test_split(data_frame_train, df_target, test_size=0.33, random_state=42)

In [73]:
#Рассмотрим три модели и обучим их с параметрами по умолчанию
#KNN
model_KNN = KNeighborsClassifier()
model_KNN.fit(data_train,target_train)
predict_KNN = model_KNN.predict(data_test)
predict_proba_KNN = model_KNN.predict_proba(data_test)

#DecisionTreeClassifier
model_DTC = DecisionTreeClassifier()
model_DTC.fit(data_train,target_train)
predict_DTC = model_DTC.predict(data_test)
predict_proba_DTC = model_DTC.predict_proba(data_test)

#LogisticRegression
model_LR = LogisticRegression()
model_LR.fit(data_train,target_train)
predict_LR = model_LR.predict(data_test)
predict_proba_LR = model_LR.predict_proba(data_test)

In [74]:
print('roc_auc:')
print('KNN',roc_auc_score(target_test, predict_proba_KNN[:,1]))
print('DTC',roc_auc_score(target_test, predict_proba_DTC[:,1]))
print('LR',roc_auc_score(target_test, predict_proba_LR[:,1]))


roc_auc:
KNN 0.8647871107622145
DTC 0.7257620275877538
LR 0.9323270158124929


In [75]:
#Лучший результат паказала модель LogisticRegression

In [76]:
#Подбираем гиперпараметры
param_grid = {
    'C': randint(1000,10000000),
    'warm_start':[True,False],
    'max_iter':randint(100,10000),
    'class_weight':[None, 'balanced'],
    'dual':[True,False],
    'verbose':randint(0,100)}

cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

model_t = LogisticRegression(random_state=123)
random_search = RandomizedSearchCV(model_t, 
                                   param_distributions=param_grid, 
                                   n_iter=400, 
                                   n_jobs=-1,
                                   cv=cv, 
                                   scoring='roc_auc', 
                                   random_state=123)
random_search.fit(data_frame_train, df_target)

[LibLinear]

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
          error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=400, n_jobs=-1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000000178C03C8>, 'warm_start': [True, False], 'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000000178C0518>, 'class_weight': [None, 'balanced'], 'dual': [True, False], 'verbose': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000000178C0668>},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [77]:
#Оптимальные параметры
random_search.best_params_

{'C': 4122804,
 'class_weight': 'balanced',
 'dual': False,
 'max_iter': 2217,
 'verbose': 10,
 'warm_start': False}

In [78]:
#Результат roc_auc
random_search.best_score_

0.9357942019453268

In [105]:
#LogisticRegression
bestModel = LogisticRegression(C=random_search.best_params_['C'],
                              class_weight=random_search.best_params_['class_weight'],
                              max_iter=random_search.best_params_['max_iter'],
                              n_jobs=-1,
                              tol=0.0000001,
                              solver='newton-cg',
                              warm_start=random_search.best_params_['warm_start'],
                              verbose=random_search.best_params_['verbose'],
                              dual=random_search.best_params_['dual'])
bestModel.fit(data_train,target_train)
predict_bm = bestModel.predict(data_test)
predict_proba_bm = bestModel.predict_proba(data_test)

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   53.8s finished


In [156]:
print('roc_auc_best_model:')
print('LR',roc_auc_score(target_test, predict_proba_bm[:,1]))

roc_auc_best_model:
LR 0.9360744645059997


In [162]:
#Считываем тестовые данные
data_frame_test = pd.read_csv('test.csv')
result = pd.read_csv('sample_submission.csv',sep=',')

In [165]:
result = result.drop(['target'],axis=1)

In [166]:
X = data_to_numbers(data_frame_test)

In [167]:
print(X.shape)

(16476, 63)


In [168]:
predict = bestModel.predict(X)

In [169]:
data_frame_test

Unnamed: 0,_id,age,job,marital,education,default,housing,loan,contact,month,...,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,66810d8e6bf2b41c880a7bc6c8a1e295,57,technician,married,high.school,no,no,yes,cellular,may,...,371,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1
1,ccac3879652b08cb8b44c1920fd93afa,55,unknown,married,unknown,unknown,yes,no,telephone,may,...,285,2,999,0,nonexistent,1.1,93.994,-36.4,4.860,5191.0
2,fcccab4d7a76f70647f015f2c84c2af8,33,blue-collar,married,basic.9y,no,no,no,cellular,may,...,52,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1
3,ed8399278c30678dab739045fa12b440,36,admin.,married,high.school,no,no,no,telephone,jun,...,355,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1
4,1d4d62ac5cabcb48bac7112813f290cb,27,housemaid,married,high.school,no,yes,no,cellular,jul,...,189,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1
5,aba2dec4c5cab88824f36babd24b986f,58,retired,married,professional.course,no,yes,yes,cellular,jul,...,605,1,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1
6,06f318f1dd178e738f675bb88a5adb84,48,services,married,high.school,unknown,yes,no,telephone,may,...,243,1,999,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0
7,d5036f5956e42ee6207296238fc4bc1d,51,admin.,divorced,university.degree,unknown,yes,no,cellular,aug,...,24,7,999,0,nonexistent,1.4,93.444,-36.1,4.962,5228.1
8,fe0cc8933698ad4046ff2b82f65756eb,24,entrepreneur,married,university.degree,no,yes,yes,telephone,jun,...,126,4,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1
9,70190122f4ebf196535e11b33eb95b81,36,technician,divorced,professional.course,no,yes,yes,cellular,jul,...,43,4,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1


In [170]:
result['target'] = predict

In [171]:
result

Unnamed: 0,_id,target
0,66810d8e6bf2b41c880a7bc6c8a1e295,0
1,ccac3879652b08cb8b44c1920fd93afa,0
2,fcccab4d7a76f70647f015f2c84c2af8,0
3,ed8399278c30678dab739045fa12b440,0
4,1d4d62ac5cabcb48bac7112813f290cb,0
5,aba2dec4c5cab88824f36babd24b986f,1
6,06f318f1dd178e738f675bb88a5adb84,0
7,d5036f5956e42ee6207296238fc4bc1d,0
8,fe0cc8933698ad4046ff2b82f65756eb,0
9,70190122f4ebf196535e11b33eb95b81,0


In [172]:
#Записываем результат в файл
result.to_csv('result.csv',sep=',',index=None)