In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Global plotting defaults: ggplot theme and a 12x8 default figure size.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 8)})

from ipywidgets import interact, IntSlider, FloatSlider

In [2]:
# Load the labelled training set.
df_train = pd.read_csv(
    'kaggle1/train.csv',
    sep=',',
    encoding='utf8',
)

In [3]:
# Preview the first rows, transposed so that each feature is shown on its own line.
df_train.head().transpose()

Unnamed: 0,0,1,2,3,4
_id,df7489733b004bbe40d3d37b34f82419,905a0b9a5456ee962223033473666be3,02d69c07ec6d68e31d641edd45ce61cd,e492fbe0cccaf67cdb5c0944dbc8a167,ac2dc6f5835bdea8e71dd99b65208d07
age,54,36,53,34,29
job,technician,services,blue-collar,blue-collar,services
marital,married,single,married,married,single
education,professional.course,basic.4y,basic.4y,basic.9y,university.degree
default,unknown,no,no,no,no
housing,no,no,yes,yes,yes
loan,no,no,no,yes,no
contact,cellular,telephone,cellular,telephone,telephone
month,aug,may,aug,may,aug


In [4]:
# Print column dtypes and non-null counts of the training frame.
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24712 entries, 0 to 24711
Data columns (total 22 columns):
_id               24712 non-null object
age               24712 non-null int64
job               24712 non-null object
marital           24712 non-null object
education         24712 non-null object
default           24712 non-null object
housing           24712 non-null object
loan              24712 non-null object
contact           24712 non-null object
month             24712 non-null object
day_of_week       24712 non-null object
duration          24712 non-null int64
campaign          24712 non-null int64
pdays             24712 non-null int64
previous          24712 non-null int64
poutcome          24712 non-null object
emp.var.rate      24712 non-null float64
cons.price.idx    24712 non-null float64
cons.conf.idx     24712 non-null float64
euribor3m         24712 non-null float64
nr.employed       24712 non-null float64
target            24712 non-null int64
dtypes: floa

In [5]:
# Remove the _id identifier column from the training frame.
df_train = df_train.drop(columns=['_id'])

In [6]:
# One-hot encode the categorical columns of the training frame.
df_train = pd.get_dummies(
    data=df_train,
    columns=[
        'job', 'marital', 'education', 'default', 'housing',
        'loan', 'contact', 'month', 'day_of_week', 'poutcome',
    ],
)
   
    

In [7]:
# Separate the target variable and hold out 33% of the rows for evaluation.
# The frame handed to the splitter still contains the 'target' column.
from sklearn.model_selection import train_test_split
y = df_train['target']
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=0.33, random_state=1
)

In [8]:
# Strip the target column out of both feature matrices.
X_train = X_train.drop(columns=['target'])
X_test = X_test.drop(columns=['target'])

In [30]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV

In [479]:
# Initialise the three candidate classifiers that are compared below.

# k-nearest neighbours: features are standardised first, since KNN is
# distance-based and the raw columns are on very different scales.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=7))
])

# Decision tree. With max_features set, each split examines a random subset of
# the features, so random_state is pinned to make the fitted tree (and the
# ROC AUC computed from it) reproducible from run to run.
tree = DecisionTreeClassifier(max_depth=4, max_features=44, random_state=1)

# Logistic regression. Only the non-default hyperparameters are spelled out;
# the remaining arguments of the earlier call were library defaults, and
# `multi_class` is omitted because newer scikit-learn releases deprecate it
# (for this binary target the fitted model is the same).
logreg = LogisticRegression(C=100, intercept_scaling=500,
                            solver='liblinear', tol=1e-10)

In [480]:
# Fit the KNN pipeline and the decision tree on the training split.
for estimator in (model, tree):
    estimator.fit(X_train, y_train)
# Fit the logistic regression last so that its repr is the cell's displayed output.
logreg.fit(X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=500, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-10,
          verbose=0, warm_start=False)

In [12]:
# Grid search for the best decision-tree hyperparameters.
# NOTE: 49 depths x 46 feature budgets x 10 folds is a long-running cell.
from sklearn.model_selection import GridSearchCV, cross_val_score
tree_params = {
    'max_depth': range(1, 50),
    'max_features': range(4, 50),
}
# Rank candidates by ROC AUC, the metric this notebook uses to compare models,
# rather than by the classifier's default accuracy score.
tree_grid = GridSearchCV(tree, tree_params, scoring='roc_auc',
                         cv=10, n_jobs=-1, verbose=True)
tree_grid.fit(X_train, y_train)
tree_grid.best_params_, tree_grid.best_score_

In [14]:
# Grid search for the best number of neighbours in the scaled KNN pipeline.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
knn_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])
# The 'knn__' prefix routes the parameter to the pipeline step named 'knn'.
knn_params = {'knn__n_neighbors': range(1, 20)}
# Rank candidates by ROC AUC, the metric this notebook uses to compare models,
# rather than by the classifier's default accuracy score.
knn_grid = GridSearchCV(knn_pipe, knn_params, scoring='roc_auc',
                        cv=5, n_jobs=-1, verbose=True)
knn_grid.fit(X_train, y_train)
knn_grid.best_params_, knn_grid.best_score_

In [None]:
#РАСЧЕТ ПРЕДСКАЗАНИЙ

In [743]:
# Class probabilities from the KNN pipeline on the hold-out split.
y_knn = model.predict_proba(X_test)

In [721]:
# Class probabilities from the decision tree on the hold-out split.
y_tree = tree.predict_proba(X_test)

In [501]:
# Class probabilities from the logistic regression on the hold-out split.
y_logreg = logreg.predict_proba(X_test)

array([[0.72534726, 0.27465274],
       [0.98442598, 0.01557402],
       [0.98088769, 0.01911231],
       ...,
       [0.3040969 , 0.6959031 ],
       [0.94065215, 0.05934785],
       [0.99122615, 0.00877385]])

In [744]:
# Check the dimensions of the KNN probability array.
y_knn.shape

(8155, 2)

In [18]:
from sklearn.metrics import roc_auc_score

In [19]:
# ROC AUC of KNN, scored on the second probability column.
roc_auc_score(y_true=y_test, y_score=y_knn[:, 1])

0.8122196451534013

In [399]:
# ROC AUC of the decision tree, scored on the second probability column.
roc_auc_score(y_true=y_test, y_score=y_tree[:, 1])

0.9133762494181874

In [650]:
# ROC AUC of the logistic regression, scored on the second probability column.
roc_auc_score(y_true=y_test, y_score=y_logreg[:, 1])

0.9353623805804716

In [745]:
#####################################

По результатам ROC AUC можно сделать вывод, что модель логистической регрессии даёт наиболее точные результаты в предсказании, и 
для предсказания по тестовой выборке будет использоваться именно она


######################################

In [None]:
#грузим тестовую выборку без таргета и делаем для неё предсказания

In [560]:
# Load the unlabelled test set for which predictions are produced.
df_test = pd.read_csv(
    'kaggle1/test.csv',
    sep=',',
    encoding='utf8',
)

In [561]:
# Print column dtypes and non-null counts of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16476 entries, 0 to 16475
Data columns (total 21 columns):
_id               16476 non-null object
age               16476 non-null int64
job               16476 non-null object
marital           16476 non-null object
education         16476 non-null object
default           16476 non-null object
housing           16476 non-null object
loan              16476 non-null object
contact           16476 non-null object
month             16476 non-null object
day_of_week       16476 non-null object
duration          16476 non-null int64
campaign          16476 non-null int64
pdays             16476 non-null int64
previous          16476 non-null int64
poutcome          16476 non-null object
emp.var.rate      16476 non-null float64
cons.price.idx    16476 non-null float64
cons.conf.idx     16476 non-null float64
euribor3m         16476 non-null float64
nr.employed       16476 non-null float64
dtypes: float64(5), int64(5), object(11)
memory usa

In [563]:
# Keep the _id identifier column in its own one-column frame so it can be
# re-attached to the predictions. Selecting the column directly, instead of
# dropping every other column by name, does not depend on the exact set of
# feature columns and still works after df_test has been one-hot encoded.
df_id = df_test[['_id']]

In [580]:
# Check the dimensions of the identifier frame.
df_id.shape


(16476, 1)

In [729]:
# One-hot encode the categorical columns of the test frame.
df_test = pd.get_dummies(
    data=df_test,
    columns=[
        'job', 'marital', 'education', 'default', 'housing',
        'loan', 'contact', 'month', 'day_of_week', 'poutcome',
    ],
)

In [486]:
# Check the dimensions of the encoded test frame.
df_test.shape

(16476, 64)

In [732]:
# Build the prediction matrix: drop the _id identifier and align the columns
# with the training features. get_dummies was applied to the train and test
# files independently, so a category present in only one of them would give the
# two frames different (or differently ordered) columns. Reindexing against
# X_train.columns fixes the order, zero-fills dummy columns missing from the
# test file and discards columns the model was never trained on.
df_test_prec = (
    df_test
    .drop(['_id'], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)

# Shape of the encoded test frame itself, which still includes _id.
df_test.shape


(16476, 64)

In [733]:
# Preview only the first rows instead of rendering the whole frame.
df_test_prec.head(10)

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,57,371,1,999,1,-1.8,92.893,-46.2,1.299,5099.1,...,0,0,0,1,0,0,0,1,0,0
1,55,285,2,999,0,1.1,93.994,-36.4,4.860,5191.0,...,0,0,0,0,1,0,0,0,1,0
2,33,52,1,999,1,-1.8,92.893,-46.2,1.313,5099.1,...,0,0,1,0,0,0,0,1,0,0
3,36,355,4,999,0,1.4,94.465,-41.8,4.967,5228.1,...,0,0,1,0,0,0,0,0,1,0
4,27,189,2,999,0,1.4,93.918,-42.7,4.963,5228.1,...,0,0,1,0,0,0,0,0,1,0
5,58,605,1,999,0,1.4,93.918,-42.7,4.962,5228.1,...,0,0,1,0,0,0,0,0,1,0
6,48,243,1,999,0,1.1,93.994,-36.4,4.856,5191.0,...,0,0,0,0,0,0,1,0,1,0
7,51,24,7,999,0,1.4,93.444,-36.1,4.962,5228.1,...,0,0,0,0,1,0,0,0,1,0
8,24,126,4,999,0,1.4,94.465,-41.8,4.962,5228.1,...,0,0,0,0,0,0,1,0,1,0
9,36,43,4,999,0,1.4,93.918,-42.7,4.962,5228.1,...,0,0,0,1,0,0,0,0,1,0


In [735]:
# Predict class labels for the test set with the logistic regression.
X_predict = logreg.predict(X=df_test_prec)

In [736]:
# Wrap the prediction array in a DataFrame.
df_predict = pd.DataFrame(data=X_predict)


In [737]:
# Pull the single prediction column (label 0) out as a Series.
df_pr = df_predict.loc[:, 0]


In [738]:
# Confirm the extracted predictions are a Series.
type(df_pr)

pandas.core.series.Series

In [739]:
# Put the identifiers and the predictions side by side in one frame.
df_final = pd.concat(objs=[df_id, df_pr], axis='columns')

In [740]:
# Confirm the combined result is a DataFrame.
type(df_final)

pandas.core.frame.DataFrame

In [741]:
# Give the columns their final names: the identifier and the predicted target.
df_final.columns = ['_id', 'target']

In [742]:
# Write the result to a CSV file without the index column.
df_final.to_csv(path_or_buf='test_final.csv', index=False)