## 機械学習フロー

In [69]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

## 【問題1】クロスバリデーション

In [70]:
df = pd.read_csv("application_train.csv")

In [71]:
df_s = np.abs(df.corr().loc[: , "TARGET"])
df_s.head()

SK_ID_CURR          0.002108
TARGET              1.000000
CNT_CHILDREN        0.019187
AMT_INCOME_TOTAL    0.003982
AMT_CREDIT          0.030369
Name: TARGET, dtype: float64

In [72]:
df_s.sort_values(ascending = False).head(11)

TARGET                         1.000000
EXT_SOURCE_3                   0.178919
EXT_SOURCE_2                   0.160472
EXT_SOURCE_1                   0.155317
DAYS_BIRTH                     0.078239
REGION_RATING_CLIENT_W_CITY    0.060893
REGION_RATING_CLIENT           0.058899
DAYS_LAST_PHONE_CHANGE         0.055218
DAYS_ID_PUBLISH                0.051457
REG_CITY_NOT_WORK_CITY         0.050994
FLAG_EMP_PHONE                 0.045982
Name: TARGET, dtype: float64

In [78]:
y = df.loc[: , "TARGET"]
x0 = df.loc[: , ["EXT_SOURCE_1", "EXT_SOURCE_2","EXT_SOURCE_3" , "DAYS_BIRTH"]]
X1 = pd.concat([x0 , y] , axis = 1)
X1 = X1.dropna(how = "any")
X = X1.iloc[: , :4].values
y = X1.iloc[: , 4].values

In [74]:
#クロスバリデーション
from sklearn.model_selection import KFold
kf_5 = KFold(n_splits = 5 , random_state=None, shuffle=False)
kf_5.get_n_splits(X)
for train_index , test_index in kf_5.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train , X_test = X[train_index] , X[test_index]
    y_train , y_test = y[train_index] , y[test_index]

TRAIN: [ 21918  21919  21920 ... 109586 109587 109588] TEST: [    0     1     2 ... 21915 21916 21917]
TRAIN: [     0      1      2 ... 109586 109587 109588] TEST: [21918 21919 21920 ... 43833 43834 43835]
TRAIN: [     0      1      2 ... 109586 109587 109588] TEST: [43836 43837 43838 ... 65751 65752 65753]
TRAIN: [     0      1      2 ... 109586 109587 109588] TEST: [65754 65755 65756 ... 87669 87670 87671]
TRAIN: [    0     1     2 ... 87669 87670 87671] TEST: [ 87672  87673  87674 ... 109586 109587 109588]


In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix
lr = LogisticRegression()
lr.fit(X_train , y_train)
y_pred = lr.predict(X_test)
print(lr.score(X_test , y_test))
print(roc_auc_score(y_test , y_pred))

0.9278642149929278
0.5


In [54]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

knc = KNeighborsClassifier()
lr = LogisticRegression()
svc = SVC()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

In [55]:
from sklearn.model_selection import cross_val_score

models = []
models.append(("最近傍法" , knc))
models.append(("ロジスティック回帰" , lr))
#models.append(("SVM" , svc))
models.append(("決定木" , tree))
models.append(("ランダムフォレスト" ,  forest))

results = []
names = []
for name , model in models:
    result = cross_val_score(model , X_train , y_train , cv = kf_5  , scoring = "roc_auc")
    names.append(name)
    results.append(result)
    
for i in range(len(names)):
    print(names[i],results[i].mean())

最近傍法 0.535054784420565
ロジスティック回帰 0.5766310206465302
決定木 0.53736944429676
ランダムフォレスト 0.6376154945882366


## 【問題2】グリッドサーチ

In [56]:
from sklearn.model_selection import GridSearchCV

LogisticRegression(penalty="L1" or 'L2')
parameters = {"C" : [0.1 , 1 , 10 , 100]}

clf = GridSearchCV(estimator = lr , param_grid = parameters , cv = 5 , scoring = "roc_auc")
clf.fit(X , y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=0)

In [57]:
clf.best_score_

0.57462774829578

In [58]:
clf.best_params_

{'C': 0.1}

## 【問題3】Kernelからの調査

In [30]:
#KaggleのKernelから様々なアイデアを見つけ出して、列挙してください。