In [37]:
# lib
import numpy as np
import pandas as pd
import scipy as sp

# for plot
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

import statsmodels.formula.api as smf
import statsmodels.api as sm

# 多層
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler

# Show floating-point values with 3 decimal places in IPython output.
%precision 3

# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [4]:

# Load the iris dataset; later cells read its .data, .target,
# .feature_names and .target_names attributes.
iris = load_iris()


In [5]:

# Inspect the names of the feature columns.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [6]:

# Inspect the names of the class labels.
iris.target_names


array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [10]:

# Restrict the data to rows 50..149 and to the first two feature columns
# (sepal length and sepal width), giving a two-class, two-feature problem.
rows = slice(50, 150)
X = iris.data[rows, :2]
y = iris.target[rows]

# Sanity-check the resulting shapes
for arr in (X, y):
    print(arr.shape)

(100, 2)
(100,)


In [16]:

# Hold out part of the data for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, random_state=2)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the training portion
print(X_train.shape, y_train.shape, sep="\n")

(75, 2)
(75,)


In [17]:

# Peek at the first ten training labels
y_train[:10]


array([1, 1, 2, 2, 2, 2, 1, 1, 1, 1])

In [19]:

# Training features as a DataFrame with formula-friendly column names
feature_cols = ["sepal_len", "sepal_wid"]
X_train_df = pd.DataFrame(data=X_train, columns=feature_cols)

# Response recoded from labels 1/2 to 0/1 for the binomial model
species_codes = y_train - 1
y_train_df = pd.DataFrame(data={"species": species_codes})

# Response first, then the features, side by side
iris_train_df = pd.concat(objs=[y_train_df, X_train_df], axis="columns")

print(iris_train_df.head())

   species  sepal_len  sepal_wid
0        0        5.7        2.8
1        0        6.6        3.0
2        1        6.1        3.0
3        1        6.7        3.3
4        1        6.8        3.0


In [20]:

# Model fitting: four candidate logistic regressions compared by AIC
def _fit_binomial_glm(formula):
    """Fit a binomial GLM for `formula` on iris_train_df and return the fitted result."""
    model = smf.glm(
        formula=formula,
        data=iris_train_df,
        family=sm.families.Binomial())
    return model.fit()


# Model with both explanatory variables
logi_mod_full = _fit_binomial_glm("species ~ sepal_len + sepal_wid")

# Sepal length only
logi_mod_len = _fit_binomial_glm("species ~ sepal_len")

# Sepal width only
logi_mod_wid = _fit_binomial_glm("species ~ sepal_wid")

# Null model (intercept only)
logi_mod_null = _fit_binomial_glm("species ~ 1")

# AIC comparison (lower is better)
for label, fitted in (
        ("full", logi_mod_full),
        ("len ", logi_mod_len),
        ("wid ", logi_mod_wid),
        ("null", logi_mod_null)):
    print(label, fitted.aic.round(3))

full 76.813
len  76.234
wid  92.768
null 105.318


In [25]:

# Full fit summary of the sepal-length-only model.
logi_mod_len.summary()

0,1,2,3
Dep. Variable:,species,No. Observations:,75.0
Model:,GLM,Df Residuals:,73.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-36.117
Date:,"Sat, 02 Jun 2018",Deviance:,72.234
Time:,10:04:26,Pearson chi2:,63.8
No. Iterations:,5,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-16.4152,4.000,-4.104,0.000,-24.256,-8.575
sepal_len,2.6478,0.639,4.142,0.000,1.395,3.901


In [26]:

# Show only the second table of the summary (the coefficient estimates).
logi_mod_len.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-16.4152,4.000,-4.104,0.000,-24.256,-8.575
sepal_len,2.6478,0.639,4.142,0.000,1.395,3.901


In [30]:
# Prediction accuracy of the sepal-length-only logistic model

X_test_df = pd.DataFrame(
    X_test, columns = ["sepal_len", "sepal_wid"])

# Fitted probabilities (training data) and predicted probabilities (test data),
# rounded to the nearest class code (0 or 1)
logi_fit = logi_mod_len.fittedvalues.round(0)
logi_pred = logi_mod_len.predict(X_test_df).round(0)

# Number of correct classifications. The raw labels are 1/2, so shift them
# to 0/1 to match the model's coding. np.sum is used instead of sp.sum:
# the NumPy aliases in the root scipy namespace are deprecated.
true_train = np.sum(logi_fit == (y_train - 1))
true_test = np.sum(logi_pred == (y_test - 1))

# Hit rate (accuracy)
result_train = true_train / len(y_train)
result_test = true_test / len(y_test)

# Report both rates with the same rounding
print("訓練データの的中率　：", result_train.round(2))
print("テストデータの的中率：", result_test.round(2))


訓練データの的中率　： 0.75
テストデータの的中率： 0.68


### ニューラルネットワーク (NN)

##### 標準化

In [33]:

# Learn the scaling parameters from the training data only, then apply the
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test))

In [35]:

# Per-column standard deviation of the scaled training features.
# np.std replaces sp.std: the NumPy aliases in the root scipy namespace are deprecated.
np.std(X_train_scaled, axis=0)

array([1., 1.])

In [36]:

# Per-column standard deviation of the scaled test features. The scaler was
# fit on the training data, so these need not equal 1.
# np.std replaces sp.std: the NumPy aliases in the root scipy namespace are deprecated.
np.std(X_test_scaled, axis=0)

array([0.74 , 0.679])

In [48]:

# Multi-layer perceptron with two hidden layers of 100 units each;
# the fixed random_state keeps the fit reproducible.
nnet_params = dict(
    hidden_layer_sizes=(100, 100),
    alpha=0.07,
    max_iter=10000,
    random_state=0,
)
nnet = MLPClassifier(**nnet_params)

nnet.fit(X_train_scaled, y_train)

# Score on the training data first, then on the test data
for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(nnet.score(features, labels))

0.8933333333333333
0.68
