# ３種類のワイン分類


説明変数（特徴量）は13個

Alcohol<br>
Malic acid<br>
Ash<br>
Alcalinity of ash<br>
Magnesium<br>
Total phenols<br>
Flavanoids<br>
Nonflavanoid phenols<br>
Proanthocyanins<br>
Color intensity<br>
Hue<br>
OD280/OD315 of diluted wines<br>
Proline

## 全体の流れ

#### データの読み込み
#### 問題1 データの確認（何件ワインのデータがあるか。）
#### 問題2 相関行列の可視化
#### 問題3 データの分割
#### 問題4 標準化
#### 問題5 モデル作成
#### 問題6 テストデータで正解率の算出

# データの読み込み

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# warning非表示
import warnings
# すべての警告を表示させない設定
warnings.simplefilter('ignore')

In [2]:
from sklearn.datasets import load_wine
wine = load_wine()
df = pd.DataFrame(wine.data , columns=wine.feature_names)

target = pd.DataFrame(wine.target, columns=['target'])

df = pd.concat([df, target], axis=1)
df.head(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0


# 【問題1 データの確認（何件ワインのデータがあるか。）】

In [3]:
 dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [4]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [7]:
    print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [8]:
wine.data.shape

(178, 13)

In [12]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

# 【問題2 相関行列の出力】

ワインの種類を分類するにあたり、どの特徴量が重要なのか

dfに対して、相関係数を出力する関数を利用しましょう。

今回は
X = df[["proline", 'color_intensity']] を利用

In [9]:
X = df[["proline", 'color_intensity']]

In [10]:
y = df.target.values

In [11]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

# 【問題3 データの分割】

In [13]:
x_train, x_test, y_train, y_test = train_test_split(wine.data, y,  test_size = 0.25, random_state=5)

In [14]:
x_train.shape

(133, 13)

In [15]:
x_test.shape

(45, 13)

In [17]:
y_test.shape

(45,)

In [50]:
print(y_train)

[1 7 9 0 7]


# 【問題4 標準化】

In [49]:
# 標準化
sc = StandardScaler()
a = x_train, y_train

# トレーニングデータで用いたscを元にtransform
a = np.random.randint(10, size=(2,5))
X_std = sc.fit_transform(a)
print("平均", X_std.mean())
print("標準偏差", X_std.std())

平均 0.0
標準偏差 0.8944271909999159


# 【問題5 モデル作成】

In [58]:
# ロジスティック回帰でモデル作成

model = LogisticRegression()
model.fit(x_train.reshape(-1, 1), y_train.reshape(-1, 1))

LogisticRegression()

# 【問題6 テストデータで正解率の算出】

In [63]:
# テストデータでモデルの正解率の算出
len(x_test)
pred = model.predict(x_test.reshape(-1, 1))
pred

array([0, 7, 9, 0, 0, 9, 9, 7, 7, 1, 7, 9, 0, 0, 7, 7, 0, 0, 9, 9, 7, 7,
       1, 7, 9, 0, 0, 7, 7, 0, 0, 9, 9, 7, 7, 0, 7, 9, 0, 0, 7, 9, 0, 0,
       9, 9, 7, 7, 1, 7, 7, 0, 0, 9, 7, 0, 0, 7, 7, 7, 7, 1, 7, 7, 0, 0,
       9, 7, 0, 0, 7, 7, 7, 7, 0, 7, 7, 0, 0, 7, 7, 0, 0, 9, 9, 7, 7, 1,
       7, 9, 0, 0, 7, 7, 0, 0, 9, 9, 7, 7, 1, 7, 9, 0, 0, 7, 7, 0, 0, 7,
       7, 7, 9, 7, 7, 9, 0, 0, 7, 9, 0, 0, 9, 9, 7, 7, 9, 7, 9, 0, 0, 1,
       7, 0, 0, 9, 9, 7, 7, 1, 7, 9, 0, 0, 7, 7, 0, 0, 9, 9, 7, 7, 1, 7,
       9, 0, 0, 9, 7, 0, 0, 7, 7, 7, 7, 1, 7, 7, 0, 0, 7, 7, 0, 0, 7, 7,
       7, 7, 7, 7, 9, 0, 0, 9, 9, 0, 0, 7, 7, 7, 7, 1, 7, 7, 0, 0, 7, 9,
       0, 0, 9, 9, 7, 7, 1, 7, 9, 0, 0, 7, 7, 0, 0, 9, 9, 7, 7, 1, 7, 9,
       0, 0, 1, 7, 0, 0, 9, 9, 7, 7, 1, 7, 9, 0, 0, 7, 7, 0, 0, 7, 7, 7,
       7, 9, 7, 9, 0, 0, 9, 9, 0, 0, 7, 7, 7, 7, 1, 7, 9, 0, 0, 7, 7, 0,
       0, 9, 9, 7, 7, 1, 7, 9, 0, 0, 7, 9, 0, 0, 9, 9, 7, 7, 1, 7, 9, 0,
       0, 7, 7, 0, 0, 9, 7, 7, 7, 9, 7, 9, 0, 0, 7,

In [64]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.18.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 3.5 MB/s eta 0:00:01
Installing collected packages: mlxtend
Successfully installed mlxtend-0.18.0


In [65]:
from mlxtend.plotting import plot_decision_regions

In [10]:
# 訓練データの可視化
# plt.figure(figsize= (8, 4)) # サイズの調整
# plot_decision_regions(X_train_std, y_train, model)

In [11]:
# テストデータの可視化
# plt.figure(figsize= (8, 4)) # サイズの調整
# plot_decision_regions(X_test_std, y_test, model)