In [2]:
import pandas as pd
# Download the fish dataset into a DataFrame and preview its first rows.
FISH_CSV_URL = 'https://bit.ly/fish_csv_data'
fish = pd.read_csv(FISH_CSV_URL)
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [3]:
# Show the distinct values of the Species column (the class labels).
print(pd.unique(fish['Species']))

# Feature matrix: the five measurement columns as a NumPy array.
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
fish_input = fish[feature_columns].to_numpy()
print(fish_input[:5])

# Target vector: the species label of every row.
fish_target = fish['Species'].to_numpy()

['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']
[[242.      25.4     30.      11.52     4.02  ]
 [290.      26.3     31.2     12.48     4.3056]
 [340.      26.5     31.1     12.3778   4.6961]
 [363.      29.      33.5     12.73     4.4555]
 [430.      29.      34.      12.444    5.134 ]]


In [4]:
from sklearn.model_selection import train_test_split

# Hold out a test set using scikit-learn's default train/test proportions.
# random_state pins the shuffle so the split -- and every downstream score,
# probability and neighbour lookup -- is identical after Restart & Run All.
# Outputs recorded from an earlier, unseeded run will differ until re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    fish_input, fish_target, random_state=42
)

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler learns its statistics from the training
# split only and then applies that same transformation to both splits, so no
# information from the test set leaks into preprocessing.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

In [6]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that votes among the 3 closest training samples.
knn_params = {'n_neighbors': 3}
kn = KNeighborsClassifier(**knn_params)
# Fit on the standardized training features; the fitted estimator is the cell's output.
kn.fit(X=X_train_scaled, y=y_train)

다중 분류: 타깃 데이터에 3개 이상의 클래스가 포함된 분류 문제(클래스가 2개면 이진 분류).\
이때 문자열 타깃값을 그대로 sklearn 모델에 전달하면 클래스가 알파벳순으로 정렬되어 `classes_` 속성에 저장됨.\
`predict_proba()` 메서드는 이 순서대로 클래스별 확률값을 반환.

In [7]:
import numpy as np

# Per-class probabilities for the first five test samples
# (one row per sample, one column per class).
proba = kn.predict_proba(X_test_scaled[:5])
print(proba.round(decimals=4))

[[0.     1.     0.     0.     0.     0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     1.     0.     0.     0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]]


위 출력의 각 열은 `kn.classes_` 순서(알파벳순)대로 Bream/Parkki/Perch/Pike/Roach/Smelt/Whitefish에 대한 확률.

In [8]:
# Look up the nearest training neighbours of the 4th test sample
# (the 3:4 slice keeps the 2-D shape the estimator expects).
# return_distance=False yields only the neighbour indices, so the unused
# distances are no longer bound to `_`, which in IPython would override the
# automatic "last output" variable.
indexes = kn.kneighbors(X_test_scaled[3:4], return_distance=False)
# Map the neighbour indices back to their training labels.
print(y_train[indexes])

[['Perch' 'Perch' 'Roach']]


In [9]:
# Small demo of NumPy boolean indexing: only positions flagged True are kept.
char_arr = np.array(list('ABCDE'))
print(char_arr)

keep_mask = [True, False, True, False, False]
print(char_arr[keep_mask])

['A' 'B' 'C' 'D' 'E']
['A' 'C']


In [10]:
# Boolean mask over the training labels: True where the species is Bream or Smelt.
bream_smelt_indexes = np.isin(y_train, ['Bream', 'Smelt'])
# Apply the same mask to features and labels to build a two-class training subset.
train_bream_smelt = X_train_scaled[bream_smelt_indexes]
target_bream_smelt = y_train[bream_smelt_indexes]

In [11]:
from sklearn.linear_model import LogisticRegression

# Binary logistic regression on the Bream-vs-Smelt subset.
lr = LogisticRegression().fit(train_bream_smelt, target_bream_smelt)

# Predicted species for the first five subset samples.
print(lr.predict(train_bream_smelt[:5]))

# Class probabilities for the same samples. Columns follow lr.classes_:
# column 0 is 'Bream' (negative class), column 1 is 'Smelt' (positive class).
print(lr.predict_proba(train_bream_smelt[:5]).round(3))

['Smelt' 'Bream' 'Bream' 'Bream' 'Smelt']
[[0.032 0.968]
 [0.948 0.052]
 [0.999 0.001]
 [0.977 0.023]
 [0.021 0.979]]


In [13]:
from scipy.special import expit

# decision_function returns the linear score z for the positive class
# (the second entry of lr.classes_) for each of the first five samples.
decisions = lr.decision_function(train_bream_smelt[:5])
print(decisions.round(3))

# The sigmoid (expit) maps z to the positive-class probability, reproducing
# the second column of predict_proba.
print(expit(decisions).round(3))

[ 3.406 -2.912 -6.848 -3.734  3.824]
[0.968 0.052 0.001 0.023 0.979]


LogisticRegression은 릿지처럼 계수의 제곱을 규제한다(L2 규제)\
다만 매개변수로 C를 사용하고, 알파와 달리 작을수록 규제가 커진다.

In [15]:
# Multiclass logistic regression trained on all seven species.
# Larger C means weaker regularization; max_iter raises the solver's iteration
# cap (presumably so that it converges on this data).
lr = LogisticRegression(C=20, max_iter=1000).fit(X_train_scaled, y_train)

# Predictions for the first five test samples, printed as a column vector.
print(lr.predict(X_test_scaled[:5])[:, np.newaxis])

# Per-class probabilities for the same samples, followed by the class order
# that the probability columns correspond to.
proba = lr.predict_proba(X_test_scaled[:5])
print(proba.round(3))
print(lr.classes_)

[['Parkki']
 ['Roach']
 ['Perch']
 ['Roach']
 ['Smelt']]
[[0.011 0.927 0.002 0.    0.048 0.    0.012]
 [0.002 0.01  0.307 0.02  0.634 0.001 0.026]
 [0.312 0.    0.362 0.191 0.033 0.    0.101]
 [0.001 0.014 0.284 0.004 0.643 0.    0.054]
 [0.    0.002 0.045 0.    0.008 0.945 0.   ]]
['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


다중 분류는 클래스마다 Z 값을 하나씩 계산함. 가장 높은 Z 값을 출력하는 클래스가 예측 클래스.\
확률은? 소프트맥스 함수를 써서 7개의 Z 값을 합이 1인 확률로 변환.

In [18]:
# Turn off scientific notation for NumPy printing (a global setting that also
# affects every later cell).
np.set_printoptions(suppress=True)

# Raw z scores for the first five test samples: one row per sample,
# one column per class.
decision = lr.decision_function(X_test_scaled[:5])
print(decision.round(3))

[[  1.036   5.451  -0.7    -3.7     2.497  -5.656   1.073]
 [ -2.36   -0.73    2.702  -0.009   3.425  -3.262   0.235]
 [  4.208  -3.76    4.356   3.72    1.962 -13.57    3.084]
 [ -2.275   0.023   3.033  -1.338   3.852  -4.663   1.369]
 [-11.668   1.913   5.07   -3.259   3.382   8.11   -3.547]]


In [19]:
from scipy.special import softmax
# Softmax along axis=1 normalizes each row (i.e. each sample) of z scores into
# probabilities; the result matches predict_proba for the same samples.
proba = softmax(decision, axis=1)
print(proba.round(3))

[[0.011 0.927 0.002 0.    0.048 0.    0.012]
 [0.002 0.01  0.307 0.02  0.634 0.001 0.026]
 [0.312 0.    0.362 0.191 0.033 0.    0.101]
 [0.001 0.014 0.284 0.004 0.643 0.    0.054]
 [0.    0.002 0.045 0.    0.008 0.945 0.   ]]
