k-최근접 이웃의 다중 분류

In [57]:
import pandas as pd
import numpy as np

# Load the fish dataset (one row per fish) and preview the first rows.
fish = pd.read_csv("https://bit.ly/fish_csv_data")
print(fish.head())

# Features: the five numeric measurement columns, as a 2-D array (n_samples, 5).
fish_input = fish[['Weight', 'Length', 'Diagonal', 'Height', 'Width']].to_numpy()
# Target: the species label. Selecting with a single column name (not a list)
# yields a Series, so to_numpy() returns a 1-D array of shape (n_samples,).
# A list selector would give an (n_samples, 1) column vector, which
# scikit-learn classifiers warn about when it is passed as y.
fish_target = fish['Species'].to_numpy()


  Species  Weight  Length  Diagonal   Height   Width
0   Bream   242.0    25.4      30.0  11.5200  4.0200
1   Bream   290.0    26.3      31.2  12.4800  4.3056
2   Bream   340.0    26.5      31.1  12.3778  4.6961
3   Bream   363.0    29.0      33.5  12.7300  4.4555
4   Bream   430.0    29.0      34.0  12.4440  5.1340


In [58]:
# Check the unique values (class labels) present in the target
print(np.unique(fish_target))

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [59]:
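# reshape(-1, 1): turns a 1-D input X of shape (n,) into a 2-D column vector of shape (n, 1)
# -> only needed when X is 1-D; if X is already 2-D, no reshape is required

# ravel(): flattens the target y into a 1-D array
# e.g. a column-vector y of shape (n, 1) becomes shape (n,)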


In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split into train and test sets. The split is stratified on the species
# label and uses a fixed random_state so it is reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, stratify=fish_target, random_state=42
)

# The input already has several feature columns (2-D), so no reshape(-1, 1)
# is needed here.

# KNN is distance based, so its performance degrades as the number of
# dimensions grows. PolynomialFeatures is therefore deliberately NOT used in
# this cell: no `poly` object is created, and no poly.transform() call is made
# (calling it here would raise NameError on a clean top-to-bottom run).

# Scaling: fit the scaler on the training set only, then apply the same
# transformation to both the training and the test set.
std = StandardScaler()
std.fit(train_input)

train_scaled = std.transform(train_input)
test_scaled = std.transform(test_input)



In [61]:
# X must always be 2-D and y must be 1-D.

# Train the KNN model
from sklearn.neighbors import KNeighborsClassifier
from numpy import ravel

kn = KNeighborsClassifier(n_neighbors=3)
# Flatten y to 1-D with ravel() before fitting. Passing an (n, 1) column
# vector makes scikit-learn emit a DataConversionWarning; ravel() is a no-op
# when the target is already 1-D.
kn.fit(train_scaled, ravel(train_target))

# Class labels known to the fitted model
print(kn.classes_)

# Predicted labels and per-class probabilities for the first five test samples
print(kn.predict(test_scaled[:5]))
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba, decimals=4))


['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
['Perch' 'Perch' 'Roach' 'Parkki' 'Parkki']
[[0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.3333 0.     0.6667 0.     0.    ]
 [0.     1.     0.     0.     0.     0.     0.    ]
 [0.     0.6667 0.     0.     0.3333 0.     0.    ]]


  return self._fit(X, y)


# 로지스틱 회귀(이진 분류)

In [62]:
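# -> sigmoid (logistic) function: 1 / (1 + exp(-z)), which maps z into (0, 1) (this is not tanh)
# output greater than 0.5: positive class
# output less than 0.5: negative class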

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

# Feature expansion step (degree=1, no bias column), fitted on the training
# set and then applied to both splits.
poly = PolynomialFeatures(degree=1, include_bias=False)
train_poly = poly.fit_transform(train_input)
test_poly = poly.transform(test_input)

# Scaling: statistics come from the training features only.
std = StandardScaler()
train_scaled = std.fit_transform(train_poly)
test_scaled = std.transform(test_poly)

# Make sure the training labels are a flat 1-D array.
train_target = train_target.ravel()

# Boolean mask selecting only the Bream and Smelt rows, for binary classification.
bream_smelt_indexes = np.logical_or(train_target == 'Bream', train_target == 'Smelt')
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(train_bream_smelt, target_bream_smelt)

# Predicted labels for the first five Bream/Smelt training samples ...
print(lr.predict(train_bream_smelt[:5]))

# ... and the corresponding per-class probabilities.
print(lr.predict_proba(train_bream_smelt[:5]))


['Bream' 'Bream' 'Bream' 'Bream' 'Bream']
[[9.76263188e-01 2.37368125e-02]
 [9.99614480e-01 3.85519926e-04]
 [9.94438266e-01 5.56173414e-03]
 [9.09188993e-01 9.08110069e-02]
 [9.99361521e-01 6.38479331e-04]]


In [None]:
from scipy.special import expit
# Coefficients (slopes) and intercept learned by the binary logistic model
print(lr.coef_,lr.intercept_)

# Raw linear scores (z values) for the first five Bream/Smelt training samples
decisions = lr.decision_function(train_bream_smelt[:5])
print(decisions)

# expit is the sigmoid function; applying it to z turns the scores into
# probabilities (in the recorded output these equal the second column of the
# predict_proba result printed by the previous cell).
print(expit(decisions))

[[-0.4235112  -0.61604834 -0.70216369 -0.97498265 -0.7403996 ]] [-2.46732659]
[-3.7167051  -7.86053208 -5.18626807 -2.30377249 -7.35578257]
[0.02373681 0.00038552 0.00556173 0.09081101 0.00063848]


# 로지스틱 회귀(다중 분류)

In [None]:
# max_iter defaults to 100; it is raised to 1000 here.
# C defaults to 1 and sets the regularization strength: the larger C is,
# the weaker the regularization.
lr = LogisticRegression(max_iter=1000, C=20).fit(train_scaled, train_target)

# Accuracy on the training set, then on the test set.
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

# Per-class probabilities for the first five test samples, rounded to 3 decimals.
proba = lr.predict_proba(test_scaled[:5])
print(proba.round(3))

# Shapes of the learned coefficient matrix and intercept vector.
print(lr.coef_.shape, lr.intercept_.shape)

0.9243697478991597
0.975
[[0.    0.029 0.237 0.003 0.685 0.01  0.035]
 [0.    0.032 0.576 0.001 0.35  0.003 0.039]
 [0.    0.062 0.558 0.001 0.336 0.017 0.026]
 [0.003 0.93  0.001 0.    0.051 0.    0.015]
 [0.001 0.882 0.004 0.    0.094 0.002 0.017]]
(7, 5) (7,)
