In [1]:
import pandas as pd
# Download the fish measurements CSV into a DataFrame and preview its first rows.
FISH_CSV_URL = 'https://bit.ly/fish_csv_data'
fish = pd.read_csv(FISH_CSV_URL)
fish.head(n=5)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [2]:
# Print the distinct species labels, in order of first appearance.
species = pd.unique(fish['Species'])
print(species)

['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']


In [3]:
# Features: the five measurement columns. Target: the Species label.
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
fish_input = fish[feature_columns].to_numpy()
fish_target = fish['Species'].to_numpy()

In [4]:
# Split the data into a training set and a test set.
# A fixed random_state makes the shuffle deterministic, so the same rows land in
# each split on every "Restart & Run All" and the scores reported further down
# can be reproduced. Without it every run yields a different split.
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler
# Standardize the features. The scaler is fitted on the training set only;
# the test set is transformed with those same training statistics.
ss = StandardScaler()
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)

In [6]:
# Train a k-nearest-neighbors classifier (k=3) on the scaled training data,
# then print its score on the training set followed by the test set.
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, train_target)
kn_train_score = kn.score(train_scaled, train_target)
kn_test_score = kn.score(test_scaled, test_target)
print(kn_train_score)
print(kn_test_score)

0.907563025210084
0.75


In [7]:
import numpy as np
# Per-class probabilities from the KNN model for the first five test samples,
# rounded to four decimal places for display.
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba, 4))

[[0.     0.     0.     0.     0.     1.     0.    ]
 [0.     0.     1.     0.     0.     0.     0.    ]
 [0.     0.6667 0.3333 0.     0.     0.     0.    ]
 [0.     0.     1.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]]


In [8]:
# Binary classification with logistic regression


In [9]:
# boolean indexing을 통한 도미와 빙어 행 선택
# Boolean masks marking the Bream and Smelt rows; their union selects the
# matching rows from the scaled features and from the targets.
is_bream = train_target == 'Bream'
is_smelt = train_target == 'Smelt'
bream_smelt_indexes = is_bream | is_smelt
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]

In [10]:
# Fit a logistic regression model on the two-class (Bream vs. Smelt) subset.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X=train_bream_smelt, y=target_bream_smelt)

In [11]:
# Predicted labels for the first five Bream/Smelt training samples.
bream_smelt_pred = lr.predict(train_bream_smelt[:5])
print(bream_smelt_pred)

['Bream' 'Bream' 'Bream' 'Bream' 'Bream']


In [12]:
# Predicted probabilities for the same five samples.
# Column 0 is the probability of the negative class (0) and column 1 the
# probability of the positive class (1).
bream_smelt_proba = lr.predict_proba(train_bream_smelt[:5])
print(bream_smelt_proba)

[[0.91749036 0.08250964]
 [0.98885563 0.01114437]
 [0.99623216 0.00376784]
 [0.99782755 0.00217245]
 [0.99829345 0.00170655]]


In [13]:
# Learned coefficients (one per input feature) and the intercept of the binary model.
weights, bias = lr.coef_, lr.intercept_
print(weights, bias)

[[-0.44456166 -0.61340716 -0.69739915 -0.95036884 -0.74476487]] [-2.7365316]


In [14]:
# Raw decision-function values (z) for the first five Bream/Smelt samples,
# i.e. the linear-model output before it is turned into a probability.
bream_smelt_head = train_bream_smelt[:5]
decisions = lr.decision_function(bream_smelt_head)
print(decisions)

[-2.40872689 -4.48561382 -5.57747946 -6.12972313 -6.37157332]


In [15]:
# Pass the z values through the sigmoid (scipy's expit) to turn them into
# probabilities.
from scipy.special import expit
sigmoid_proba = expit(decisions)
print(sigmoid_proba)

[0.08250964 0.01114437 0.00376784 0.00217245 0.00170655]


In [16]:
# Multiclass classification with logistic regression

In [18]:
# Multiclass model trained on the full training set (every fish species).
# C=20 relaxes the regularization; max_iter=1000 raises the iteration limit.
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target)
lr_train_score = lr.score(train_scaled, train_target)
lr_test_score = lr.score(test_scaled, test_target)
print(lr_train_score)
print(lr_test_score)

0.9243697478991597
0.875


In [19]:
# Predicted species for the first five test samples.
test_head_pred = lr.predict(test_scaled[:5])
print(test_head_pred)

['Smelt' 'Roach' 'Parkki' 'Perch' 'Smelt']


In [20]:
# Per-class probabilities for the first five test samples, rounded to three
# decimal places for display.
proba = lr.predict_proba(test_scaled[:5])
print(np.round(proba, 3))

[[0.    0.001 0.029 0.    0.003 0.966 0.   ]
 [0.    0.026 0.428 0.002 0.523 0.001 0.02 ]
 [0.009 0.739 0.02  0.    0.204 0.    0.027]
 [0.    0.001 0.897 0.004 0.083 0.    0.015]
 [0.    0.002 0.025 0.    0.004 0.969 0.   ]]


In [21]:
# Compute the z values (decision-function outputs) for the first five test
# samples and print them rounded to two decimal places.
decision = lr.decision_function(test_scaled[:5])
print(np.round(decision, 2))

[[-11.61   1.26   4.7   -2.06   2.45   8.19  -2.93]
 [ -4.32   0.74   3.55  -1.6    3.75  -2.59   0.47]
 [ -0.12   4.24   0.63  -3.42   2.95  -5.23   0.94]
 [ -2.58  -1.5    5.94   0.52   3.56  -7.78   1.83]
 [-11.35   1.64   4.37  -2.53   2.58   8.02  -2.73]]


In [23]:
# Convert the z values into probabilities between 0 and 1 with softmax.
# axis=1 normalizes across each row, so every sample's probabilities sum to 1.
from scipy.special import softmax
proba = softmax(decision, axis=1)
print(np.round(proba, 3))

[[0.    0.001 0.029 0.    0.003 0.966 0.   ]
 [0.    0.026 0.428 0.002 0.523 0.001 0.02 ]
 [0.009 0.739 0.02  0.    0.204 0.    0.027]
 [0.    0.001 0.897 0.004 0.083 0.    0.015]
 [0.    0.002 0.025 0.    0.004 0.969 0.   ]]
