<a href="https://colab.research.google.com/github/hojin99/aitest/blob/main/%EB%A1%9C%EC%A7%80%EC%8A%A4%ED%8B%B1%ED%9A%8C%EA%B7%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 데이터 준비

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the fish measurements CSV into a DataFrame.
fish = pd.read_csv('https://bit.ly/fish_csv_data')

# Quick look at the data: first rows, summary statistics and the distinct
# species labels, each section separated by a divider line.
print(
    fish.head(),
    '-------------------',
    fish.describe(),
    '-------------------',
    fish['Species'].unique(),
    sep='\n',
)

# Features are the five numeric measurement columns; the target is the
# species label. Both are converted to NumPy arrays for scikit-learn.
fish_input = fish.loc[:, ['Weight', 'Length', 'Diagonal', 'Height', 'Width']].to_numpy()
fish_target = fish.loc[:, 'Species'].to_numpy()
print('-------------------', fish_input[:5], sep='\n')

# Split into training and test sets; the fixed random_state keeps the
# split reproducible between runs.
train_input, test_input, train_target, test_target = train_test_split(
    fish_input,
    fish_target,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training set only, and that same fitted scaler is applied to the test set.
ss = StandardScaler()
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)


  Species  Weight  Length  Diagonal   Height   Width
0   Bream   242.0    25.4      30.0  11.5200  4.0200
1   Bream   290.0    26.3      31.2  12.4800  4.3056
2   Bream   340.0    26.5      31.1  12.3778  4.6961
3   Bream   363.0    29.0      33.5  12.7300  4.4555
4   Bream   430.0    29.0      34.0  12.4440  5.1340
-------------------
            Weight      Length    Diagonal      Height       Width
count   159.000000  159.000000  159.000000  159.000000  159.000000
mean    398.326415   28.415723   31.227044    8.970994    4.417486
std     357.978317   10.716328   11.610246    4.286208    1.685804
min       0.000000    8.400000    8.800000    1.728400    1.047600
25%     120.000000   21.000000   23.150000    5.944800    3.385650
50%     273.000000   27.300000   29.400000    7.786000    4.248500
75%     650.000000   35.500000   39.650000   12.365900    5.584500
max    1650.000000   63.400000   68.000000   18.957000    8.142000
-------------------
['Bream' 'Roach' 'Whitefish' 'Parkki' '

### K최근접 분류 (다중 분류)

In [4]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Fit a 3-nearest-neighbours classifier on the standardized training data.
# fit() returns the estimator itself, so construction and fitting are chained.
kn = KNeighborsClassifier(n_neighbors=3).fit(train_scaled, train_target)

# First line: the class labels, in the order used for the probability
# columns below (index 0 is Bream, ...).
# Second line: the predicted label for the first five test samples.
print(kn.classes_, kn.predict(test_scaled[:5]), sep='\n')

# Per-class probabilities for the same five samples.
# The values are coarse because they are derived from the share of the
# 3 neighbours that belong to each class.
proba = kn.predict_proba(test_scaled[:5])
print(proba.round(4))

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
['Perch' 'Smelt' 'Pike' 'Perch' 'Perch']
[[0.     0.     1.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]
 [0.     0.     0.     1.     0.     0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]]


### Logistic Regression (이진 분류)

In [30]:
from sklearn.linear_model import LogisticRegression

from scipy.special import expit

# Build a boolean mask selecting the training rows labelled Bream or Smelt.
# Comparing an ndarray with a scalar yields an element-wise True/False
# array, and `|` combines the two masks.
bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt')

# A boolean array can be used to index another ndarray: only the rows where
# the mask is True are kept.
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]

# Train a binary logistic regression on the two-species subset.
lr = LogisticRegression().fit(train_bream_smelt, target_bream_smelt)

# Predicted labels and class probabilities for the first five samples.
# Columns follow alphabetical class order: column 0 is Bream, column 1 is Smelt.
print(
    '----------------',
    lr.predict(train_bream_smelt[:5]),
    lr.predict_proba(train_bream_smelt[:5]),
    sep='\n',
)

# Reproduce the positive-class (Smelt) probability by hand: pass the linear
# decision values through the sigmoid function (expit).
decisions = lr.decision_function(train_bream_smelt[:5])
print('----------------')
# coef_ holds the learned weights, intercept_ the bias term.
print(lr.coef_, lr.intercept_)
print(decisions, expit(decisions), sep='\n')


----------------
['Bream' 'Smelt' 'Bream' 'Bream' 'Bream']
[[0.99759855 0.00240145]
 [0.02735183 0.97264817]
 [0.99486072 0.00513928]
 [0.98584202 0.01415798]
 [0.99767269 0.00232731]]
----------------
[[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]
[-6.02927744  3.57123907 -5.26568906 -4.24321775 -6.0607117 ]
[0.00240145 0.97264817 0.00513928 0.01415798 0.00232731]


### Logistic Regression (다중 분류)

In [29]:
# Imports live at the top of the cell so that it does not rely on names
# imported by an earlier cell (np) or on an import buried mid-cell (softmax).
import numpy as np
from scipy.special import softmax

# Multi-class logistic regression over all species in train_target.
#   max_iter - maximum number of solver iterations (default 100)
#   C        - inverse regularization strength (default 1); larger values
#              weaken the regularization, smaller values strengthen it
# NOTE: this rebinds `lr`, replacing the binary Bream/Smelt model that the
# binary-classification section assigned to the same name.
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target)

# Accuracy on the training set, then on the test set.
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

# Per-class probabilities for the first five test samples.
proba = lr.predict_proba(test_scaled[:5])
print(np.round(proba, decimals=3))

# One linear function is learned per class: coef_ has one row per class and
# one column per feature, intercept_ has one bias per class.
# With the default solver, scikit-learn (>= 0.22) fits these jointly as a
# multinomial (softmax) model for multi-class targets. They are NOT
# independent One-vs-Rest binary classifiers. The per-class decision values
# are turned into probabilities that sum to 1 with the softmax function.
print(lr.coef_.shape, lr.intercept_.shape)

print('----------------')
decision = lr.decision_function(test_scaled[:5])
print(np.round(decision, decimals=2))

# Applying softmax to the decision values by hand reproduces the
# predict_proba output printed above.
proba = softmax(decision, axis=1)
print(np.round(proba, decimals=3))

0.9327731092436975
0.925
[[0.    0.014 0.841 0.    0.136 0.007 0.003]
 [0.    0.003 0.044 0.    0.007 0.946 0.   ]
 [0.    0.    0.034 0.935 0.015 0.016 0.   ]
 [0.011 0.034 0.306 0.007 0.567 0.    0.076]
 [0.    0.    0.904 0.002 0.089 0.002 0.001]]
(7, 5) (7,)
----------------
[[ -6.5    1.03   5.16  -2.73   3.34   0.33  -0.63]
 [-10.86   1.93   4.77  -2.4    2.98   7.84  -4.26]
 [ -4.34  -6.23   3.17   6.49   2.36   2.42  -3.87]
 [ -0.68   0.45   2.65  -1.19   3.26  -5.75   1.26]
 [ -6.4   -1.99   5.82  -0.11   3.5   -0.11  -0.71]]
[[0.    0.014 0.841 0.    0.136 0.007 0.003]
 [0.    0.003 0.044 0.    0.007 0.946 0.   ]
 [0.    0.    0.034 0.935 0.015 0.016 0.   ]
 [0.011 0.034 0.306 0.007 0.567 0.    0.076]
 [0.    0.    0.904 0.002 0.089 0.002 0.001]]
