In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 데이터 로딩

In [48]:
fish = pd.read_csv('C:/k_digital/source/Machine Learning with Python/fish.csv')
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [39]:
fish['Species'].unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [40]:
pd.unique(fish['Species'])

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [49]:
fish_input = fish[['Weight','Length', 'Diagonal','Height','Width']].to_numpy()

In [42]:
fish_input[:5]

array([[242.    ,  25.4   ,  30.    ,  11.52  ,   4.02  ],
       [290.    ,  26.3   ,  31.2   ,  12.48  ,   4.3056],
       [340.    ,  26.5   ,  31.1   ,  12.3778,   4.6961],
       [363.    ,  29.    ,  33.5   ,  12.73  ,   4.4555],
       [430.    ,  29.    ,  34.    ,  12.444 ,   5.134 ]])

In [50]:
fish_target = fish['Species'].to_numpy()

In [44]:
fish_target[:5]

array(['Bream', 'Bream', 'Bream', 'Bream', 'Bream'], dtype=object)

## 훈련세트와 테스트 세트로 분리

In [51]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(
fish_input, fish_target, random_state = 42)

## 피처 스케일링

In [52]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

## k-최근접 이웃 분류기를 이용한 확률 예측

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_scaled, train_target)

print(knn.score(train_scaled, train_target))
print(knn.score(test_scaled, test_target))

0.8907563025210085
0.85


In [21]:
knn.classes_

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [20]:
print(knn.predict(test_scaled[:5]))

['Perch' 'Smelt' 'Pike' 'Perch' 'Perch']


In [22]:
# predict_proba():클래스별 확률값을 변환해주는 메서드
proba = knn.predict_proba(test_scaled[:5])
# decimals : 유지할 소숫점 자릿수
print(np.round(proba, decimals=4))

[[0.     0.     1.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]
 [0.     0.     0.     1.     0.     0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]]


In [23]:
distances, indexes = knn.kneighbors(test_scaled[3:4])
print(train_target[indexes])

[['Roach' 'Perch' 'Perch']]


## 로지스틱 회귀
- 대표적인 분류 알고리즘
- 인공신경망에 기본이 되는 알고리즘
- 선형회귀와 비슷하게 선형 방정식을 학습하는 알고리즘이다.

### 로지스틱 회귀(이진분류)

In [24]:
arr = np.array(['A', 'B', 'C', 'D', 'E'])
# boolean indexing
boolean = [True, False, True, False, False]
arr[boolean]

array(['A', 'C'], dtype='<U1')

In [25]:
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [27]:
print(train_input.shape, train_target.shape)

(119, 5) (119,)


In [28]:
print(test_input.shape, test_target.shape)

(40, 5) (40,)


In [26]:
# Bream, Smelt 데이터만 이용한 이진분류 학습
indexes = (train_target == 'Bream') | (train_target == 'Smelt')
train = train_scaled[indexes]
target = train_target[indexes]

In [29]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train, target)

LogisticRegression()

In [30]:
lr.predict(train[:5])

array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream'], dtype=object)

In [31]:
lr.classes_

array(['Bream', 'Smelt'], dtype=object)

In [32]:
# 예측확률
lr.predict_proba(train[:5])

array([[0.99759855, 0.00240145],
       [0.02735183, 0.97264817],
       [0.99486072, 0.00513928],
       [0.98584202, 0.01415798],
       [0.99767269, 0.00232731]])

In [33]:
# 기울기와 절편
print(lr.coef_, lr.intercept_)

[[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]


In [35]:
# z값 계산
decisions = lr.decision_function(train[:5])
decisions

array([-6.02927744,  3.57123907, -5.26568906, -4.24321775, -6.0607117 ])

In [37]:
# 시그모이드 함수를 통해 확률값 추출
from scipy.special import expit
expit(decisions)

array([0.00240145, 0.97264817, 0.00513928, 0.01415798, 0.00232731])

## 로지스틱 회귀(다중분류)

In [53]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

0.9327731092436975
0.925


In [54]:
lr.predict(test_scaled[:5])

array(['Perch', 'Smelt', 'Pike', 'Roach', 'Perch'], dtype=object)

In [56]:
proda = lr.predict_proba(test_scaled[:5])
np.round(proba. decimals=3)

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (Temp/ipykernel_11912/2138945841.py, line 2)