## 로지스틱 회귀
- 회귀(숫자 예측) & 그에 따른 분류
- 목적 : 손실 함수(Loss Function) 최소화  ->  `경사 하강법(Gradient Descent)` 사용
    - 시그모이드 함수 : 결과값을 0 ~ 1사이의 확률로 변환하는 함수 -> 이진 분류
    - 소프트 맥스 함수 : 다중 분류 (여러개의 클래스)
- ex) 나머지 특성 활용한 물고기 종류가 도미인지, 빙어인지 분류

In [1]:
import pandas as pd

df = pd.read_csv('data/Fish.csv')
df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [3]:
df['Species'].unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [5]:
fish_input = df[['Weight', 'Length2', 'Length3', 'Height', 'Width']]
fish_input.head()

Unnamed: 0,Weight,Length2,Length3,Height,Width
0,242.0,25.4,30.0,11.52,4.02
1,290.0,26.3,31.2,12.48,4.3056
2,340.0,26.5,31.1,12.3778,4.6961
3,363.0,29.0,33.5,12.73,4.4555
4,430.0,29.0,34.0,12.444,5.134


In [6]:
fish_target = df[['Species']]

In [7]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = \
train_test_split(fish_input, fish_target)

In [8]:
# 특성들의 scale 수준 확인하고 많이 차이나면 정규화하기

from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(train_input)

train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

In [13]:
# K - 최근접 분류
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier()
kn.fit(train_scaled, train_target)

  return self._fit(X, y)


In [14]:
print(kn.score(train_scaled, train_target))
print(kn.score(test_scaled, test_target))

0.8823529411764706
0.75


In [16]:
kn.predict(test_scaled)

array(['Perch', 'Perch', 'Parkki', 'Perch', 'Bream', 'Perch', 'Smelt',
       'Perch', 'Parkki', 'Bream', 'Perch', 'Perch', 'Perch', 'Smelt',
       'Bream', 'Roach', 'Perch', 'Smelt', 'Smelt', 'Whitefish', 'Bream',
       'Perch', 'Bream', 'Perch', 'Bream', 'Perch', 'Bream', 'Bream',
       'Perch', 'Bream', 'Perch', 'Roach', 'Perch', 'Bream', 'Bream',
       'Smelt', 'Whitefish', 'Pike', 'Perch', 'Perch'], dtype=object)

In [15]:
# 예측 확률을 볼 수 있음.
kn.predict_proba(test_scaled)

array([[0. , 0. , 1. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. , 0. , 0. ],
       [0. , 1. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.6, 0. , 0.4, 0. , 0. ],
       [1. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.2, 0.6, 0. , 0. , 0.2, 0. ],
       [0. , 0. , 0. , 0. , 0. , 1. , 0. ],
       [0. , 0. , 1. , 0. , 0. , 0. , 0. ],
       [0. , 0.4, 0.4, 0. , 0.2, 0. , 0. ],
       [1. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 1. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.8, 0. , 0.2, 0. , 0. ],
       [0. , 0. , 0.6, 0. , 0.4, 0. , 0. ],
       [0. , 0. , 0.2, 0. , 0. , 0.8, 0. ],
       [1. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.2, 0. , 0.8, 0. , 0. ],
       [0. , 0. , 0.8, 0. , 0.2, 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 1. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 1. , 0. ],
       [0. , 0. , 0.4, 0. , 0. , 0. , 0.6],
       [1. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.6, 0. , 0. , 0. , 0.4],
       [1. , 0. , 0. , 0. , 0. ,

### 연습문제 1)
- 이진분류 (도미인지, 빙어인지 확인) -> 도미인지만 확인하면 됨.

In [18]:
train_scaled
train_target

Unnamed: 0,Species
17,Bream
37,Roach
151,Smelt
82,Perch
63,Parkki
...,...
84,Perch
109,Perch
29,Bream
57,Whitefish


In [36]:
# bream 과 smelt만 뽑아내기 / | = or (A | B → A 또는 B가 True면 True 반환)
bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt')


train_bs = train_scaled[bream_smelt_indexes['Species']]
target_bs = train_target[bream_smelt_indexes['Species']]
train_bs                      # 표준화된 값이므로 평균보다 각각 무게, 길이2, 길이3, 높이, 너비가 크고 작은지 해석 할 수 있음. 

array([[ 0.77185859,  0.36917065,  0.56846693,  1.42261852,  0.45892272],
       [-1.06451015, -1.55072982, -1.56044911, -1.57383372, -1.90028656],
       [ 0.63878839,  0.41445132,  0.58523004,  1.31330209,  0.77847119],
       [ 0.71863051,  0.55029334,  0.74447967,  1.54757176,  1.02140277],
       [ 0.2395778 ,  0.16087956,  0.37569106,  1.25508792,  0.31539416],
       [-1.03816225, -1.26093353, -1.30062077, -1.40467821, -1.54137473],
       [-1.05546138, -1.49639302, -1.52692287, -1.52294342, -1.90920811],
       [-1.06504243, -1.60506663, -1.61912002, -1.60461282, -1.90426509],
       [-1.07249437, -1.65940344, -1.68617249, -1.68816618, -1.98190675],
       [ 0.2395778 ,  0.07031822,  0.23320456,  1.24399614,  0.50775016],
       [-1.06530858, -1.6231789 , -1.65264626, -1.57804907, -1.84229644],
       [-1.06504243, -1.53261756, -1.55206755, -1.57607092, -1.98190675],
       [ 0.77185859,  0.36917065,  0.55170381,  1.40434417,  0.51190953],
       [ 0.83839369,  0.55029334,  0.7

In [28]:
# 로지스틱 회귀

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_bs, target_bs)

lr.predict(train_bs[:5])
lr.predict_proba(train_bs[:5])

  y = column_or_1d(y, warn=True)


array([[0.99229874, 0.00770126],
       [0.04058848, 0.95941152],
       [0.99316823, 0.00683177],
       [0.99636161, 0.00363839],
       [0.98402172, 0.01597828]])

### 연습문제 2) 
- 다중 분류 :
- 로지스틱 회귀(LogisticRegression)
    - C (커질수록 규제가 작아짐)
    - max_iter (반복 정도) : 기본 100
- `소프트맥스 함수 : 데이터가 여러개일때 총합을 0~1로 만들어줌 (다중 분류에서 사용)`

In [30]:
lr = LogisticRegression(C=20, max_iter = 1000)
lr.fit(train_scaled, train_target)

print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

0.9327731092436975
0.85


  y = column_or_1d(y, warn=True)


In [32]:
lr.predict_proba(test_scaled[:5]).round(3)

array([[0.005, 0.   , 0.96 , 0.   , 0.003, 0.   , 0.031],
       [0.004, 0.   , 0.958, 0.001, 0.001, 0.   , 0.035],
       [0.008, 0.906, 0.003, 0.   , 0.062, 0.   , 0.021],
       [0.   , 0.004, 0.828, 0.002, 0.143, 0.001, 0.022],
       [0.996, 0.001, 0.   , 0.   , 0.   , 0.   , 0.003]])

In [33]:
lr.coef_.shape

(7, 5)