# 로지스틱 회귀 구연
 - 1. 데이터 준비
 - 2. 데이터 스케일링
 - 3. 모델 학습
 - 4. 학습 내용 분석

## 1. 데이터 준비

In [1]:
import pandas as pd

In [2]:
fish = pd.read_csv("https://bit.ly/fish_csv")

In [3]:
fish.describe()

Unnamed: 0,Weight,Length,Diagonal,Height,Width
count,159.0,159.0,159.0,159.0,159.0
mean,398.326415,28.415723,31.227044,8.970994,4.417486
std,357.978317,10.716328,11.610246,4.286208,1.685804
min,0.0,8.4,8.8,1.7284,1.0476
25%,120.0,21.0,23.15,5.9448,3.38565
50%,273.0,27.3,29.4,7.786,4.2485
75%,650.0,35.5,39.65,12.3659,5.5845
max,1650.0,63.4,68.0,18.957,8.142


In [4]:
fish_input = fish[["Weight",	"Length",	"Diagonal",	"Height",	"Width"]]

In [5]:
fish_target = fish[["Species"]]

In [None]:
len(fish_target), len(fish_input)

(159, 159)

### 1-1 학습데이터, 평가데이터 분리하기

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_input, test_input, train_target, test_target = train_test_split(fish_input, fish_target, random_state=42)


## 2. 데이터 스케일링
 - 표준점수

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

## 3. 모델 학습
 - 모델 선언: 어떤 모델을 사용할 것인가
 - 모델 학습: 학습데이터로 모델을 학습한다.
 - 모델 평가: 평가데이터로 모델을 학습한다.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
kn = KNeighborsClassifier(n_neighbors=3)

In [None]:
kn.fit(train_scaled, train_target)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
print(kn.score(train_scaled, train_target)) #학습 데이터에 대한 성능

0.8907563025210085


In [None]:
print(kn.score(test_scaled, test_target)) #평가 데이터에 대한 성능

0.85


In [None]:
kn.classes_

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [None]:
kn.predict(test_scaled[:5])

array(['Perch', 'Smelt', 'Pike', 'Perch', 'Perch'], dtype=object)

In [None]:
test_scaled[5]

array([1.70778862, 0.79685229, 0.98412932, 2.49283113, 1.31347159])

In [None]:
kn.predict_proba(test_scaled[4:5])

array([[0.        , 0.        , 0.66666667, 0.        , 0.33333333,
        0.        , 0.        ]])

In [None]:
test_input[4:5]

Unnamed: 0,Weight,Length,Diagonal,Height,Width
94,150.0,23.0,24.5,5.2185,3.626


## 4. 로지스틱 회귀 모델
 - 이진분류
 - Multi Class 분류

In [None]:
(train_target == 'Bream') | (train_target == 'Smelt')

Unnamed: 0,Species
26,True
137,False
146,True
90,False
66,False
...,...
71,False
106,False
14,True
92,False


In [None]:
fish_input = fish[['Weight','Length','Diagonal','Height','Width']].to_numpy()
fish_target = fish['Species'].to_numpy()

from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, random_state=42)

from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt')

In [None]:
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]

In [None]:
import numpy as np
char_arr = np.array(['A', 'B', 'C', 'D', 'E'])
print(char_arr[[True, False, True, False, False]])

['A' 'C']


In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression(C=20)

In [None]:
lr.fit(train_bream_smelt, target_bream_smelt)

LogisticRegression(C=20, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
lr.score(train_bream_smelt, target_bream_smelt)

1.0

In [None]:
lr.predict(train_bream_smelt[:2])

array(['Bream', 'Smelt'], dtype=object)

In [None]:
print(lr.predict_proba(train_bream_smelt[:2]))

[[9.99975211e-01 2.47892835e-05]
 [1.66616319e-03 9.98333837e-01]]


In [None]:
print(lr.classes_)

['Bream' 'Smelt']


## 로지스틱 회귀의 분석

In [None]:
lr.classes_

array(['Bream', 'Smelt'], dtype=object)

In [None]:
(lr.coef_, lr.intercept_)

(array([[-0.64406098, -0.99701136, -1.17418373, -1.87025439, -1.26986516]]),
 array([-3.71913364]))

In [None]:
decisions = lr.decision_function(train_bream_smelt[:5])
print(decisions)

[-10.60507433   6.39556424  -9.27080659  -7.47722526 -10.66647737]


In [None]:
from scipy.special import expit

In [None]:
expit(decisions)

array([2.47892835e-05, 9.98333837e-01, 9.41236970e-05, 5.65505278e-04,
       2.33129703e-05])

## 다중회귀

In [None]:
lr = LogisticRegression(C=20)

In [None]:
lr.fit(train_scaled, train_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=20, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
lr.classes_

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [None]:
lr.score(train_scaled, train_target)

0.9327731092436975

In [None]:
lr.score(test_scaled, test_target)

0.925

In [None]:
proba = lr.predict_proba(test_scaled[:1])

In [None]:
proba

array([[7.25127862e-06, 1.35065840e-02, 8.41218287e-01, 3.15236283e-04,
        1.35704396e-01, 6.69182595e-03, 2.55641887e-03]])

In [None]:
lr.predict(test_scaled[:1])

array(['Perch'], dtype=object)

In [None]:
z = lr.decision_function(test_scaled[:1])

In [None]:
from scipy.special import softmax

In [None]:
a = softmax(z)

In [None]:
np.sum(a)

1.0000000000000002