## 04-1 로지스틱 회귀

---
layout: single
title:  "CHAPTER 04"
categories: AI
---

In [None]:
# Load the fish dataset; the k-nearest neighbors classifier further down
# uses it to compute class probabilities for the fish in the lucky bag.
import pandas as pd

csv_url = 'https://bit.ly/fish_csv_data'
fish = pd.read_csv(csv_url)
fish.head()  # notebook cell output: preview of the leading rows

In [None]:
# Extract the distinct values found in the Species column and print them.
species_names = pd.unique(fish['Species'])
print('Species 고유값 추출 :', species_names)

In [None]:
# Use the five measurement columns as model input and Species as the target.
# Both are converted from pandas objects to NumPy arrays.
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
fish_input = fish[feature_columns].to_numpy()
fish_target = fish['Species'].to_numpy()

# Preview the first five entries of each array.
print('fish_input Data\n', fish_input[:5])
print('fish_target Data\n', fish_target[:5])

In [None]:
# Split the fish arrays into a training set and a test set.
# random_state is fixed so the same split is produced on every run.

from sklearn.model_selection import train_test_split

split_arrays = train_test_split(fish_input, fish_target, random_state=42)
train_input, test_input, train_target, test_target = split_arrays

In [None]:
# Standardization preprocessing. The scaler is fitted on the training input
# only, and that same fitted scaler transforms both the train and test input.

from sklearn.preprocessing import StandardScaler

ss = StandardScaler().fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

# Print the raw arrays first, then their standardized counterparts.
for label, values in (
    ('train_input \n', train_input),
    ('test_input \n', test_input),
    ('train_scaled \n', train_scaled),
    ('test_scaled \n', test_scaled),
):
    print(label, values)

In [None]:
# Fit the k-nearest neighbors classifier used below for probability prediction.

from sklearn.neighbors import KNeighborsClassifier

count = 3  # number of neighbors
kn = KNeighborsClassifier(n_neighbors=count).fit(train_scaled, train_target)

# Score on the training set first, then on the test set.
for features, labels in ((train_scaled, train_target), (test_scaled, test_target)):
    print(kn.score(features, labels))

In [None]:
# Show the class labels known to the fitted classifier.
print('target값 확인 \n', kn.classes_)

# Print the first five scaled test samples and the classes predicted for them.
first_five = test_scaled[:5]
print(first_five)
print('5개 샘플 target값 예측 \n', kn.predict(first_five))

In [None]:
# Class probabilities for the first five test samples, rounded to four
# decimal places and printed after the class labels.
import numpy as np

proba = kn.predict_proba(test_scaled[:5])
rounded_proba = np.round(proba, decimals=4)
print(kn.classes_, '\n', rounded_proba)

In [None]:
# Check the classes of the nearest neighbors of the fourth test sample.
# Slicing with 3:4 (instead of indexing with 3) keeps the leading row axis.
fourth_sample = test_scaled[3:4]
distances, indexes = kn.kneighbors(fourth_sample)
neighbor_label = f'네 번째 샘플과 가장 이웃한 {count}개의 클래스 :'
print(neighbor_label, train_target[indexes])

# 로지스틱 회귀

In [None]:
# Draw the sigmoid function phi = 1 / (1 + exp(-z)) for z from -5 up to 5
# in steps of 0.1.

import numpy as np
import matplotlib.pyplot as plt

z = np.arange(-5, 5, 0.1)
exp_neg_z = np.exp(-z)
phi = 1 / (1 + exp_neg_z)

plt.plot(z, phi)
plt.xlabel('z')
plt.ylabel('phi')
plt.show()

In [None]:
# Boolean indexing: pick only 'A' and 'C' out of the array.

char_arr = np.array(['A', 'B', 'C', 'D', 'E'])
keep_mask = [True, False, True, False, False]  # True marks the entries to keep
print(char_arr[keep_mask])

In [None]:
# Use boolean indexing to keep only the Bream and Smelt rows of the
# training data.

is_bream = train_target == 'Bream'
is_smelt = train_target == 'Smelt'
bream_smelt_indexes = is_bream | is_smelt

train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]

In [None]:
# Train a logistic regression model on the Bream/Smelt subset.

from sklearn.linear_model import LogisticRegression
from scipy.special import expit

lr = LogisticRegression()
lr.fit(train_bream_smelt, target_bream_smelt)

# Everything below inspects the first five training samples.
sample = train_bream_smelt[:5]
print('5개 샘플 예측 :', lr.predict(sample))
print('예측확률 출력 \n', lr.predict_proba(sample))
# classes_ lists the labels; used here to check which one is the positive class.
print('Bream과 Smelt중 양성클래스 확인 \n', lr.classes_)
print('계수 확인 : ', lr.coef_, lr.intercept_)

# z values of the five samples, then the same values passed through the
# sigmoid (expit) to turn them into probabilities.
decisions = lr.decision_function(sample)
print('샘플 5개 z값 계산 \n', decisions)
print('decisions 배열 확률로 변환 \n', expit(decisions))

In [None]:
# Multiclass classification with logistic regression on all species.
# C is scikit-learn's inverse regularization strength, so C=20 regularizes
# less than the default of 1; max_iter is raised to 1000.

lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

first_five = test_scaled[:5]
# Fix: this line is labelled as the prediction for five samples, but it used
# to print the raw probability matrix (which the next line prints again,
# rounded). Print the predicted classes instead.
print('5개 샘플 예측 :', lr.predict(first_five))
proba = lr.predict_proba(first_five)
print('예측확률 출력 \n', np.round(proba, decimals=3))
print('클래스 정보 확인 \n', lr.classes_)
# Only the shapes of the coefficient and intercept arrays are printed.
print('계수 확인 : ', lr.coef_.shape, lr.intercept_.shape)

In [None]:
# Softmax: convert the multiclass decision values (z) into probabilities.

from scipy.special import softmax

print('클래스 정보 확인 \n', lr.classes_)

# z values of the first five test samples.
decision = lr.decision_function(test_scaled[:5])
rounded_decision = np.round(decision, decimals=2)
print(rounded_decision)

# axis=1 applies softmax along each row of the decision array.
proba = softmax(decision, axis=1)
rounded_proba = np.round(proba, decimals=3)
print(rounded_proba)

## 04-2 확률적 경사 하강법

In [None]:
# Build a pandas DataFrame from the fish CSV.
import pandas as pd

csv_url = 'https://bit.ly/fish_csv_data'
fish = pd.read_csv(csv_url)
fish  # notebook cell output: display the frame

In [None]:
# The five measurement columns are the input data;
# the Species column is the target data.
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
target_column = 'Species'

fish_input = fish[feature_columns].to_numpy()
fish_target = fish[target_column].to_numpy()

In [None]:
# Create the training set and the test set (fixed random_state for a
# reproducible split).
from sklearn.model_selection import train_test_split

split_arrays = train_test_split(fish_input, fish_target, random_state=42)
train_input, test_input, train_target, test_target = split_arrays

In [None]:
# Standardization preprocessing: fit the scaler on the training input, then
# transform both the training and the test input with it.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler().fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

# Print both standardized arrays.
for label, values in (
    ('train_scaled : ', train_scaled),
    ('test_scaled : ', test_scaled),
):
    print(label, values)

In [None]:
# Import the stochastic-gradient-descent classifier and train it.
from sklearn.linear_model import SGDClassifier

# loss='log_loss' selects the logistic loss. The older spelling 'log' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
# NOTE: 'log_loss' requires scikit-learn >= 1.1.
# max_iter=10 deliberately limits training to a small number of epochs.
sc = SGDClassifier(loss='log_loss', max_iter=10, random_state=42)
sc.fit(train_scaled, train_target)

# Score on the training set, then on the test set.
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))

In [None]:
# Continue training the existing model with partial_fit, then score it
# again on the training set and the test set.
sc.partial_fit(train_scaled, train_target)
for features, labels in ((train_scaled, train_target), (test_scaled, test_target)):
    print(sc.score(features, labels))

In [None]:
# Preparation for the accuracy-per-epoch plot.

import numpy as np

# loss='log_loss' replaces the spelling 'log', which was deprecated in
# scikit-learn 1.1 and removed in 1.3 (requires scikit-learn >= 1.1).
sc = SGDClassifier(loss='log_loss', random_state=42)
train_score = []
test_score = []
classes = np.unique(train_target)   # distinct fish labels in the training targets
print('생선데이터 목록 : ', classes)

# Train one more epoch per iteration and record both scores after each one.
# The model is only ever trained through partial_fit here, so the full list
# of classes is passed along with every call.
for _ in range(300):
    sc.partial_fit(train_scaled, train_target, classes=classes)
    train_score.append(sc.score(train_scaled, train_target))
    test_score.append(sc.score(test_scaled, test_target))

In [None]:
# Plot the recorded training and test scores against the epoch number.
import matplotlib.pyplot as plt

for scores in (train_score, test_score):
    plt.plot(scores)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Retrain SGDClassifier with the iteration count set to 100.
# loss='log_loss' replaces the spelling 'log', which was deprecated in
# scikit-learn 1.1 and removed in 1.3 (requires scikit-learn >= 1.1).
# tol=None turns off the tolerance-based stopping criterion, so training
# runs for the full max_iter epochs.
sc = SGDClassifier(loss='log_loss', max_iter=100, tol=None, random_state=42)
sc.fit(train_scaled, train_target)

# Score on the training set, then on the test set.
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))

In [None]:
# Train a model with the hinge loss, using the same iteration settings
# (max_iter=100, tol=None) and random seed.
sc = SGDClassifier(loss='hinge', max_iter=100, tol=None, random_state=42).fit(
    train_scaled, train_target
)

# Score on the training set, then on the test set.
for features, labels in ((train_scaled, train_target), (test_scaled, test_target)):
    print(sc.score(features, labels))