### PART 04) 머신러닝

## 1장. 지도학습모형

### 3절. 선형모델

#### 1. 사이킷런을 활용한 선형 회귀모형

#### 가. 다중 선형 회귀모형

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
import pandas as pd

# diabetes 데이터셋 호출 후 데이터프레임으로 변환
diabetes = load_diabetes()
data = pd.DataFrame(diabetes.data, columns = diabetes.feature_names)
target = pd.Series(diabetes.target, name = 'target')

colnm = ['bmi', 'bp', 's1', 's2', 's3'] # 컬럼명 리스트
X = data[colnm]
y = target

# 선형회귀 객체 생성
model = LinearRegression()

# 선형회귀 적합
model.fit(X = X, y = y)

# 독립변수들에 대한 추정 회귀 계수들
print(model.coef_)

# 절편항에 대한 추정 회귀 계수
print(model.intercept_)

# 결정계수
model.score(X = X, y = y)

[ 608.9434702   301.13217977  990.86949656 -938.98139897 -597.46405077]
152.13348416289634


0.47721326169685596

#### 나. 릿지(Ridge)

In [4]:
from sklearn.linear_model import Ridge
from sklearn.datasets import load_diabetes
import pandas as pd

# diabetes 데이터셋 호출 후 데이터프레임으로 변환
diabetes = load_diabetes()
data = pd.DataFrame(diabetes.data, columns = diabetes.feature_names)
target = pd.Series(diabetes.target, name = 'target')

colnm = ['bmi', 'bp', 's1', 's2', 's3'] # 컬럼명 리스트
X = data[colnm]
y = target

# 릿지회귀객체 생성
model = Ridge(alpha = 0.1)

# 적합
model.fit(X = X, y = y)

# 독립변수들에 대한 추정 회귀 계수들
print(model.intercept_)
print(model.coef_)

152.13348416289628
[ 595.99217991  339.09043254  397.33941231 -338.99997282 -406.34685926]


#### 다. 라쏘(LASSO)

In [5]:
from sklearn.linear_model import Lasso
colnm = ['bmi', 'bp', 's1', 's2', 's3'] # 컬럼명 리스트
X = data[colnm]
y = target

# 라쏘회귀객체 생성
model = Lasso(alpha = 0.5)

# 적합
model.fit(X = X, y = y)
Lasso(alpha=0.5)

# 독립변수들에 대한 추정 회귀 계수들
print(model.intercept_)
print(model.coef_)

152.13348416289628
[ 574.0437769   237.23009748    0.            0.         -165.17218128]


In [6]:
# 패키지 및 데이터셋, 클래스 호출
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso

# diabetes 데이터셋 호출 후 데이터프레임으로 변환
diabetes = load_diabetes()
data = pd.DataFrame(diabetes.data, columns = diabetes.feature_names)
target = pd.Series(diabetes.target, name = 'target')
df = pd.concat([data, target], axis = 1) # 데이터프레임과 시리즈를 열 결합

In [7]:
# 데이터 분할
colnm = ['bmi', 'bp', 's1', 's2', 's3'] # 컬럼명 리스트
X_train = df[colnm].loc[:310] # 0~309번 행과 ‘bmi’,‘bp’,‘s1’,‘s2’,‘s3’ 컬럼
X_test = df[colnm].loc[310:] # 310~441번 행 ‘bmi’,‘bp’,‘s1’,‘s2’,‘s3’ 컬럼
y_train = df['target'].loc[:310] # 0~309번 행과 'target’ 컬럼

# 라쏘회귀객체 생성
model = Lasso(alpha = 0.5)
model.fit(X = X_train, y = y_train) # X_train과 y_train으로 라쏘 회귀모형 적합

# X_test를 통해 새로운 ‘target’ 변수를 예측
target = model.predict(X_test)
target = pd.Series(target, name = 'target') # array -> series
print(target)

0      198.788299
1      166.213989
2      133.539119
3      194.702166
4      172.636431
          ...    
127    178.910748
128    131.853733
129    148.587458
130    178.546607
131     65.610946
Name: target, Length: 132, dtype: float64


#### 2. 사이킷런을 활용한 로지스틱 회귀모형

In [8]:
# 패키지로부터 클래스, 함수를 호출
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# breast_cancer 데이터셋 호출
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()
data = breast_cancer.data
target = breast_cancer.target

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 2205, stratify = target)

# 모형객체 생성
lr_bin = LogisticRegression(C = 0.5, max_iter = 2000) # 수렴까지 걸리는 최대 반복 횟수

# 모델학습
model_lr_bin = lr_bin.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# ROC
from sklearn.metrics import roc_curve, auc
y_score = model_lr_bin.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# AUC
AUC = auc(fpr, tpr) # roc_curve()에서 반환된 fpr을 x축, tpr을 y축
print(AUC)

0.9983465608465608


In [11]:
# 패키지로부터 클래스, 함수를 호출
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# iris 데이터셋 호출
from sklearn.datasets import load_iris
iris = load_iris()
data = iris.data
target = iris.target

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 2205, stratify = target)

# 모형객체 생성
lr_multi = LogisticRegression(C = 0.05, max_iter = 200) # 수렴까지 걸리는 최대 반복 횟수

# 모델학습
model_lr_multi = lr_multi.fit(X_train, y_train)

In [12]:
# macro f1-score
from sklearn.metrics import f1_score
y_pred = model_lr_multi.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average = "macro")
print(macro_f1)

0.9665831244778613


---

### 4절. 서포트벡터머신

In [8]:
# 패키지로부터 클래스, 함수를 호출
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# breast_cancer 데이터셋 호출
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()
data = breast_cancer.data
target = breast_cancer.target

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2,random_state = 2205, stratify = target)

# 모형객체 생성
# 메소드 .predict_proba()의 사용을 위해서 probability = True 입력 필요
svm_bin = SVC(kernel = 'linear', C = 0.5, probability = True)

# 모델학습
model_svm_bin = svm_bin.fit(X_train, y_train)

In [9]:
# ROC
from sklearn.metrics import roc_curve, auc
y_score = model_svm_bin.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# AUC
AUC = auc(fpr, tpr) # roc_curve()에서 반환된 fpr을 x축, tpr을 y축
print(AUC)

0.9983465608465608


In [10]:
# 패키지로부터 클래스, 함수를 호출
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# iris 데이터셋 호출
from sklearn.datasets import load_iris
iris = load_iris()
data = iris.data
target = iris.target

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 2205, stratify = target)

# 모형객체 생성
svm_multi = LinearSVC(C = 0.1)

# 모델학습
model_svm_multi = svm_multi.fit(X_train, y_train)

In [21]:
# macro f1-score
from sklearn.metrics import f1_score
y_pred = model_svm_multi.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average = "macro")
print(macro_f1)

0.9665831244778613


In [11]:
# 패키지로부터 클래스, 함수를 호출
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import train_test_split

# diabetes 데이터셋 호출
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 2205)

# 모형객체 생성
svm_conti_1 = SVR(C = 0.1, epsilon = 0.01)
svm_conti_2 = LinearSVR(C = 0.1, loss = 'squared_epsilon_insensitive')

# 모델학습
model_svm_conti_1 = svm_conti_1.fit(X_train, y_train)
model_svm_conti_2 = svm_conti_2.fit(X_train, y_train)

In [12]:
# RMSE
from sklearn.metrics import mean_squared_error
y_pred_1 = model_svm_conti_1.predict(X_test)
rmse_1 = mean_squared_error(y_test, y_pred_1, squared = False)
print(rmse_1)

y_pred_2 = model_svm_conti_2.predict(X_test)
rmse_2 = mean_squared_error(y_test, y_pred_2, squared = False)
print(rmse_2)

82.99258100164352
72.94449806071371
