In [1]:
# 관련 라이브러리 및 모듈 Import
# import
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import SGDRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

### 데이터 준비

In [2]:
def evaluate_score(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f'mse - {mse:.4f}')
    print(f'rmse - {rmse:.4f}')
    print(f'r2 - {r2:.4f}')
    return

### 데이터 준비 및 분할

In [3]:
# 데이터 csv 불러오기
train = pd.read_csv('../data/train3.csv')

In [4]:
# Unnamed: 0 column제거
train = train.iloc[:,1:]
# 소수점 2자리까지
train.match = round(train.match, 2)

In [5]:
# X(독립변수), Y(종속변수) 분할
X = train.drop('match', axis=1)
y = train['match']
# 학습용 데이터와 평가용 데이터로 분할하기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 모델 구축

In [6]:
from sklearn.linear_model import LinearRegression
# 회귀 모델 객체 생성
lr = LinearRegression()

# 모델학습
reg = lr.fit(X_train,y_train)

In [7]:
# 회귀 계수, 절편 확인
coef = reg.coef_
intercept = reg.intercept_

In [8]:
y_pred = reg.predict(X_test)
evaluate_score(y_test, y_pred)

mse - 485.0720
rmse - 22.0243
r2 - 0.1480


In [9]:
n = len(X_train)
k = len(X_train.columns)
r2 = r2_score(y_test,y_pred)
adj_r2 = 1 - ((1-r2)*(n-1) / (n-k-1))
print('Adjusted R2: {}'.format(adj_r2))

Adjusted R2: 0.09302049125586154


### SGDRegressor with hyperparameter

In [10]:
reg = SGDRegressor(max_iter=100000000, eta0=0.0001, learning_rate='invscaling', loss= 'squared_error', random_state=42)
reg.fit(X_train, y_train)

print(reg.coef_[0], reg.intercept_[0])

y_pred = reg.predict(X_test)
y_pred[y_pred < 0] = 0.
evaluate_score(y_test, y_pred)

-3560905.531673705 -4488167.268966469
mse - 744966520864917120.0000
rmse - 863114430.9215
r2 - -1308506229229570.2500


### SGDRegressor with scaling

In [11]:
# 표준화 스케일링을 사용하여 경사하강법 모델링 
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)

X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# 모델 객체 생성
reg = SGDRegressor(max_iter=100000000, eta0=0.0001, learning_rate='invscaling', loss= 'squared_error', random_state=42)

reg.fit(X_train_scaled, y_train)

# 계수 확인
print(reg.coef_, reg.intercept_)

# 평가 지표
y_pred = reg.predict(X_test_scaled)
y_pred[y_pred < 0] = 0.
evaluate_score(y_test, y_pred)

[ 1.09030479  0.89086754  0.51317045  1.53069276  0.41430448  0.46344769
  0.49091966 -0.31230483 -2.0604321   0.43522873  1.95857357  0.1834611
 -0.66876052  2.94355509  0.54137595  0.01908409  0.12051811  0.62760701
  3.95360066  2.25486784] [18.10715864]
mse - 476.4523
rmse - 21.8278
r2 - 0.1631


### SGDRegressor with StandardScaler()

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
# X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Pipeline with StandardScaler, LinearRergression or SGDRegressor

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

reg = make_pipeline(StandardScaler(),
                    SGDRegressor(max_iter=1000000, eta0=0.00001,\
                                 tol=0.0001, random_state=42, loss='squared_error'))
reg.fit(X_train, y_train)

# 계수 및 절편 확인: _속성은 학습을 통해 결정되는 속성
print(reg[1].coef_, reg[1].intercept_)

#회귀식 - pipeline()을 사용했기 때문에 SGDRegressor의 parameter가 reg객체의 1번 인덱스에 들어감
print("y = {:2f}X + {:.3f}".format(reg[1].coef_[0], reg[1].intercept_[0]))

# 예측 수행
y_pred = reg.predict(X_test)
y_pred[y_pred < 0] = 0.
from sklearn.metrics import mean_squared_error, r2_score
# MSE, RMSE, r2_score
evaluate_score(y_test, y_pred)

[ 1.04006502  0.83002511  0.55504241  1.47522726  0.40558766  0.4923563
  0.49620876 -0.25926881 -2.00156673  0.41198691  1.92753735  0.18129988
 -0.64930429  2.88390706  0.53224187  0.04601305  0.27681239  0.64971959
  3.79455763  2.25128448] [17.69230148]
y = 1.040065X + 17.692
mse - 474.6706
rmse - 21.7869
r2 - 0.1663
