# 머신러닝 실습 

## Ch2. Linear regression

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# Load scikit-learn's diabetes dataset and assemble a single DataFrame:
# one column per feature plus the regression target stored as 'Y'.
diabetes = load_diabetes()
diabetes_DF = pd.DataFrame(
    data=diabetes.data,
    columns=diabetes.feature_names,
).assign(Y=diabetes.target)
diabetes_DF.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [9]:
# Summarise column dtypes and non-null counts to check for missing values.
diabetes_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  Y       442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [10]:
# Separate the features from the target, then hold out 40% of the rows for testing.
X_data = diabetes_DF.drop(columns=['Y'])
y_target = diabetes_DF['Y']
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.4,
    random_state=123,  # fixed seed keeps the split reproducible
)

In [11]:
# Ordinary least-squares fit, using the training split only.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [12]:
# Fitted intercept (bias term) of the linear model.
lr.intercept_

151.71551041484278

In [13]:
# Fitted coefficients, one per feature column, rounded to one decimal for display.
lr.coef_.round(decimals=1)

array([ -11.1, -291.1,  553.8,  296.6, -915. ,  528.4,  210.2,  339.6,
        640.6,  115.7])

In [14]:
# Label each rounded coefficient with its feature name and rank them, largest first.
coeff = pd.Series(lr.coef_.round(decimals=1), index=X_data.columns)
coeff.sort_values(ascending=False)

s5     640.6
bmi    553.8
s2     528.4
s4     339.6
bp     296.6
s3     210.2
s6     115.7
age    -11.1
sex   -291.1
s1    -915.0
dtype: float64

In [15]:
# Evaluate on the held-out test split. RMSE is the square root of the MSE,
# which puts the error back in the units of the target.
y_preds = lr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_preds)
rmse = np.sqrt(mse)
rmse

55.09404732888505

In [16]:
# Coefficient of determination (R^2) on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_preds)
r2

0.4933408690435077

In [17]:
# Same RMSE computation on the training split, for comparison with the test error.
y_train_preds = lr.predict(X_train)
mse_train = mean_squared_error(y_true=y_train, y_pred=y_train_preds)
rmse_train = np.sqrt(mse_train)
rmse_train

52.9486429330168

In [18]:
# R^2 on the training split, for comparison with the test-split score.
r2_train = r2_score(y_true=y_train, y_pred=y_train_preds)
r2_train

0.5237974491641986

In [19]:
from sklearn.model_selection import KFold

# 5-fold cross-validation with shuffling. random_state pins the shuffle so the
# folds (and every score derived from them) are identical on each run.
kf = KFold(n_splits=5, shuffle=True, random_state=123)
# split() returns a one-shot generator; materialise it as a list of
# (train_indices, test_indices) pairs so the folds can be iterated more than once.
kfid = list(kf.split(X_data))

In [20]:
# Fit and score a fresh model on each fold, collecting the per-fold test MSE.
# Fold-local names (fold_lr, fold_preds) are used so that the model, predictions
# and MSE from the train/test evaluation (lr, y_preds, mse) are not overwritten.
kf_mse = []
# split() is called directly so the loop gets a fresh iterator every time this
# cell is run; an already-consumed generator would silently yield no folds.
for train_i, test_i in kf.split(X_data):
    X_trn, X_tst = X_data.iloc[train_i], X_data.iloc[test_i]
    y_trn, y_tst = y_target.iloc[train_i], y_target.iloc[test_i]
    fold_lr = LinearRegression()
    fold_lr.fit(X_trn, y_trn)
    fold_preds = fold_lr.predict(X_tst)
    kf_mse.append(mean_squared_error(y_tst, fold_preds))
kf_mse

[2473.5516319387098,
 3233.267125885358,
 3553.038452271323,
 2979.5920996281343,
 2621.972092449679]

In [21]:
# Convert each fold's MSE to an RMSE, then average across the folds.
kf_rmse = np.sqrt(np.asarray(kf_mse))
kf_rmse.mean()

54.398968609104465

In [22]:
from sklearn.model_selection import cross_val_score

# The 'neg_mean_squared_error' scorer reports MSE with its sign flipped, so
# negate the scores before taking the square root to obtain per-fold RMSE.
neg_mse_scores = cross_val_score(
    estimator=lr,
    X=X_data,
    y=y_target,
    scoring='neg_mean_squared_error',
    cv=5,
)
rmse_scores = np.sqrt(-neg_mse_scores)
rmse_scores

array([52.72497937, 55.03486476, 56.90068179, 54.85204179, 53.94638716])

In [23]:
# Average RMSE across the cross-validation folds.
rmse_scores.mean()

54.69179097275793

In [24]:
from sklearn.preprocessing import StandardScaler

# Standardise the raw feature matrix and preview the first three scaled rows.
dX = diabetes['data']
dy = diabetes['target']
scaler = StandardScaler()
diabetes_X_scaled = scaler.fit_transform(dX)
diabetes_X_scaled[:3].round(decimals=2)

array([[ 0.8 ,  1.07,  1.3 ,  0.46, -0.93, -0.73, -0.91, -0.05,  0.42,
        -0.37],
       [-0.04, -0.94, -1.08, -0.55, -0.18, -0.4 ,  1.56, -0.83, -1.44,
        -1.94],
       [ 1.79,  1.07,  0.93, -0.12, -0.96, -0.72, -0.68, -0.05,  0.06,
        -0.55]])

In [25]:
from sklearn.linear_model import SGDRegressor

# Linear regression fitted by stochastic gradient descent with no regularisation
# penalty. SGD shuffles the samples between epochs, so random_state is fixed to
# make the fitted intercept and coefficients reproducible from run to run.
sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=123)
sgd_reg.fit(diabetes_X_scaled, dy)
print(sgd_reg.intercept_, np.round(sgd_reg.coef_, decimals=1), sep="\n")

[153.41928257]
[ -0.2 -12.2  23.5  14.  -26.7  14.2   2.1   5.7  38.2   2.7]
