In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_linnerud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

In [34]:
t = load_linnerud()
print(t.DESCR)
print(t['data'])
print(t['target'])

.. _linnerrud_dataset:

Linnerrud dataset
-----------------

**Data Set Characteristics:**

    :Number of Instances: 20
    :Number of Attributes: 3
    :Missing Attribute Values: None

The Linnerud dataset is a multi-output regression dataset. It consists of three
exercise (data) and three physiological (target) variables collected from
twenty middle-aged men in a fitness club:

- *physiological* - CSV containing 20 observations on 3 physiological variables:
   Weight, Waist and Pulse.
- *exercise* - CSV containing 20 observations on 3 exercise variables:
   Chins, Situps and Jumps.

.. topic:: References

  * Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris:
    Editions Technic.

[[  5. 162.  60.]
 [  2. 110.  60.]
 [ 12. 101. 101.]
 [ 12. 105.  37.]
 [ 13. 155.  58.]
 [  4. 101.  42.]
 [  8. 101.  38.]
 [  6. 125.  40.]
 [ 15. 200.  40.]
 [ 17. 251. 250.]
 [ 17. 120.  38.]
 [ 13. 210. 115.]
 [ 14. 215. 105.]
 [  1.  50.  50.]
 [  6.  70.  31.]
 [ 12. 210. 120.]


In [35]:
# 몸무게, 키, 심박수로 situp개수를 예측하기 위해 target이랑 데이터 반대로 받아옴

y, X = load_linnerud(return_X_y = True)
print(X) # weight, waist, pulse
print(y.shape) # chins, situps, jumps
y = y[:, 1] # situps, 1차원 배열

[[191.  36.  50.]
 [189.  37.  52.]
 [193.  38.  58.]
 [162.  35.  62.]
 [189.  35.  46.]
 [182.  36.  56.]
 [211.  38.  56.]
 [167.  34.  60.]
 [176.  31.  74.]
 [154.  33.  56.]
 [169.  34.  50.]
 [166.  33.  52.]
 [154.  34.  64.]
 [247.  46.  50.]
 [193.  36.  46.]
 [202.  37.  62.]
 [176.  37.  54.]
 [157.  32.  52.]
 [156.  33.  54.]
 [138.  33.  68.]]
(20, 3)


In [36]:
# weight, waist, pulse를 이용해서 situps 갯수를 예측하는 회귀 모델
# target = situps
# LinearRegresssioni, SGDRegressor 사용, 각각 coef, intercept, r2 값 확인

# independent value --weight, waist, pulse
# dependent value --situp

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)
m = LinearRegression()
m.fit(X_train, y_train) # 머신러닝이 끝나면 공식이 완성
print(m.coef_, m.intercept_)

train_score = m.score(X_train, y_train)
print(train_score)

train_r2 = r2_score(y_train, m.predict(X_train))
print(train_r2)

# 보통 검증을 할때는 test data를 사용하여 검증
test_score = m.score(X_test, y_test)
print(test_score)
test_r2 = r2_score(y_test, m.predict(X_test))
print(test_r2)

# train score는 괜찮은 데, test데이터를 넣었을때 지금과 같이 error값이 더 높게 나오는 현상이 '과적합'이다.

[  0.60388495 -16.73705076   0.24510045] 628.1309484144331
0.48951928525028654
0.48951928525028654
-20.940403953059818
-20.940403953059818


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)
m = SGDRegressor(max_iter =10000)
m.fit(X_train, y_train) # 머신러닝이 끝나면 공식이 완성
print(m.coef_, m.intercept_)


train_score = m.score(X_train, y_train)
print(train_score)

train_r2 = r2_score(y_train, m.predict(X_train))
print(train_r2)

# 보통 검증을 할때는 test data를 사용하여 검증
test_score = m.score(X_test, y_test)
print(test_score)
test_r2 = r2_score(y_test, m.predict(X_test))
print(test_r2)

[-1.80050315e+11 -3.41914897e+10  1.15781885e+11] [-5.68294058e+09]
-1.7184645514855832e+23
-1.7184645514855832e+23
-7.096955993550115e+24
-7.096955993550115e+24


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)


for m in [LinearRegression(), SGDRegressor(max_iter=1000)]:
    m.fit(X_train, y_train)
    print(m.coef_, m.intercept_)


    train_score = m.score(X_train, y_train)
    print(train_score)

    train_r2 = r2_score(y_train, m.predict(X_train))
    print(train_r2)

    # 보통 검증을 할때는 test data를 사용하여 검증
    test_score = m.score(X_test, y_test)
    print(test_score)
    test_r2 = r2_score(y_test, m.predict(X_test))
    print(test_r2)
    print('=========================================')

[ 0.49402866 -0.27433127  0.05463569] 212.7887714807742
0.3139985280076171
0.3139985280076171
-0.32991213695065946
-0.32991213695065946
[ 1.36976865e+11 -6.25397908e+11  4.50623094e+11] [1.78933491e+10]
-7.027556157787524e+24
-7.027556157787524e+24
-8.172981665256036e+24
-8.172981665256036e+24


In [47]:
#chins, situps, jumps를 이용해 weight 예측 모델

X, y = load_linnerud(return_X_y = True)
y = y[:, 0]

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)

for m in [LinearRegression(), SGDRegressor(max_iter=1000)]:
    m.fit(X_train, y_train)
    print(m.coef_, m.intercept_)
    print('train score : ', m.score(X_train , y_train))
    print('test score : ', m.score(X_test , y_test))
    print('=================')

[ 0.49402866 -0.27433127  0.05463569] 212.7887714807742
train score :  0.3139985280076171
test score :  -0.32991213695065946
[-2.32298626e+11  4.62421408e+11  2.85362473e+11] [1.98084567e+10]
train score :  -1.5622950268458486e+25
test score :  -1.159072785503414e+25


***smaple data가 너무 작아서 값이 이렇게 안좋게 나온다.***
- 상관도가 있는지 없는지 scatter plot을 이용하면 된다.

#### overfit, underfit, bias, variance

#### overfit : low bias, high variance
#### underfit : high bias, low variance
#### bias는 실제데이터와 예측값하고의 차이

#### 과적합을 막는 방법은 ridge(L2-norm), Lasso(L1-norm)
#### alpha값을가진다. alpha는 노이즈를 얼마나 줄것인지를 결정하는 변수이다.

In [43]:
# m = Regid(alpha=0.1) 
m = Lasso(alpha=0.1)
m.fit(X_train, y_train) # 머신러닝이 끝나면 공식이 완성
print(m.coef_, m.intercept_)


train_score = m.score(X_train, y_train)
print(train_score)

train_r2 = r2_score(y_train, m.predict(X_train))
print(train_r2)

# 보통 검증을 할때는 test data를 사용하여 검증
test_score = m.score(X_test, y_test)
print(test_score)
test_r2 = r2_score(y_test, m.predict(X_test))
print(test_r2)

[ 0.48264555 -0.27360712  0.05471788] 212.7760120253623
0.31399672117445876
0.31399672117445876
-0.32520242685054224
-0.32520242685054224


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)

for m in [LinearRegression(), SGDRegressor(max_iter=1000), Ridge(alpha = 0.1), Lasso(alpha = 0.1)]:
    m.fit(X_train, y_train)
    print(m.coef_, m.intercept_)
    print('train score : ', m.score(X_train , y_train))
    print('test score : ', m.score(X_test , y_test))
    print('=================')

[ 0.49402866 -0.27433127  0.05463569] 212.7887714807742
train score :  0.3139985280076171
test score :  -0.32991213695065946
[-2.28310451e+10  3.60982745e+11 -4.33087514e+10] [2.74915887e+10]
train score :  -4.969871801830837e+24
test score :  -4.6560884676969315e+24
[ 0.49365994 -0.27430916  0.05464038] 212.78840575465998
train score :  0.31399852613283485
test score :  -0.32975689230914385
[ 0.48264555 -0.27360712  0.05471788] 212.7760120253623
train score :  0.31399672117445876
test score :  -0.32520242685054224
