In [5]:
import numpy as np
from sklearn.datasets import load_linnerud
import pandas as pd

import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

In [6]:
# Dataset details: load the Linnerud bunch and print its bundled description.
linnerud = load_linnerud()
print(linnerud.DESCR)
# The former trailing `linnerud.frame` expression has been removed: the `frame`
# attribute is only populated when the loader is called with as_frame=True, so
# with the default call above it is None and the cell displayed nothing for it.
# The feature and target tables are built explicitly with pandas further down.

.. _linnerrud_dataset:

Linnerrud dataset
-----------------

**Data Set Characteristics:**

    :Number of Instances: 20
    :Number of Attributes: 3
    :Missing Attribute Values: None

The Linnerud dataset is a multi-output regression dataset. It consists of three
excercise (data) and three physiological (target) variables collected from
twenty middle-aged men in a fitness club:

- *physiological* - CSV containing 20 observations on 3 physiological variables:
   Weight, Waist and Pulse.
- *exercise* - CSV containing 20 observations on 3 exercise variables:
   Chins, Situps and Jumps.

.. topic:: References

  * Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris:
    Editions Technic.



In [7]:
"""
説明変数
Chins　懸垂
Situps　上体起こし
Jumps　ジャンプ回数
"""
# Wrap the exercise measurements in a DataFrame labelled with the dataset's
# own feature names, then preview it.
linnerudFrame = pd.DataFrame(data=linnerud.data, columns=linnerud.feature_names)
linnerudFrame.head()  # head() defaults to the first five rows

Unnamed: 0,Chins,Situps,Jumps
0,5.0,162.0,60.0
1,2.0,110.0,60.0
2,12.0,101.0,101.0
3,12.0,105.0,37.0
4,13.0,155.0,58.0


In [8]:
"""
target変数
Weight　体重　単位はおそらくポンド
Waist　ウエスト　おそらくインチ
Pulse　心拍数
"""
# Wrap the physiological measurements in a DataFrame labelled with the
# dataset's own target names, then preview it.
linnerudTarget = pd.DataFrame(data=linnerud.target, columns=linnerud.target_names)
linnerudTarget.head()  # head() defaults to the first five rows

Unnamed: 0,Weight,Waist,Pulse
0,191.0,36.0,50.0
1,189.0,37.0,52.0
2,193.0,38.0,58.0
3,162.0,35.0,62.0
4,189.0,35.0,46.0


In [9]:
# Try a multiple regression that predicts the body measurements from the three exercise variables

In [10]:
# Estimate weight

In [11]:
# Explanatory variables for the weight model: all three exercise columns.
x1 = linnerudFrame.loc[:, ["Chins", "Situps", "Jumps"]]
x1.head()

Unnamed: 0,Chins,Situps,Jumps
0,5.0,162.0,60.0
1,2.0,110.0,60.0
2,12.0,101.0,101.0
3,12.0,105.0,37.0
4,13.0,155.0,58.0


In [12]:
# Target for the weight model, kept as a one-column DataFrame.
y1 = linnerudTarget.loc[:, ["Weight"]]
y1.head()

Unnamed: 0,Weight
0,191.0
1,189.0
2,193.0
3,162.0
4,189.0


In [13]:
# Min-max scale every column to the 0-1 range using that column's own minimum
# and maximum (frame-level arithmetic broadcasts the per-column statistics).
# NOTE(review): the min/max are taken over all rows, i.e. before the train/test
# split that follows - confirm this is intended.
x1Scaled = (x1 - x1.min()) / (x1.max() - x1.min())
y1Scaled = (y1 - y1.min()) / (y1.max() - y1.min())

In [14]:
# Preview the scaled features (first five rows, the default for head()).
x1Scaled.head()

Unnamed: 0,Chins,Situps,Jumps
0,0.25,0.557214,0.155556
1,0.0625,0.298507,0.155556
2,0.6875,0.253731,0.337778
3,0.6875,0.273632,0.053333
4,0.75,0.522388,0.146667


In [15]:
# Preview the scaled target (first five rows, the default for head()).
y1Scaled.head()

Unnamed: 0,Weight
0,0.486239
1,0.46789
2,0.504587
3,0.220183
4,0.46789


In [16]:
# Split: hold out 20% of the rows as the test set; random_state is fixed so the
# same rows are selected on every run.
x1Train, x1Test, y1Train, y1Test = train_test_split(
    x1Scaled,
    y1Scaled,
    test_size=0.2,
    random_state=0,
)

In [17]:
# Print the (rows, columns) shape of each piece of the split, in the same
# order as before: train features, test features, train target, test target.
for split in (x1Train, x1Test, y1Train, y1Test):
    print(split.shape)

(16, 3)
(4, 3)
(16, 1)
(4, 1)


In [18]:
# Train a multiple linear regression on the training split.
modelForWeight = LinearRegression()
modelForWeight.fit(X=x1Train, y=y1Train)

LinearRegression()

In [19]:
# Predict on the held-out test rows, then invert the min-max scaling
# (scaled * range + min) so the predictions are on the original Weight scale.
yPredict = modelForWeight.predict(x1Test)
yUnscaled = y1.values.min() + yPredict * (y1.values.max() - y1.values.min())
yUnscaled

array([[153.63090772],
       [203.26736591],
       [200.86145905],
       [153.90911951]])

In [20]:
# Ground-truth weights for the same test rows, mapped back to the original
# scale with the inverse of the min-max transform (scaled * range + min).
# The whole test split is used, with no row slice, so the truth always has
# exactly as many rows as the predictions it is compared against.
y1TestUnscaled = y1Test*(np.max(y1.values) - np.min(y1.values)) + np.min(y1.values)
y1TestUnscaled

Unnamed: 0,Weight
18,156.0
1,189.0
19,138.0
8,176.0


In [21]:
# Root-mean-squared error of the weight predictions, on the original scale.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the ground
# truth is passed first and the prediction second; np.sqrt turns MSE into RMSE.
print("RMSE...", np.sqrt(mean_squared_error(y1TestUnscaled, yUnscaled)))

RMSE... 34.09083587423488


In [22]:
# Inspect the fitted coefficients, one per feature column in the order
# Chins, Situps, Jumps. The model was fit on min-max scaled features and
# target, so the values are expressed in scaled units.
modelForWeight.coef_

array([[-0.32421354, -0.36500874,  0.29213656]])

In [34]:
"""
懸垂回数、上体起こしが減少…体重の増加
ジャンプ回数の増加…体重の増加
"""

'\n懸垂回数、上体起こしが減少…体重の増加\nジャンプ回数の増加…体重の増加\n'

In [23]:
# Next, estimate waist

In [24]:
# Waist model inputs: the same three exercise columns as features, and the
# Waist column (kept as a one-column DataFrame) as the target.
x2 = linnerudFrame.loc[:, ["Chins", "Situps", "Jumps"]]
y2 = linnerudTarget.loc[:, ["Waist"]]
y2

Unnamed: 0,Waist
0,36.0
1,37.0
2,38.0
3,35.0
4,35.0
5,36.0
6,38.0
7,34.0
8,31.0
9,33.0


In [25]:
# Min-max scale every column to the 0-1 range using that column's own minimum
# and maximum (frame-level arithmetic broadcasts the per-column statistics).
# NOTE(review): the min/max are taken over all rows, i.e. before the train/test
# split that follows - confirm this is intended.
x2Scaled = (x2 - x2.min()) / (x2.max() - x2.min())
y2Scaled = (y2 - y2.min()) / (y2.max() - y2.min())

In [26]:
# Hold out 20% of the rows as the test set; random_state is fixed so the same
# rows are selected on every run.
x2Train, x2Test, y2Train, y2Test = train_test_split(
    x2Scaled,
    y2Scaled,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Training: fit a multiple linear regression on the waist training split.
modelForWaist = LinearRegression()
modelForWaist.fit(X=x2Train, y=y2Train)

LinearRegression()

In [28]:
# Estimation: predict waist for the held-out rows, then invert the min-max
# scaling (scaled * range + min) to get back to the original Waist scale.
# NOTE: `yPredict` is the name that also held the weight predictions, so this
# assignment overwrites them. The spelling `yPerdictUnScaled` is kept as-is
# because the RMSE cell refers to it by that name.
yPredict = modelForWaist.predict(x2Test)
yPerdictUnScaled = y2.values.min() + yPredict * (y2.values.max() - y2.values.min())
yPerdictUnScaled

array([[31.11656185],
       [38.83382692],
       [38.31456627],
       [31.07098739]])

In [29]:
# Ground truth: the test-split waist values mapped back to the original scale
# with the inverse of the min-max transform (scaled * range + min).
y2TestUnScaled = y2.values.min() + y2Test * (y2.values.max() - y2.values.min())
y2TestUnScaled

Unnamed: 0,Waist
18,33.0
1,37.0
19,33.0
8,31.0


In [30]:
# Root-mean-squared error of the waist predictions, on the original scale.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the ground
# truth is passed first and the prediction second; np.sqrt turns MSE into RMSE.
print("RMSE...", np.sqrt(mean_squared_error(y2TestUnScaled, yPerdictUnScaled)))

RMSE... 2.9647898003042887


In [32]:
# Regression coefficients of the waist model, one per feature column in the
# order Chins, Situps, Jumps. The model was fit on min-max scaled features and
# target, so the values are expressed in scaled units.
modelForWaist.coef_

array([[-0.30254746, -0.51584752,  0.45817116]])

In [33]:
"""
懸垂回数、上体起こしが減少…ウエストの増加
ジャンプ回数の増加…ウエストの増加
"""

'\n懸垂回数、上体起こしが減少…ウエストの増加\nジャンプ回数の増加…ウエストの増加\n'