In [None]:
# Show CPU information for the machine running the kernel by shelling out to `lscpu`.
!lscpu

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

In [None]:
# Load the UCI housing data. The file has no header row and its fields are
# separated by runs of whitespace, so the column names are supplied here.
column_names = [
    "CRIM",
    "ZN",
    "INDUS",
    "CHAS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "RAD",
    "TAX",
    "PTRATIO",
    "B",
    "LSTAT",
    "MEDV",
]
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    sep=r"\s+",
    names=column_names,
)
df.head()

In [None]:
# Shape of the data as (rows, columns)
df.shape

In [None]:
# Check for missing data: number of null values in each column
df.isnull().sum()

In [None]:
# Overview of the data (columns, dtypes, non-null counts)
df.info()

In [None]:
# Single-variable EDA: summary statistics of the MEDV column
df["MEDV"].describe()

In [None]:
# Histogram of house prices (MEDV) with 30 bins
df["MEDV"].hist(bins=30)

In [None]:
# Two-variable EDA: heatmap of the pairwise correlation matrix of all columns.
df_corr = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
    data=df_corr,
    annot=True,  # write the coefficient inside each cell
    cmap="Blues",
    center=0,
    vmin=-1,  # pin the colour scale to the full range of a correlation coefficient
    vmax=1,
)

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the columns LSTAT, RM and MEDV.
num_cols = ["LSTAT", "RM", "MEDV"]
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was renamed in seaborn 0.9 and is rejected by later releases.
sns.pairplot(df[num_cols], height=2.5)

#### 回帰の評価指標
$$
\begin{aligned}
\text{予測値} &= \hat{y}_i \\
\text{平均値} &= \bar{y} = \frac{1}{n} \sum_{i=1}^{n}y_i
\end{aligned}
$$

平均二乗誤差
$$
MSE = \frac{1}{n} \sum_{i=1}^{n}(y_i - \hat{y}_i)^2
$$

二乗平均平方根誤差
$$
RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n}(y_i - \hat{y}_i)^2}
 = \sqrt{MSE}
$$

R2（決定係数）
$$
R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}
= 1 - \frac{SS_{res}}{SS_{tot}}
$$

平均絶対誤差
$$
MAE = \frac{1}{n} \sum_{i=1}^{n}|y_i - \hat{y}_i|
$$

In [None]:
from random import seed

import sklearn.metrics as metrics

# Build a synthetic "prediction" frame: a copy of df whose MEDV column has
# random error added, so the regression metrics can be demonstrated on it.
df_pred = df.copy()
# Fix the seed so the generated noise is reproducible. np.random.seed returns
# None, so it is called as a statement rather than assigned to a variable.
np.random.seed(42)
# Uniform noise drawn from [0, 10), one value per entry of the MEDV column.
r = np.random.uniform(0, 10, df_pred["MEDV"].shape)
df_pred["MEDV"] = df_pred["MEDV"] + r

In [None]:
# Compute the MSE, treating the original MEDV as truth and the noisy copy as the prediction.
mse = metrics.mean_squared_error(
    y_true=df["MEDV"],
    y_pred=df_pred["MEDV"],
)
mse

In [None]:
# Square root of the MSE, i.e. the RMSE derived by hand from `mse`
np.sqrt(mse)

In [None]:
# Compute the RMSE directly with scikit-learn.
# NOTE(review): root_mean_squared_error is only present in recent scikit-learn
# releases (believed 1.4+) — confirm the environment provides it.
rmse = metrics.root_mean_squared_error(
    y_true=df["MEDV"],
    y_pred=df_pred["MEDV"],
)
rmse

In [None]:
# Compute R2 (coefficient of determination) of the noisy MEDV against the original MEDV.
r2 = metrics.r2_score(
    y_true=df["MEDV"],
    y_pred=df_pred["MEDV"],
)
r2

In [None]:
# Compute the MAE (mean absolute error) of the noisy MEDV against the original MEDV.
mae = metrics.mean_absolute_error(
    y_true=df["MEDV"],
    y_pred=df_pred["MEDV"],
)
mae