In [None]:
# Print the host's CPU details by running the `lscpu` shell command (IPython `!` escape).
!lscpu

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import graphviz
from sklearn.metrics import mean_squared_error

In [None]:
# Load the housing table from the UCI repository into `df`.
# The file is read without a header row and split on runs of whitespace, so
# the column names are assigned explicitly afterwards.
# The separator is a raw string: in a plain string literal "\s" is an invalid
# escape sequence, which Python reports with a warning at compile time.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    sep=r"\s+",
)
df.columns = [
    "CRIM",
    "ZN",
    "INDUS",
    "CHAS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "RAD",
    "TAX",
    "PTRATIO",
    "B",
    "LSTAT",
    "MEDV",
]
# Show the first rows as a quick sanity check of the parse.
df.head()

In [None]:
# Training subset: every row up to and including index label 99 (.loc slicing
# includes the end label). RM stays a one-column DataFrame for the features,
# MEDV is taken as a Series for the target.
X_train, y_train = df.loc[:99, ["RM"]], df.loc[:99, "MEDV"]

# Preview the first three rows of each.
print("X_train:", X_train.head(3))
print("y_train:", y_train.head(3))

In [None]:
# Regression tree: define and train the model.
from sklearn.tree import DecisionTreeRegressor

# Depth-1 tree fitted on the training data. fit() returns the estimator
# itself, so construction and fitting are chained into one expression.
model = DecisionTreeRegressor(
    criterion="squared_error",
    max_depth=1,
    min_samples_leaf=1,
    random_state=0,
).fit(X_train, y_train)

# Display the full parameter set the estimator is using.
model.get_params()

In [None]:
# Predicted values for the training inputs
model.predict(X_train)

In [None]:
# Visualise the fitted tree.
import pydot
from IPython import display
from sklearn import tree

# Export the tree as Graphviz DOT text (out_file=None returns it as a string).
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=["RM"],
    filled=True,
    rounded=True,
)

# Rendering goes through pydot rather than
# graphviz.Source(dot_data, format="png"): parse the DOT text, render the
# first graph to PNG bytes with the `dot` program, and show the image inline.
graphs = pydot.graph_from_dot_data(dot_data)
png_data = graphs[0].create_png(prog="dot")
display.display(display.Image(png_data))

In [None]:
# Plot the training data together with the fitted tree's predictions.
plt.figure(figsize=(8, 4))
X = X_train.values.flatten()
y = y_train.values

# Evaluation grid over the observed RM range in steps of 0.01, shaped as a
# single-column 2-D array.
X_plt = np.arange(X.min(), X.max(), 0.01)[:, np.newaxis]
# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names; a bare array triggers scikit-learn's
# "X does not have valid feature names" warning.
y_pred = model.predict(pd.DataFrame(X_plt, columns=X_train.columns))

plt.scatter(X, y, color="blue", label="data")
# Label the curve with the estimator that actually produced it.
plt.plot(X_plt, y_pred, color="red", label="DecisionTreeRegressor")
# The target column is named MEDV.
plt.ylabel("Price in $1000s [MEDV]")
plt.xlabel("average number of rooms [RM]")
plt.title("Boston house-prices")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Regression tree of depth 2

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [None]:
# Load the housing table from the UCI repository into `df`.
# The file is read without a header row and split on runs of whitespace, so
# the column names are assigned explicitly afterwards.
# The separator is a raw string: in a plain string literal "\s" is an invalid
# escape sequence, which Python reports with a warning at compile time.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    sep=r"\s+",
)
df.columns = [
    "CRIM",
    "ZN",
    "INDUS",
    "CHAS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "RAD",
    "TAX",
    "PTRATIO",
    "B",
    "LSTAT",
    "MEDV",
]
# Show the first rows as a quick sanity check of the parse.
df.head()

In [None]:
# Training subset: every row up to and including index label 99 (.loc slicing
# includes the end label). RM stays a one-column DataFrame for the features,
# MEDV is taken as a Series for the target.
X_train, y_train = df.loc[:99, ["RM"]], df.loc[:99, "MEDV"]

# Preview the first three rows of each.
print("X_train:", X_train.head(3))
print("y_train:", y_train.head(3))

In [None]:
# Regression tree: define and train the model.
from sklearn.tree import DecisionTreeRegressor

# Depth-2 tree with cost-complexity pruning switched off (ccp_alpha=0).
# fit() returns the estimator itself, so construction and fitting are chained.
model = DecisionTreeRegressor(
    criterion="squared_error",
    max_depth=2,
    min_samples_leaf=1,
    ccp_alpha=0,
    random_state=0,
).fit(X_train, y_train)

# Display the full parameter set the estimator is using.
model.get_params()

In [None]:
# Predicted values for the training inputs
model.predict(X_train)

In [None]:
# Visualise the fitted tree.
import pydot
from IPython import display
from sklearn import tree

# Export the tree as Graphviz DOT text (out_file=None returns it as a string).
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=["RM"],
    filled=True,
    rounded=True,
)

# Rendering goes through pydot rather than
# graphviz.Source(dot_data, format="png"): parse the DOT text, render the
# first graph to PNG bytes with the `dot` program, and show the image inline.
graphs = pydot.graph_from_dot_data(dot_data)
png_data = graphs[0].create_png(prog="dot")
display.display(display.Image(png_data))

In [None]:
# Plot the training data together with the fitted tree's predictions.
plt.figure(figsize=(8, 4))
X = X_train.values.flatten()
y = y_train.values

# Evaluation grid over the observed RM range in steps of 0.01, shaped as a
# single-column 2-D array.
X_plt = np.arange(X.min(), X.max(), 0.01)[:, np.newaxis]
# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names; a bare array triggers scikit-learn's
# "X does not have valid feature names" warning.
y_pred = model.predict(pd.DataFrame(X_plt, columns=X_train.columns))

plt.scatter(X, y, color="blue", label="data")
# Label the curve with the estimator that actually produced it.
plt.plot(X_plt, y_pred, color="red", label="DecisionTreeRegressor")
# The target column is named MEDV.
plt.ylabel("Price in $1000s [MEDV]")
plt.xlabel("average number of rooms [RM]")
plt.title("Boston house-prices")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Regularising the regression tree

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [None]:
# Load the housing table from the UCI repository into `df`.
# The file is read without a header row and split on runs of whitespace, so
# the column names are assigned explicitly afterwards.
# The separator is a raw string: in a plain string literal "\s" is an invalid
# escape sequence, which Python reports with a warning at compile time.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    sep=r"\s+",
)
df.columns = [
    "CRIM",
    "ZN",
    "INDUS",
    "CHAS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "RAD",
    "TAX",
    "PTRATIO",
    "B",
    "LSTAT",
    "MEDV",
]
# Show the first rows as a quick sanity check of the parse.
df.head()

In [None]:
# Separate the MEDV target from the features (every other column).
y = df["MEDV"]
X = df.drop(columns=["MEDV"])
X.head()

In [None]:
# Split into training data and test data.
# NOTE(review): `test` is not referenced in any visible cell; this import
# looks accidental -- confirm and remove.
from numpy import test

# Shuffled split holding out 20% for testing; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

# Report the shape of each of the four parts.
print("X_trainの形状:", X_train.shape)
print("y_trainの形状:", y_train.shape)
print("X_testの形状:", X_test.shape)
print("y_testの形状:", y_test.shape)

In [None]:
# Regression tree: define and train the model.
from sklearn.tree import DecisionTreeRegressor

# Regularised tree: depth capped at 4, at least 10 samples per leaf, and
# cost-complexity pruning with ccp_alpha=5. fit() returns the estimator
# itself, so construction and fitting are chained.
model = DecisionTreeRegressor(
    criterion="squared_error",
    max_depth=4,
    min_samples_leaf=10,
    ccp_alpha=5,
    random_state=0,
).fit(X_train, y_train)

# Display the full parameter set the estimator is using.
model.get_params()

In [None]:
# Predict on the held-out test data and report the root-mean-squared error,
# formatted to two decimal places.
y_test_pred = model.predict(X_test)
print("RMSE test: %.2f" % (root_mean_squared_error(y_test, y_test_pred),))

In [None]:
# Predicted values for the test inputs
model.predict(X_test)

In [None]:
# Visualise the fitted tree.
import pydot
from IPython import display
from sklearn import tree

# Export the tree as Graphviz DOT text (out_file=None returns it as a string),
# labelling the split nodes with the feature table's column names.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=X.columns,
    filled=True,
    rounded=True,
)

# Rendering goes through pydot rather than
# graphviz.Source(dot_data, format="png"): parse the DOT text, render the
# first graph to PNG bytes with the `dot` program, and show the image inline.
graphs = pydot.graph_from_dot_data(dot_data)
png_data = graphs[0].create_png(prog="dot")
display.display(display.Image(png_data))

In [None]:
# Bar chart of the fitted tree's feature importances, largest first.
importances = model.feature_importances_
# argsort is ascending; reversing it orders the features by descending importance.
indices = np.argsort(importances)[::-1]

bar_slots = range(len(indices))
plt.figure(figsize=(8, 4))
plt.bar(bar_slots, importances[indices])
# Tick labels are the column names in the same sorted order, rotated to fit.
plt.xticks(bar_slots, X.columns[indices], rotation=90)
plt.title("Feature importances")
plt.show()