In [None]:
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run.
boston = load_boston()
# Target values as returned by the loader.
y = boston.target
# Feature matrix wrapped in a DataFrame so columns can be addressed by name.
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)

# Data Check

In [None]:
# Preview the first five rows of the feature table.
X.head(n=5)

In [None]:
def draw_histogram(x, title=None, xlabel=None):
    """Plot a histogram of ``x`` with pyplot and show it.

    Args:
        x: Values passed straight to ``plt.hist``.
        title: Text passed to ``plt.title`` (default ``None``).
        xlabel: Text passed to ``plt.xlabel`` (default ``None``).
    """
    bar_style = {"color": "blue", "rwidth": 0.9}
    plt.hist(x, **bar_style)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.show()

In [None]:
def draw_scatters(X, y, ncols=3):
    """Draw one scatter plot of ``y`` against each column of ``X`` in a grid.

    Args:
        X: Table whose ``columns`` attribute lists the features and whose
            ``X[col]`` lookup yields the x-values for that feature.
        y: Values plotted on the vertical axis of every panel.
        ncols: Number of panels per grid row (default 3).

    Raises:
        ValueError: If ``X`` has no columns or ``ncols`` is less than 1.
    """
    cols = X.columns
    n_panels = len(cols)
    if n_panels == 0 or ncols < 1:
        raise ValueError("draw_scatters needs at least one column and ncols >= 1")

    # Ceiling division by the *requested* column count, so the grid always has
    # room for every feature regardless of ncols.
    nrows = -(-n_panels // ncols)
    # squeeze=False keeps ``axs`` a 2-D array even for a 1x1 grid, so ravel() works.
    fig, axs = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(n_panels, n_panels * 1.5),
        squeeze=False,
    )
    fig.suptitle('relationship between y and X')

    panels = axs.ravel()
    for ax, col in zip(panels, cols):
        ax.set_xlabel(col)
        ax.set_ylabel('y')
        ax.scatter(x=X[col], y=y, marker="o", color="brown")

    # Hide grid cells left over when the features do not fill the last row.
    for ax in panels[n_panels:]:
        ax.set_visible(False)

    # pyplot-level show, matching the other plotting cells in this notebook.
    plt.show()

In [None]:
# Distribution of the regression target.
draw_histogram(y, xlabel="MEDV", title="y distribution")

In [None]:
# Target plotted against each feature, one panel per column.
draw_scatters(X=X, y=y)

In [None]:
# Pairwise relationships between the feature columns.
sns.pairplot(data=X)
plt.show()

# Interpreting the Model by Its Coefficients

In [None]:
N_FOLD = 5          # n_splits handed to KFold
RANDOM_SEED = 2020  # random_state for both train_test_split and KFold
TEST_SIZE = 0.2     # test_size handed to train_test_split

## without KFold

In [None]:
# Hold out TEST_SIZE of the rows for evaluation, seeded with RANDOM_SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# The scaler is fitted on the training rows only and then reused as-is
# on the held-out rows.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# fit() returns the estimator itself, so construction and fitting chain.
lr = LinearRegression().fit(X_train_std, y_train)
y_pred = lr.predict(X_test_std)

In [None]:
# RMSE is taken as the square root of the MSE rather than via squared=False:
# that keyword is deprecated and later removed in newer scikit-learn releases,
# while this form gives the same value on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

In [None]:
# One row per feature with its fitted coefficient, sorted descending by coefficient.
df_coef = (
    pd.DataFrame({"columns": X.columns, "coef": lr.coef_.tolist()})
    .sort_values(by="coef", ascending=False)
)
plt.barh(y=df_coef["columns"], width=df_coef["coef"].tolist())
plt.title("coef")
plt.show()

## with KFold

In [None]:
# Fit one linear model per KFold split and draw the coefficients of every
# fold side by side, one group of bars per feature.
folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
fold_iter = folds.split(X, y=y)
height = 0.15  # thickness of a single bar; N_FOLD bars are grouped around each tick
labels = X.columns
# One tick per feature. Tick positions are fixed up front so they do not
# depend on how far the loop below has advanced.
ticks = np.arange(len(labels), dtype=float)

fig, ax = plt.subplots()

for n_fold, (trn_idx, val_idx) in enumerate(fold_iter):
    X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_valid = y[trn_idx], y[val_idx]

    # Fit the scaler on the training part of the fold only, then apply it
    # unchanged to the validation part.
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_valid_std = sc.transform(X_valid)

    lr = LinearRegression()
    lr.fit(X_train_std, y_train)

    # Not used by the plot; left available for inspecting the last fold.
    y_pred = lr.predict(X_valid_std)

    # Shift this fold's bars so the whole group of N_FOLD bars is centred
    # on the feature's tick.
    offset = (n_fold - (N_FOLD - 1) / 2) * height
    ax.barh(ticks + offset, lr.coef_, height=height, align='center', label=f"fold-{n_fold}")

# Loop-invariant decoration is applied once, after all folds are drawn.
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_title("coef")
ax.legend()

plt.show()