## 11.1 リッジ回帰

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data/Boston.csv')
df = df.fillna(df.loc[:, 'ZN':].mean())  # 欠損値を平均値で埋める
df = df.drop([76], axis=0)  # 76番目の外れ値を削除

t = df[['PRICE']]  # 正解データの抜き出し
x = df.loc[:, ['RM', 'LSTAT', 'PTRATIO']]  # 特徴量の抜き出し

# 標準化
sc = StandardScaler()
sc_x = sc.fit_transform(x)
sc2 = StandardScaler()
sc_t = sc2.fit_transform(t)

In [27]:
from sklearn.preprocessing import PolynomialFeatures

pf = PolynomialFeatures(degree=2, include_bias=False)
pf_x = pf.fit_transform(sc_x)  # 二乗列と交互作用特徴量の追加
pf_x.shape

(99, 9)

In [29]:
pf.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2',
       'x2^2'], dtype=object)

In [30]:
from sklearn.linear_model import LinearRegression

x_train, x_test, y_train, y_test = train_test_split(pf_x, sc_t, test_size=0.3, random_state=0)
model = LinearRegression()
model.fit(x_train, y_train)

print(model.score(x_train, y_train))  # 学習データに対する決定係数
model.score(x_test, y_test)  # テストデータに対する決定係数

0.8710525685992707


0.7854929935582586

In [32]:
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=10)
ridge_model.fit(x_train, y_train)
print(ridge_model.score(x_train, y_train))
print(ridge_model.score(x_test, y_test))


0.8607320524729507
0.8458730019328174


In [33]:
max_score = 0
max_index = 0

for i in range(1, 2001):
    num = i / 100
    ridge_model = Ridge(random_state=0, alpha=num)
    ridge_model.fit(x_train, y_train)

    result = ridge_model.score(x_test, y_test)
    if result > max_score:
        max_score = result
        max_index = num

print(max_index, max_score)


17.62 0.8528754801497631


In [34]:
print(sum(abs(model.coef_)[0]))  # 線形回帰の係数（絶対値）

print(sum(abs(ridge_model.coef_)[0]))  # リッジ回帰の合計

1.5566745983288368
1.2152800824026817
