In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
def cal_r2_score(y_true, y_pred):
    """Print a hand-computed R-squared next to scikit-learn's ``r2_score``.

    The manual value is ``1 - SS_residual / SS_total``, where ``SS_total`` is
    the squared deviation of ``y_true`` from its mean and ``SS_residual`` is
    the squared difference between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    None
        The two scores are only printed, one per line.
    """
    # Coerce to ndarrays so the arithmetic below works for any array-like
    # (plain lists included) and is always element-by-position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    y_bar = np.mean(y_true)
    ss_total = np.sum((y_true - y_bar) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    sklearn_r2 = r2_score(y_true, y_pred)

    print(f'R_square 1 - (SS_residual / SS_total) = {1 - ss_residual/ss_total}')
    print(f'R-square sklearn {sklearn_r2}')


# Load the dataset once and take both the feature matrix and the target
# from the same object instead of calling the loader twice.
boston = load_boston()
X = boston['data'].copy()
y = boston['target'].copy()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Score on the held-out rows, printing the manual and sklearn R-squared.
predictions = model.predict(X_test)
cal_r2_score(y_test, predictions)

In [24]:
from sklearn.feature_selection import mutual_info_regression

# Load the dataset once and build the frame from that single object.
boston = load_boston()
df = pd.DataFrame(boston['data'].copy(), columns=boston['feature_names'])
df['y'] = boston['target'].copy()

# RAD and CHAS are cast to integer so they can be flagged as discrete below.
df['RAD'] = df['RAD'].astype(int)
df['CHAS'] = df['CHAS'].astype(int)

X = df.drop(columns='y').copy()
y = df['y'].copy()

# Three pure-noise columns to compare against the real features.
# NOTE: the noise is unseeded, so its values differ between runs.
X['random1'] = np.random.randn(len(X))
X['random2'] = np.random.randn(len(X))
X['random3'] = np.random.randn(len(X))

# Flag integer-typed columns as discrete. Testing for "any integer dtype"
# rather than exactly np.int32 matters because astype(int) yields int64 on
# many platforms, which would leave the mask all-False.
discrete_mask = np.array(
    [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes], dtype=bool
)

mutual_info = mutual_info_regression(X, y, discrete_features=discrete_mask)
mutual_info = pd.DataFrame(mutual_info.reshape(1, -1), columns=X.columns)
print(mutual_info.iloc[:, 8:])

        RAD      TAX   PTRATIO         B     LSTAT  random1  random2  random3
0  0.205826  0.36495  0.436795  0.167905  0.667148      0.0      0.0      0.0


In [26]:
# Flag integer-typed columns as discrete with a dtype check that does not
# depend on the platform's default integer width (int32 vs int64).
discrete_mask = np.array(
    [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes], dtype=bool
)
mutual_info = mutual_info_regression(X, y, discrete_features=discrete_mask)
mutual_info = pd.Series(mutual_info, index=X.columns)

n_samples = len(X)
records = []

# Fit on the first i columns (in their existing order) for i = 1..n_features
# and record R-squared together with adjusted R-squared.
for i in range(1, len(mutual_info) + 1):
    X_new = X.iloc[:, :i].copy()
    linear_regression = LinearRegression()
    linear_regression.fit(X_new, y)

    prediction = linear_regression.predict(X_new)
    r2 = r2_score(y_true=y, y_pred=prediction)
    # Adjusted R-squared penalises the score for the i predictors in use.
    adj_r2 = 1 - ((1 - r2) * (n_samples - 1) / (n_samples - i - 1))

    records.append({'r2': r2, 'adj_r2': adj_r2})

# Build the frame once from the collected rows (DataFrame.append is gone in
# pandas 2.0); the index is the number of features used for that fit.
result_df = pd.DataFrame(records, index=range(1, len(records) + 1))

result_df.iloc[10: ]

Unnamed: 0,r2,adj_r2
11,0.670314,0.662973
12,0.684204,0.676518
13,0.740643,0.73379
14,0.740868,0.733479
15,0.743964,0.736127
16,0.743965,0.735587
