In [2]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-Learn's Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, KFold

In [4]:
housing = pl.read_csv("Housing.csv")
housing_int = housing.clone().select(pl.col(pl.Int64))
housing_int.head()

price,area,bedrooms,bathrooms,stories,parking
i64,i64,i64,i64,i64,i64
13300000,7420,4,2,3,2
12250000,8960,4,4,4,3
12250000,9960,3,2,2,2
12215000,7500,4,2,2,3
11410000,7420,4,1,2,2


In [5]:
# Splitting data
x = housing_int[:, 1:].to_numpy()
y = housing_int[:, 0].to_numpy().reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

In [6]:
# Standardisation
x_scaler = StandardScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [7]:
# Linear regression model
lin_reg = LinearRegression()
lin_reg.fit(x_train_scaled, y_train_scaled)
lin_reg_coefs = lin_reg.coef_[0].round(3)
lin_reg_intercept = lin_reg.intercept_

In [22]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_validate(
    lin_reg, 
    x_train_scaled, y_train_scaled,
    scoring="neg_mean_squared_error", 
    cv=cv,
    return_estimator=True
)

# Identify which fold produced the best score
best_fold_idx = cv_results['test_score'].round(3).argmax()

# Get the fitted model corresponding to the best fold
best_estimator = cv_results['estimator'][best_fold_idx]

# Extract the results
coefficients = best_estimator.coef_[0]
intercept = best_estimator.intercept_[0]
neg_mse = cv_results['test_score'][best_fold_idx]

# Intercept
intercept_reshaped = lin_reg_intercept.reshape(1, -1)
intercept_inverse = y_scaler.inverse_transform(intercept_reshaped)[0, 0].round(2)

# Coefficients
coefficients_reshaped = lin_reg_coefs.reshape(1, -1)
coefficients_inverse = x_scaler.inverse_transform(coefficients_reshaped)[0].round(3)

## Final Results
print("### Best Fold Results ###")
print(f"Score (Negative MSE): {neg_mse.round(3)}")
print(f"Intercept: {intercept_inverse.round(3):,}")
print(f"Coefficients: {coefficients_inverse.round(3)}")

### Best Fold Results ###
Score (Negative MSE): -0.356
Intercept: 4,783,460.64
Coefficients: [6.04245e+03 3.01600e+00 1.47800e+00 2.02400e+00 8.45000e-01]
