In [1]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
state = 4

# Scikit-Learn's Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.metrics import mean_squared_error

In [4]:
housing = pl.read_csv("Housing.csv")
housing.head()

price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
i64,i64,i64,i64,i64,str,str,str,str,str,i64,str,str
13300000,7420,4,2,3,"""yes""","""no""","""no""","""no""","""yes""",2,"""yes""","""furnished"""
12250000,8960,4,4,4,"""yes""","""no""","""no""","""no""","""yes""",3,"""no""","""furnished"""
12250000,9960,3,2,2,"""yes""","""no""","""yes""","""no""","""no""",2,"""yes""","""semi-furnished"""
12215000,7500,4,2,2,"""yes""","""no""","""yes""","""no""","""yes""",3,"""yes""","""furnished"""
11410000,7420,4,1,2,"""yes""","""yes""","""yes""","""no""","""yes""",2,"""no""","""furnished"""


In [5]:
categorical_cols = housing.select(pl.col(pl.String)).columns

encoder = OneHotEncoder()
encoded_array = encoder.fit_transform(housing[categorical_cols]).toarray().astype('int64')
encoder_features = encoder.get_feature_names_out().tolist()

housing_cat = pl.DataFrame(
    encoded_array,
    schema=encoder_features
).with_row_index(name="index")

housing_cat.head()

index,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
u32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0
1,0,1,1,0,1,0,1,0,0,1,1,0,1,0,0
2,0,1,1,0,0,1,1,0,1,0,0,1,0,1,0
3,0,1,1,0,0,1,1,0,0,1,0,1,1,0,0
4,0,1,0,1,0,1,1,0,0,1,1,0,1,0,0


In [6]:
housing_int = housing.clone().select(pl.col(pl.Int64)).with_row_index(name="index")
housing_int.head()

index,price,area,bedrooms,bathrooms,stories,parking
u32,i64,i64,i64,i64,i64,i64
0,13300000,7420,4,2,3,2
1,12250000,8960,4,4,4,3
2,12250000,9960,3,2,2,2
3,12215000,7500,4,2,2,3
4,11410000,7420,4,1,2,2


In [7]:
housing_df = housing_int.join(
    other=housing_cat,
    on="index", 
    how="left" 
)

In [8]:
# Splitting data
x = housing_df[:, 2:].to_numpy()
y = housing_df[:, 0].to_numpy().reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=state)

In [9]:
# Standardisation
x_scaler = StandardScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [10]:
# Linear regression model
lin_reg = LinearRegression()
lin_reg.fit(x_train_scaled, y_train_scaled)

# Cross validation
cv = KFold(n_splits=5, shuffle=True, random_state=state)

# Applying the linear regression model on each fold
cv_results = cross_validate(
    lin_reg, 
    x_train_scaled, y_train_scaled,
    scoring="neg_mean_squared_error", 
    cv=cv,
    return_estimator=True
)

# Identify which fold produced the best score
best_fold_idx = cv_results['test_score'].round(3).argmax()

# Get the fitted model corresponding to the best fold
best_estimator = cv_results['estimator'][best_fold_idx]

# Extract the results
coefficients_scaled = best_estimator.coef_[0]
intercept_scaled = best_estimator.intercept_[0]
mse = -(cv_results['test_score'][best_fold_idx])

# Intercept
intercept_reshaped = intercept_scaled.reshape(1, -1)
intercept = y_scaler.inverse_transform(intercept_reshaped)[0, 0].round(2)

# Coefficients
coefficients_reshaped = coefficients_scaled.reshape(1, -1)
coefficients = x_scaler.inverse_transform(coefficients_reshaped)[0].round(3)

## Final Results
np.set_printoptions(suppress=True, precision=3)

print("### Best Fold Results ###")
print(f"Score (MSE): {mse.round(3)}")
print(f"Intercept: {intercept.round(3):,}")
print(f"Coefficients: {coefficients.round(2)}")

### Best Fold Results ###
Score (MSE): 0.254
Intercept: 269.75
Coefficients: [4554.13    2.92    1.2     1.58    0.55    0.15    0.85    0.84    0.16
    0.7     0.3     0.96    0.04    0.72    0.28    0.79    0.21    0.24
    0.36    0.39]


In [11]:
y_pred_test = best_estimator.predict(x_test_scaled)

final_mse = mean_squared_error(y_test_scaled, y_pred_test)
final_r2_score = best_estimator.score(x_test_scaled, y_test_scaled)

print(f"Final test MSE: {final_mse:.4f}")
print(f"Final test R-squared score: {final_r2_score:.4f}")

Final test MSE: 0.3492
Final test R-squared score: 0.6392
