# Tools

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

# (1) Load the house prices dataset to work on

In [None]:
# Read the California housing CSV; the relative path is resolved against the
# notebook's current working directory.
dataset_path = 'California_Houses.csv'
data = pd.read_csv(dataset_path)

# (2) Inspect the dataset (Optional, to understand the data better)

In [None]:
# Preview the first rows of the dataset.
print(data.head())
# DataFrame.info() writes its summary (dtypes, non-null counts) to stdout itself
# and returns None, so it must not be wrapped in print() -- doing so appends a
# stray "None" line to the output.
data.info()

   Median_House_Value  Median_Income  Median_Age  Tot_Rooms  Tot_Bedrooms  \
0            452600.0         8.3252          41        880           129   
1            358500.0         8.3014          21       7099          1106   
2            352100.0         7.2574          52       1467           190   
3            341300.0         5.6431          52       1274           235   
4            342200.0         3.8462          52       1627           280   

   Population  Households  Latitude  Longitude  Distance_to_coast  \
0         322         126     37.88    -122.23        9263.040773   
1        2401        1138     37.86    -122.22       10225.733072   
2         496         177     37.85    -122.24        8259.085109   
3         558         219     37.85    -122.25        7768.086571   
4         565         259     37.85    -122.25        7768.086571   

   Distance_to_LA  Distance_to_SanDiego  Distance_to_SanJose  \
0   556529.158342         735501.806984         67432.5170

# (3) Define the feature set (X) and the target variable (y)

In [None]:
# The median house value is the regression target; every other column is a feature.
target_column = 'Median_House_Value'
y = data[target_column]
X = data.drop(columns=[target_column])

# (4) Split the data into training, validation, and test sets

In [None]:
# Hold out 15% of all rows as the final test set; the remaining 85% stays
# together as the pool for the train/validation split.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [6]:
# Carve the validation set out of the 85% pool. 15% of the full dataset equals
# 0.15 / 0.85 = 3/17 of the pool, which leaves 70% of the full dataset for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=3/17,
    random_state=42,
)

In [7]:
# Remove rows with missing feature values from every split, not just the
# training set: the scikit-learn linear models used below reject NaN inputs in
# predict() as well as in fit(), so an uncleaned validation/test split would
# raise as soon as a missing value appears.
X_train = X_train.dropna()
X_val = X_val.dropna()
X_test = X_test.dropna()

# Keep each target aligned with its surviving feature rows (label-based lookup
# on the original row index).
y_train = y_train.loc[X_train.index]
y_val = y_val.loc[X_val.index]
y_test = y_test.loc[X_test.index]

In [8]:
# Report the (rows, columns) shape of each feature split, one per line.
print(
    f"Training set size: {X_train.shape}",
    f"Validation set size: {X_val.shape}",
    f"Test set size: {X_test.shape}",
    sep="\n",
)

Training set size: (14448, 13)
Validation set size: (3096, 13)
Test set size: (3096, 13)


# (5) Initialize the regression models

In [None]:
# Ordinary least squares: no regularization strength to tune, so it serves as the baseline.
linear_model = LinearRegression()
# L1-regularized regression; alpha is the regularization-strength hyperparameter.
lasso_model = Lasso(alpha=0.01)
# L2-regularized regression; alpha is the regularization-strength hyperparameter.
ridge_model = Ridge(alpha=0.01)

# (6) Train each model on the training set

In [None]:
# Fit the unregularized baseline on the training split.
linear_model.fit(X=X_train, y=y_train)

In [11]:
# Fit the Lasso model on the training split.
# NOTE(review): the output recorded for this cell looks like the tail of a
# coordinate-descent convergence warning -- consider scaling the features or
# raising max_iter; confirm on a fresh run.
lasso_model.fit(X=X_train, y=y_train)

  model = cd_fast.enet_coordinate_descent(


In [12]:
# Fit the Ridge model on the training split.
ridge_model.fit(X=X_train, y=y_train)

# (7) Make predictions on the validation set

In [None]:
# Predict on the validation features with each fitted model (linear, lasso, ridge order).
y_pred_linear_val, y_pred_lasso_val, y_pred_ridge_val = [
    estimator.predict(X_val)
    for estimator in (linear_model, lasso_model, ridge_model)
]

# (8) Calculate error metrics for each model on the validation set

In [None]:
# Validation-set errors, grouped by metric; each group is ordered linear, lasso, ridge.
val_predictions = (y_pred_linear_val, y_pred_lasso_val, y_pred_ridge_val)

# Mean squared error per model.
mse_linear_val, mse_lasso_val, mse_ridge_val = [
    mean_squared_error(y_val, predicted) for predicted in val_predictions
]
# Mean absolute error per model.
mae_linear_val, mae_lasso_val, mae_ridge_val = [
    mean_absolute_error(y_val, predicted) for predicted in val_predictions
]

In [15]:
# Assemble the validation report line by line, then emit it in one print call.
validation_report = (
    "Validation Set Performance:",
    f"Linear Regression: MSE: {mse_linear_val}, MAE: {mae_linear_val}",
    f"Lasso Regression:  MSE: {mse_lasso_val}, MAE: {mae_lasso_val}",
    f"Ridge Regression:  MSE: {mse_ridge_val}, MAE: {mae_ridge_val}",
)
print(*validation_report, sep="\n")

Validation Set Performance:
Linear Regression: MSE: 4603541834.653085, MAE: 49817.38625957957
Lasso Regression:  MSE: 4603780470.573258, MAE: 49819.689150626786
Ridge Regression:  MSE: 4603542677.845278, MAE: 49817.39502663655


# (9) Make predictions on the test set

In [None]:
# Predict on the held-out test features with each fitted model (linear, lasso, ridge order).
fitted_models = (linear_model, lasso_model, ridge_model)
y_pred_linear_test, y_pred_lasso_test, y_pred_ridge_test = [
    estimator.predict(X_test) for estimator in fitted_models
]

# (10) Calculate error metrics for each model on the test set

In [None]:
# Test-set errors, one (MSE, MAE) pair per model.
mse_linear_test, mae_linear_test = (
    mean_squared_error(y_test, y_pred_linear_test),
    mean_absolute_error(y_test, y_pred_linear_test),
)
mse_lasso_test, mae_lasso_test = (
    mean_squared_error(y_test, y_pred_lasso_test),
    mean_absolute_error(y_test, y_pred_lasso_test),
)
mse_ridge_test, mae_ridge_test = (
    mean_squared_error(y_test, y_pred_ridge_test),
    mean_absolute_error(y_test, y_pred_ridge_test),
)

In [18]:
# Assemble the test-set report (leading blank line included), then print each line.
test_report = [
    "\nTest Set Performance:",
    f"Linear Regression -MSE: {mse_linear_test},  MAE: {mae_linear_test}",
    f"Lasso Regression - MSE: {mse_lasso_test},  MAE: {mae_lasso_test}",
    f"Ridge Regression - MSE: {mse_ridge_test},  MAE: {mae_ridge_test}",
]
print(*test_report, sep="\n")


Test Set Performance:
Linear Regression -MSE: 4857004593.678172,  MAE: 50743.9553421061
Lasso Regression - MSE: 4857157443.592012,  MAE: 50745.05974227646
Ridge Regression - MSE: 4857004802.419829,  MAE: 50743.962126493956


# (11) Analyze and Compare Results

In [None]:
# Side-by-side validation vs. test MSE for each model, printed one line per model.
comparison_report = (
    "\nModel Comparison:",
    f"Linear Regression -Validation MSE: {mse_linear_val},  Test MSE: {mse_linear_test}",
    f"Lasso Regression - Validation MSE: {mse_lasso_val},  Test MSE: {mse_lasso_test}",
    f"Ridge Regression - Validation MSE: {mse_ridge_val},  Test MSE: {mse_ridge_test}",
)
print(*comparison_report, sep="\n")


Model Comparison:
Linear Regression -Validation MSE: 4603541834.653085,  Test MSE: 4857004593.678172
Lasso Regression - Validation MSE: 4603780470.573258,  Test MSE: 4857157443.592012
Ridge Regression - Validation MSE: 4603542677.845278,  Test MSE: 4857004802.419829
