# Regression Test
## Purpose: To analyze which regression model works the best on the given data
### Dataset: fetch_california_housing from sklearn library
### Predictor Variables: MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude
### Predicted Variable: MedHouseVal
### Algorithms: Linear Regression, Ridge Regression, Decision Tree Regression, Random Forest Regression

Import needed libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
california_data = fetch_california_housing()
cal_data = california_data["data"]
cal_target = california_data["target"]
cal_target_names = california_data["target_names"][0]
cal_descr = california_data["DESCR"]
cal_feature_names = california_data["feature_names"]

california_df = pd.DataFrame(cal_data)
california_df.columns = cal_feature_names
california_df[f"{cal_target_names}"] = cal_target
california_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


Check for null values

In [3]:
california_df.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

Seperate X variables and y variable

In [4]:
X = california_df.drop(columns="MedHouseVal")
y = california_df["MedHouseVal"]

Split training and testing data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

Create functions to create model and get model metrics

In [6]:
def create_model(X_train_data, y_train_data, algorithm):
    model = algorithm
    model.fit(X_train_data, y_train_data)
    return model

def model_metrics(X_test_data, y_test_data, m):
    # RMSE, MAE, R2
    preds = m.predict(X_test_data)
    rmse = np.sqrt(mean_squared_error(preds, y_test_data))
    mae = mean_absolute_error(preds, y_test_data)
    r2 = r2_score(preds, y_test_data)
    metrics = [rmse, mae, r2]
    return metrics


Perform all regressors

In [7]:
# Linear Regression
lr = create_model(X_train, y_train, LinearRegression())
lr_metrics = model_metrics(X_test, y_test, lr)
print(f"Linear Regression\nRoot Mean Square Error:\t {lr_metrics[0]}")
print(f"Mean Absolute Error:\t {lr_metrics[1]}")
print(f"R-Squared:\t\t {lr_metrics[2]}")

Linear Regression
Root Mean Square Error:	 0.7255988445991857
Mean Absolute Error:	 0.531738848938267
R-Squared:		 0.3362717883615146


In [8]:
# Ridge Regression
ridge = create_model(X_train, y_train, Ridge())
ridge_metrics = model_metrics(X_test, y_test, ridge)
print(f"Ridge Regression\nRoot Mean Square Error:\t {ridge_metrics[0]}")
print(f"Mean Absolute Error:\t {ridge_metrics[1]}")
print(f"R-Squared:\t\t {ridge_metrics[2]}")

Ridge Regression
Root Mean Square Error:	 0.7256052278113256
Mean Absolute Error:	 0.5317498150502642
R-Squared:		 0.33617827770806397


In [9]:
# Decision Tree Regression
tree = create_model(X_train, y_train, DecisionTreeRegressor())
tree_metrics = model_metrics(X_test, y_test, tree)
print(f"Decision Tree Regression\nRoot Mean Square Error:\t {tree_metrics[0]}")
print(f"Mean Absolute Error:\t {tree_metrics[1]}")
print(f"R-Squared:\t\t {tree_metrics[2]}")

Decision Tree Regression
Root Mean Square Error:	 0.7202293530589429
Mean Absolute Error:	 0.45880207848837207
R-Squared:		 0.617537928049984


In [10]:
# Random Forest Regression
rf = create_model(X_train, y_train, RandomForestRegressor())
rf_metrics = model_metrics(X_test, y_test, rf)
print(f"Random Forest Regression\nRoot Mean Square Error:\t {rf_metrics[0]}")
print(f"Mean Absolute Error:\t {rf_metrics[1]}")
print(f"R-Squared:\t\t {rf_metrics[2]}")

Random Forest Regression
Root Mean Square Error:	 0.5067352528235012
Mean Absolute Error:	 0.33161477451550403
R-Squared:		 0.7538293767207424


Create dataframe to store algorithm metrics

In [11]:
df = pd.DataFrame({"Algorithm":["Linear Regression", "Ridge Regression", "Decision Tree Regression",
                           "Random Forest Regression"],
             "RMSE":[lr_metrics[0], ridge_metrics[0], tree_metrics[0], rf_metrics[0]],
                  "Mean Absolute Error":[lr_metrics[1], ridge_metrics[1], tree_metrics[1], rf_metrics[1]],
                  "R-Squared":[lr_metrics[2], ridge_metrics[2], tree_metrics[2], rf_metrics[2]]})


#### Sort by RMSE

In [12]:
df.sort_values(by="RMSE").reset_index(drop=True)

Unnamed: 0,Algorithm,RMSE,Mean Absolute Error,R-Squared
0,Random Forest Regression,0.506735,0.331615,0.753829
1,Decision Tree Regression,0.720229,0.458802,0.617538
2,Linear Regression,0.725599,0.531739,0.336272
3,Ridge Regression,0.725605,0.53175,0.336178


## Conclusion

The regression model that predicted best on the California Housing dataset was the Random Forest Regression model.