Import and load the California housing dataset using scikit-learn

In [1]:
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset; later cells read its
# .data, .feature_names and .target attributes.
data = fetch_california_housing()

In [26]:
import pandas as pd

# Build the feature table and append the regression target in a single
# expression, so re-running the cell always starts from the raw arrays.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    MedHouseValue=data.target
)
print(df)

# For a lighter look at the data, df.head(), df.info() or df.describe()
# can be used instead of printing the whole frame.


       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  MedHouseValue  
0    

Preprocessing data

Splitting Data

In [16]:
from sklearn.model_selection import train_test_split

# Target is the MedHouseValue column; every other column is a predictor.
y = df['MedHouseValue']
X = df.drop(columns=['MedHouseValue'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

Standardizing features (zero mean, unit variance) with StandardScaler

In [17]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Training Models using Linear Regression, Lasso and Ridge

In [19]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# Unregularised linear baseline, trained on the scaled training features.
lr = LinearRegression()
lr.fit(X=X_train_scaled, y=y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [20]:
# Lasso regression with regularisation strength alpha=0.1, trained on the
# same scaled training features as the other models.
lasso = Lasso(alpha=0.1)
lasso.fit(X=X_train_scaled, y=y_train)


0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [22]:
# Ridge regression with regularisation strength alpha=1.0, trained on the
# same scaled training features as the other models.
ridge = Ridge(alpha=1.0)
ridge.fit(X=X_train_scaled, y=y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


Model evaluation using R² (coefficient of determination)

In [23]:
from sklearn.metrics import r2_score

# Predict on the held-out scaled features with each fitted model
# (order matches the names on the left-hand side).
pred_lr, pred_ridge, pred_lasso = (
    model.predict(X_test_scaled) for model in (lr, ridge, lasso)
)

# Report the R2 score of every model against the true test targets.
for caption, y_hat in (
    ("Linear R2:", pred_lr),
    ("Ridge R2:", pred_ridge),
    ("Lasso R2:", pred_lasso),
):
    print(caption, r2_score(y_test, y_hat))

Linear R2: 0.5940189881270831
Ridge R2: 0.5940159407928318
Lasso R2: 0.4786542778096269


Choosing the best model between Ridge and Linear Regression using MSE and MAE

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Compute both error metrics (MSE first, then MAE) for the linear and
# ridge predictions on the test set.
mse_lr, mae_lr = (
    metric(y_test, pred_lr) for metric in (mean_squared_error, mean_absolute_error)
)
mse_ridge, mae_ridge = (
    metric(y_test, pred_ridge) for metric in (mean_squared_error, mean_absolute_error)
)

print("Linear MSE:", mse_lr, "MAE:", mae_lr)
print("Ridge MSE:", mse_ridge, "MAE:", mae_ridge)

Linear MSE: 0.5485700675527098 MAE: 0.5427671504483115
Ridge MSE: 0.5485741851747343 MAE: 0.5427625495441998


Saving Best Model

In [25]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there). Without
# this the cell fails with FileNotFoundError on a fresh checkout.
os.makedirs("model", exist_ok=True)

# Persist the fitted linear model together with the scaler fitted on the
# training split: the model was trained on scaled features, so the same
# scaler must be applied to new inputs before calling predict.
joblib.dump(lr, "model/lr_model.pkl")
joblib.dump(scaler, "model/scaler.pkl")

['model/scaler.pkl']