In [5]:
import numpy as np
import pandas as pd
from google.colab import files
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
import copy, math
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler

**Linear Regression** - housing dataset

In [19]:
df = pd.read_csv('housing.csv')
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [20]:
df = df.dropna()
df = df.drop(columns=['ocean_proximity'])
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [32]:
X = df.iloc[:,:-1].values
print(f'X shape: {X.shape}, type: {type(X)} \n')
X

X shape: (20433, 8), type: <class 'numpy.ndarray'> 



array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]])

In [33]:
y = df['median_house_value'].values
print(f'y shape: {y.shape}, type: {type(y)} \n')
y

y shape: (20433,), type: <class 'numpy.ndarray'> 



array([452600., 358500., 352100., ...,  92300.,  84700.,  89400.])

In [34]:
scaler_housing = StandardScaler()

In [36]:
scaler_housing.fit(X)
X_scaled = scaler_housing.transform(X)

print(f'y shape: {X_scaled.shape}, type: {type(X_scaled)} \n')
X_scaled

y shape: (20433, 8), type: <class 'numpy.ndarray'> 



array([[-1.32731375,  1.05171726,  0.98216331, ..., -0.97331952,
        -0.97683327,  2.34516291],
       [-1.32232256,  1.04235526, -0.60621017, ...,  0.86133924,
         1.67037262,  2.33263161],
       [-1.33230494,  1.03767426,  1.85576873, ..., -0.81976943,
        -0.84342665,  1.78293943],
       ...,
       [-0.82320322,  1.77727236, -0.92388486, ..., -0.36882637,
        -0.17377773, -1.14317103],
       [-0.87311515,  1.77727236, -0.84446619, ..., -0.60356386,
        -0.39350628, -1.05513604],
       [-0.83318561,  1.74918635, -1.00330353, ..., -0.03348711,
         0.07995643, -0.78060586]])

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)

In [38]:
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

weights = linear_reg_model.coef_
bias = linear_reg_model.intercept_
print(f'best weights: {weights} \nbest bias: {bias}')

best weights: [-84482.72708518 -90031.32308528  15045.48100293 -18882.37580669
  50028.17358745 -40418.24722368  14324.62358638  76268.79687262] 
best bias: 207247.8682009606


In [40]:
y_pred = linear_reg_model.predict(X_test)

for i in range(100):
  print(f'Label: {y_test[i]:<10} | Prediction: {y_pred[i]:<10.4f}')

Label: 245800.0   | Prediction: 197402.5866
Label: 137900.0   | Prediction: 157886.4812
Label: 218200.0   | Prediction: 203530.7206
Label: 220800.0   | Prediction: 173710.9116
Label: 170500.0   | Prediction: 212870.3842
Label: 75000.0    | Prediction: 108986.3297
Label: 269400.0   | Prediction: 284528.1070
Label: 228900.0   | Prediction: 250017.3984
Label: 500001.0   | Prediction: 390640.7140
Label: 80800.0    | Prediction: 49676.1297
Label: 170000.0   | Prediction: -21720.0045
Label: 87800.0    | Prediction: 126569.8294
Label: 242200.0   | Prediction: 245885.4028
Label: 165500.0   | Prediction: 194263.3477
Label: 164000.0   | Prediction: 62734.4242
Label: 238000.0   | Prediction: 179195.4794
Label: 283300.0   | Prediction: 217474.8284
Label: 249400.0   | Prediction: 283949.3831
Label: 143600.0   | Prediction: 219600.5682
Label: 333100.0   | Prediction: 277013.9606
Label: 164900.0   | Prediction: 231660.7655
Label: 216100.0   | Prediction: 162372.1187
Label: 188700.0   | Prediction: 21

In [41]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=True)

# R-squared (R²)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²):", r2)

Mean Absolute Error (MAE): 50476.398238187496
Mean Squared Error (MSE): 4675459167.725828
Root Mean Squared Error (RMSE): 4675459167.725828
R-squared (R²): 0.6473837161332721


**Compare to Normal Equation** <br>

Mean Absolute Error (MAE): 50476.39823807128 <br>
Mean Squared Error (MSE): 4675459167.725623 <br>
Root Mean Squared Error (RMSE): 4675459167.725623 <br>
R-squared (R²): 0.6473837161332876