In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Fetching the DataSet
# california_housing = fetch_california_housing(data_home = 'DataSet', download_if_missing = True, as_frame = True)
df = pd.read_csv('DataSet/housing.csv')

In [3]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [5]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [6]:
# Converting object datatype to int | <1H OCEAN - 0 | INLAND - 1 | NEAR OCEAN - 4 | NEAR BAY - 3 | ISLAND - 2
df['ocean_proximity'] = df['ocean_proximity'].astype('category')
df['ocean_proximity'] = df['ocean_proximity'].cat.codes

In [7]:
# Filling NaN in 'total_bedrooms' with mean
df['total_bedrooms'] = df['total_bedrooms'].fillna(np.mean(df['total_bedrooms']))

In [8]:
# Z-Score Normalization | Feature Scaling
for index in range(df.shape[1]) :
    df.iloc[:, index] = (df.iloc[:, index] - np.mean(df.iloc[:, index]))/np.std(df.iloc[:, index])

In [9]:
# Splitting Test and Train DataSet
y = df['median_house_value']
X = df.drop('median_house_value', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [10]:
# Initialize the Weights to Zero
W = np.zeros(X_train.shape[1])
a = 0
# Defining GD
for i in range(X_train.shape[1]) :
    for j in range(X_train.shape[0]) :
        W[i] = W[i] - (a/X_train.shape[0])(np.sum(np.dot(X[j][:], W.T) - y_train[j]))

KeyError: 0

In [11]:
class LinearRegression :
    def __init__(self, lr, itr) :
        self.lr  = lr
        self.itr = itr
        self.W   = 0     # New Variables W and B are initialized which are Weights and Bias respectively
        self.B   = 0

    def model(self, X, y): # Applied GD on MSE
        self.W = np.zeros(X.shape[1])
        self.B = 0

        for index in range(self.itr) :
            H = np.dot(X, self.W) + self.B  # Hypothesis

            dW = (1/X.shape[0])*(np.dot(X.T, (H - y)))
            dB = (1/X.shape[0])*(np.sum(H - y))

            self.W = self.W - self.lr*dW
            self.B = self.B - self.lr*dB

    def predict(self, X):
        y = np.dot(X, self.W) + self.B
        return np.array(y)

In [12]:
linreg = LinearRegression(lr = 0.001, itr = 10000)
linreg.model(X_train, y_train)
y_pred = linreg.predict(X_test)

In [13]:
print(mean_squared_error(y_test, y_pred))

0.3947437413869772
