In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [4]:
# Load the raw housing data from the CSV that sits next to the notebook.
df = pd.read_csv('Housing.csv')

# Discard incomplete rows; dropna leaves the data unchanged when nothing is missing.
df = df.dropna()

# One-hot encode the categorical columns (drop_first=True omits one level per
# category so the dummy columns are not perfectly collinear).
df = pd.get_dummies(df, drop_first=True)

# Safety net: any column that is still non-numeric is coerced to numbers, and
# rows whose values could not be parsed (coerced to NaN) are removed.
non_numeric_columns = [
    col for col in df.columns
    if not pd.api.types.is_numeric_dtype(df[col])
]
for col in non_numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')
if non_numeric_columns:
    df = df.dropna(subset=non_numeric_columns)


# Separate the regression target from the feature matrix.
y = df['price']
X = df.drop(columns=['price'])

In [5]:
# Standardise every feature to zero mean and unit standard deviation so that all
# features contribute on a comparable scale to the cost function and gradient
# descent converges faster.
#
# The features are re-derived from `df` rather than overwriting `X` with a
# function of itself: once `X` has become a NumPy array with a bias column,
# re-running `X = (X - X.mean()) / X.std()` would rescale that array (bias
# column included) and append yet another column of ones.
features = df.drop(columns=['price'])
feature_mean = features.mean()
# A constant column has a standard deviation of 0; dividing by it would turn the
# whole column into NaN (0 / 0) and poison the weights. Dividing by 1 instead
# leaves such a column at all zeros.
feature_std = features.std().replace(0, 1)
# NOTE(review): these statistics are computed over all rows, i.e. before the
# train/test split performed in a later cell, so the held-out rows influence the
# scaling. For a stricter evaluation, compute them on the training rows only.
X_scaled = (features - feature_mean) / feature_std

# Prepend a column of ones so the model can learn an intercept (bias) term that
# shifts the output independently of the feature values.
X = np.c_[np.ones(X_scaled.shape[0]), X_scaled]

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Convert the targets to plain NumPy arrays for the NumPy operations that follow.
# np.asarray accepts a pandas Series as well as an existing ndarray, so this cell
# can be re-run safely; calling `.values` on the ndarray left behind by a previous
# run would raise AttributeError.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [8]:
# Start from an all-zero weight vector: one weight per column of the design
# matrix (bias column included).
n_weights = X_train.shape[1]
weights = np.zeros(n_weights)

In [14]:
# Hyperparameters for gradient descent.

# Step size taken along the negative gradient on every update.
learning_rate = 1e-2

# Number of weight updates the gradient descent loop performs.
num_iterations = 1_000

In [15]:
# Batch gradient descent on the mean-squared-error cost.
# Each pass predicts with the current weights, measures the residuals against the
# training targets, averages the gradient over all training rows and steps the
# weights against it.
n_samples = len(y_train)
for _ in range(num_iterations):
    predictions = X_train @ weights
    errors = predictions - y_train
    gradient = (X_train.T @ errors) / n_samples
    weights = weights - learning_rate * gradient

In [16]:
# Predict on the held-out rows with the learned weights.
y_pred = X_test @ weights

In [17]:
# Evaluate with the mean squared error: the average squared difference between
# the actual and the predicted values on the test set.
squared_residuals = np.square(y_test - y_pred)
mean_squared_error = squared_residuals.mean()
print(f'Mean Squared Error from scratch: {mean_squared_error}')

Mean Squared Error from scratch: 1754318687330.669


In [13]:
# Show the first few predictions next to the true prices as a quick sanity check.
n_preview = 5
print("Predicted prices:", y_pred[:n_preview])
print("Actual prices:", y_test[:n_preview])

Predicted prices: [5164653.90033967 7224722.29802166 3109863.24240338 4612075.32722559
 3294646.25725956]
Actual prices: [4060000 6650000 3710000 6440000 2800000]
