In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [15]:
data = pd.read_csv('house_price_regression_dataset.csv')
data.head()

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
0,1360,2,1,1981,0.599637,0,5,262382.9
1,4272,3,3,2016,4.753014,1,6,985260.9
2,3592,1,2,2016,3.634823,0,9,777977.4
3,966,1,2,1977,2.730667,1,8,229698.9
4,4926,2,1,1993,4.699073,0,8,1041741.0


In [16]:
data.isnull().sum()

Square_Footage          0
Num_Bedrooms            0
Num_Bathrooms           0
Year_Built              0
Lot_Size                0
Garage_Size             0
Neighborhood_Quality    0
House_Price             0
dtype: int64

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Square_Footage        1000 non-null   int64  
 1   Num_Bedrooms          1000 non-null   int64  
 2   Num_Bathrooms         1000 non-null   int64  
 3   Year_Built            1000 non-null   int64  
 4   Lot_Size              1000 non-null   float64
 5   Garage_Size           1000 non-null   int64  
 6   Neighborhood_Quality  1000 non-null   int64  
 7   House_Price           1000 non-null   float64
dtypes: float64(2), int64(6)
memory usage: 62.6 KB


# From Scratch

In [18]:
train_data = data.loc[:801, :]
X_train = train_data.drop(["House_Price"], axis=1)
y_train = train_data["House_Price"]

test_data = data.loc[801:, :]
X_test = test_data.drop(["House_Price"], axis=1)
y_test = test_data["House_Price"]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = np.c_[np.ones(X_train_scaled.shape[0]), X_train_scaled]
X_test_scaled = np.c_[np.ones(X_test_scaled.shape[0]), X_test_scaled]

In [19]:
class LinearRegressions:
    def __init__(self, alpha=0.01, num_iter=1500):
        self.alpha = alpha
        self.num_iter = num_iter
        self.theta = None

    def predict(self, X):
        predictions = []
        for i in range(len(X)):
            prediction = 0
            for j in range(len(self.theta)):
                prediction += X[i][j] * self.theta[j]
            predictions.append(prediction)
        return predictions

    def cost(self, X, y):
        m = len(y)
        predictions = self.predict(X)
        cost = 0
        for i in range(m):
            cost += (predictions[i] - y[i]) ** 2
        return (1 / (2 * m)) * cost

    def gradient(self, X, y):
        m = len(y)
        predictions = self.predict(X)
        error = [predictions[i] - y[i] for i in range(m)]
        gradient = [0] * len(self.theta)

        for j in range(len(self.theta)):
            for i in range(m):
                gradient[j] += X[i][j] * error[i]
            gradient[j] /= m
        return gradient

    def gradient_descent(self, X, y):
        m = len(y)
        cost_history = []
        self.theta = [0] * len(X[0]) 

        for i in range(self.num_iter):
            grad = self.gradient(X, y)
            for j in range(len(self.theta)):
                self.theta[j] -= self.alpha * grad[j]
            cost = self.cost(X, y)
            cost_history.append(cost)

        return self.theta, cost_history

model = LinearRegressions(alpha=0.01, num_iter=1500)
theta, cost_history = model.gradient_descent(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

y_test_list = list(y_test)
mae = sum(abs(y_pred[i] - y_test_list[i]) for i in range(len(y_test_list))) / len(y_test_list)
mse = sum((y_pred[i] - y_test_list[i]) ** 2 for i in range(len(y_test_list))) / len(y_test_list)
mean_y = sum(y_test_list) / len(y_test_list)
ss_tot = sum((y - mean_y) ** 2 for y in y_test_list)
ss_res = sum((y_pred[i] - y_test_list[i]) ** 2 for i in range(len(y_test_list)))
r2_score_ = 1 - (ss_res / ss_tot)

print("Final Cost:", cost_history[-1])
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", r2_score_)

Final Cost: 48636159.445640154
Mean Absolute Error: 7557.139021052108
Mean Squared Error: 88214795.62829228
R2 Score: 0.9985622064753631


# Using sklearn

In [21]:
X_sklearn = data.drop(["House_Price"], axis=1)
y_sklearn = data["House_Price"]

X_train_sklearn, X_test_sklearn, y_train_sklearn, y_test_sklearn = train_test_split(X_sklearn, y_sklearn, test_size=0.2, random_state=42)

scaler_sklearn = StandardScaler()
X_train_scaled_sklearn = scaler_sklearn.fit_transform(X_train_sklearn)
X_test_scaled_sklearn = scaler_sklearn.transform(X_test_sklearn)

lin_reg_sklearn = LinearRegression()
lin_reg_sklearn.fit(X_train_scaled_sklearn, y_train_sklearn)
y_pred_sklearn = lin_reg_sklearn.predict(X_test_scaled_sklearn)

r2 = r2_score(y_test_sklearn, y_pred_sklearn)
mae = mean_absolute_error(y_test_sklearn, y_pred_sklearn)
mse = mean_squared_error(y_test_sklearn, y_pred_sklearn)

print("Sklearn Model - R2 Score:", r2)
print("Sklearn Model - MAE:", mae)
print("Sklearn Model - MSE:", mse)

scores_sklearn = cross_val_score(lin_reg_sklearn, X_train_scaled_sklearn, y_train_sklearn, cv=5, scoring="r2")
print("Cross-Validation R2 Scores:", scores_sklearn)
print("Average R2 Score:", scores_sklearn.mean())

Sklearn Model - R2 Score: 0.9984263636823413
Sklearn Model - MAE: 8174.583600006653
Sklearn Model - MSE: 101434798.50563647
Sklearn Model - Score: 0.9984263636823413
Cross-Validation R2 Scores: [0.99839569 0.99869845 0.99860403 0.99845366 0.99843125]
Average R2 Score: 0.9985166137865891


# Plots

In [None]:
plt.plot(cost_history)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.title('Cost History')
plt.show()

In [None]:
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', lw=2)  # Ideal line
plt.xlabel('Actual House Prices')
plt.ylabel('Predicted House Prices')
plt.title('Actual vs Predicted House Prices')
plt.show()

In [None]:
residuals = y_test - y_pred
plt.scatter(y_test, residuals, color="purple", alpha=0.6)
plt.axhline(y=0, color="red", linestyle="--")
plt.xlabel("Actual House Prices")
plt.ylabel("Residuals")
plt.title("Residuals vs. Actual House Prices")
plt.show()