In [7]:
#importing libraries to handle csv data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#loading the Amsterdam housing CSV into a dataframe and previewing its first rows

DATA_PATH = "../input/amsterdam-house-price-prediction/HousingPrices-Amsterdam-August-2021.csv"
dataset = pd.read_csv(DATA_PATH)
dataset.head()

In [3]:
#keeping only the Price, Area and Room columns and filling NA values with the column mean

dataset = dataset.loc[:, ['Price', 'Area', 'Room']]
mean_price = dataset['Price'].mean()
mean_area = dataset['Area'].mean()
mean_room = dataset['Room'].mean()

# Assign the filled frame back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form emits warnings in recent pandas
# and leaves `dataset` untouched when Copy-on-Write is active, so the NaNs
# would silently survive into training.
dataset = dataset.fillna({'Price': mean_price, 'Area': mean_area, 'Room': mean_room})

dataset.head()

In [5]:
#converting dataframe to np arrays and shuffling the rows reproducibly
X = dataset.loc[:, ['Area', 'Room']].to_numpy()
Y = dataset.loc[:, ['Price']].to_numpy()   # list selector keeps Y two-dimensional: (n, 1)

# A single seeded permutation is applied to both arrays so that feature rows
# stay aligned with their prices and the cross-validation folds built from
# this order are identical on every Restart & Run All.
rng = np.random.default_rng(42)
shuffled_idx = rng.permutation(len(X))
X = X[shuffled_idx]
Y = Y[shuffled_idx]

print(X.shape)
print(Y.shape)

In [16]:
#scatter plot of area (first column of X) against price
fig, ax = plt.subplots()
ax.scatter(X[:, 0], Y)
ax.set_xlabel("area")
ax.set_ylabel("price")
plt.show()

In [17]:
#scatter plot of number of rooms (second column of X) against price
fig, ax = plt.subplots()
ax.scatter(X[:, 1], Y)
ax.set_xlabel("number of rooms")
ax.set_ylabel("price")
plt.show()

In [49]:
#scaling features and targets: each array is divided by one scalar, its own norm
#(np.linalg.norm with default arguments on a 2-D array is the Frobenius norm)
X = np.divide(X, np.linalg.norm(X))
Y = np.divide(Y, np.linalg.norm(Y))

In [69]:
#function for calculating mean squared error

def MSE(y, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        True target values, 1-D or a column vector of shape (n, 1).
    y_pred : array-like
        Predicted values holding the same number of elements as ``y``.

    Returns
    -------
    float
        ``sum((y - y_pred)**2) / n``, or ``0.0`` when the inputs are empty.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not hold the same number of elements.
    """
    # Flatten both inputs so a column vector produces a plain scalar rather
    # than a 1-element array, and so (n, 1) vs (n,) cannot broadcast to (n, n).
    y = np.asarray(y, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y.size != y_pred.size:
        raise ValueError("y and y_pred must contain the same number of elements")
    if y.size == 0:
        return 0.0
    return float(np.mean((y - y_pred) ** 2))

#function for evaluating mean percentage error

def MPE(y, y_pred):
    """Return the mean absolute relative error ``|y - y_pred| / |y|`` as a fraction.

    Samples whose target is exactly zero contribute no error (the ratio is
    undefined there) but are still counted in the denominator ``n``.

    Parameters
    ----------
    y : array-like
        True target values, 1-D or a column vector of shape (n, 1).
    y_pred : array-like
        Predicted values holding the same number of elements as ``y``.

    Returns
    -------
    float
        Sum of the relative errors over non-zero targets divided by ``n``,
        or ``0.0`` when the inputs are empty.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not hold the same number of elements.
    """
    # Flattening makes the result a plain scalar even for column vectors.
    y = np.asarray(y, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y.size != y_pred.size:
        raise ValueError("y and y_pred must contain the same number of elements")
    if y.size == 0:
        return 0.0

    nonzero = y != 0
    # Divide by |y| so that a negative target cannot turn its own error
    # negative and cancel the errors of other samples.
    relative_errors = np.abs(y[nonzero] - y_pred[nonzero]) / np.abs(y[nonzero])
    return float(relative_errors.sum() / y.size)

#function to calculate R squared metric

def RSquared(y, y_pred):
    """Return the coefficient of determination of ``y_pred`` against ``y``.

    Computed as one minus the ratio of the residual sum of squares to the
    total sum of squares of ``y`` around its mean.
    """
    residuals = y - y_pred
    deviations = y - np.mean(y)
    return 1 - (np.sum(residuals**2)/np.sum(deviations**2))

#function to calculate adjusted R squared metric

def RSquaredAdjusted(y, y_pred, m):
    """Return R squared adjusted for the number of features ``m``.

    The unexplained share ``1 - R2`` is scaled by ``(n - 1) / (n - m - 1)``,
    where ``n`` is ``y.shape[0]``, and subtracted from one.
    """
    n_samples = y.shape[0]
    unexplained = 1 - RSquared(y, y_pred)
    return 1 - (unexplained*(n_samples - 1)/(n_samples - m - 1))

def test(x_test, y_test, weights, bias):
    """Evaluate a linear model on a data set and return its metrics as floats.

    Parameters
    ----------
    x_test : np.ndarray
        Feature matrix; its second dimension is used as the feature count
        for the adjusted R squared.
    y_test : np.ndarray
        Target values matching the rows of ``x_test``.
    weights : np.ndarray
        Model weights, multiplied with ``x_test`` via ``np.dot``.
    bias : float
        Intercept added to every prediction.

    Returns
    -------
    list of float
        ``[mse, mpe, r2, r2_adjusted]``.
    """
    y_pred = np.dot(x_test, weights) + bias
    metrics = (
        MSE(y_test, y_pred),
        MPE(y_test, y_pred),
        RSquared(y_test, y_pred),
        RSquaredAdjusted(y_test, y_pred, x_test.shape[1]),
    )
    # Some metrics may come back as 1-element arrays and others as scalars;
    # squeezing each one to a float keeps the list homogeneous, which callers
    # need in order to stack it with np.array(...) (ragged input is rejected
    # by recent NumPy releases).
    return [float(np.squeeze(value)) for value in metrics]


In [40]:
def fit(x_train, y_train, learning_rate = 0.0001, num_iter = 10000):
    """Fit a linear model by batch gradient descent on the mean squared error.

    Parameters
    ----------
    x_train : array-like, shape (n_samples, n_features)
        Training feature matrix.
    y_train : array-like, shape (n_samples,) or (n_samples, 1)
        Training targets.
    learning_rate : float
        Step size applied to both the weight and the bias gradient.
    num_iter : int
        Number of full-batch gradient steps.

    Returns
    -------
    weights : np.ndarray, shape (n_features, 1)
    bias : float

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` have a different number of samples.
    """
    x_train = np.asarray(x_train, dtype=float)
    # Force a column vector: a 1-D target would otherwise broadcast against
    # the (n, 1) predictions into an (n, n) residual matrix.
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    n_samples, n_features = x_train.shape
    if y_train.shape[0] != n_samples:
        raise ValueError("x_train and y_train must contain the same number of samples")

    weights = np.zeros((n_features, 1))
    bias = 0.0

    for _ in range(num_iter):
        # residuals of the current model on the whole training set
        residuals = np.dot(x_train, weights) + bias - y_train

        # gradient of the mean squared error w.r.t. weights and bias
        del_weights = (2/n_samples)*np.dot(x_train.T, residuals)
        del_bias = (2/n_samples)*np.sum(residuals)

        weights -= learning_rate * del_weights
        bias -= learning_rate * del_bias

    return weights, bias


In [41]:
def fit_ridge(x_train, y_train, learning_rate = 0.0001, num_iter = 10000, Lambda = 10):
    """Fit a linear model by batch gradient descent with a ridge (L2) penalty.

    Each step uses the mean-squared-error gradient plus
    ``(Lambda / n_samples) * weights``; the bias is not penalised.

    Parameters
    ----------
    x_train : array-like, shape (n_samples, n_features)
        Training feature matrix.
    y_train : array-like, shape (n_samples,) or (n_samples, 1)
        Training targets.
    learning_rate : float
        Step size applied to both the weight and the bias gradient.
    num_iter : int
        Number of full-batch gradient steps.
    Lambda : float
        Regularisation strength.

    Returns
    -------
    weights : np.ndarray, shape (n_features, 1)
    bias : float

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` have a different number of samples.
    """
    x_train = np.asarray(x_train, dtype=float)
    # Force a column vector: a 1-D target would otherwise broadcast against
    # the (n, 1) predictions into an (n, n) residual matrix.
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    n_samples, n_features = x_train.shape
    if y_train.shape[0] != n_samples:
        raise ValueError("x_train and y_train must contain the same number of samples")

    weights = np.zeros((n_features, 1))
    bias = 0.0

    for _ in range(num_iter):
        # residuals of the current model on the whole training set
        residuals = np.dot(x_train, weights) + bias - y_train

        # data-fit gradient plus the L2 shrinkage term on the weights
        del_weights = (2/n_samples)*np.dot(x_train.T, residuals) + (Lambda/n_samples)*weights
        del_bias = (2/n_samples)*np.sum(residuals)

        weights -= learning_rate * del_weights
        bias -= learning_rate * del_bias

    return weights, bias


In [42]:
def fit_lasso(x_train, y_train, learning_rate = 0.0001, num_iter = 10000, Lambda = 10):
    """Fit a linear model by batch (sub)gradient descent with a lasso (L1) penalty.

    Each step uses the mean-squared-error gradient plus
    ``(Lambda / n_samples) * sign(weights)``, applied element-wise so every
    weight is pushed towards zero according to its own sign. A weight that is
    exactly zero receives no penalty term (``sign(0) == 0``). The bias is not
    penalised.

    Parameters
    ----------
    x_train : array-like, shape (n_samples, n_features)
        Training feature matrix.
    y_train : array-like, shape (n_samples,) or (n_samples, 1)
        Training targets.
    learning_rate : float
        Step size applied to both the weight and the bias gradient.
    num_iter : int
        Number of full-batch gradient steps.
    Lambda : float
        Regularisation strength.

    Returns
    -------
    weights : np.ndarray, shape (n_features, 1)
    bias : float

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` have a different number of samples.
    """
    x_train = np.asarray(x_train, dtype=float)
    # Force a column vector: a 1-D target would otherwise broadcast against
    # the (n, 1) predictions into an (n, n) residual matrix.
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    n_samples, n_features = x_train.shape
    if y_train.shape[0] != n_samples:
        raise ValueError("x_train and y_train must contain the same number of samples")

    weights = np.zeros((n_features, 1))
    bias = 0.0

    for _ in range(num_iter):
        # residuals of the current model on the whole training set
        residuals = np.dot(x_train, weights) + bias - y_train

        # gradient of the mean squared error w.r.t. weights and bias
        del_weights = (2/n_samples)*np.dot(x_train.T, residuals)
        del_bias = (2/n_samples)*np.sum(residuals)

        # L1 penalty: the sign term must be added per component. Adding each
        # weight's sign to the whole gradient vector would give every weight
        # the sum of all signs, so mixed-sign weights would cancel the penalty.
        del_weights += (Lambda/n_samples)*np.sign(weights)

        weights -= learning_rate * del_weights
        bias -= learning_rate * del_bias

    return weights, bias


In [92]:
def cross_validation_LR(k = 10, learning_rate = 0.1, num_iter = 100000):
    """Run k-fold cross-validation of the unregularised model on the global X, Y.

    The first ``k * (len(X) // k)`` rows are cut into ``k`` contiguous folds.
    Each fold is held out once while ``fit`` trains on all remaining rows;
    rows past the last fold are only ever used for training.

    Parameters
    ----------
    k : int
        Number of folds.
    learning_rate : float
        Step size forwarded to ``fit``.
    num_iter : int
        Number of gradient steps forwarded to ``fit``.

    Returns
    -------
    tuple of np.ndarray
        ``(train_metrics, test_metrics)``: the fold averages of the four
        values returned by ``test`` (MSE, MPE, R2, adjusted R2).
    """
    foldsize = len(X) // k

    train_metrics = np.zeros(4)
    test_metrics = np.zeros(4)

    for i in range(k):
        start, stop = i*foldsize, (i+1)*foldsize
        xtest, ytest = X[start:stop], Y[start:stop]
        xtrain = np.concatenate((X[:start], X[stop:]))
        ytrain = np.concatenate((Y[:start], Y[stop:]))

        w, b = fit(xtrain, ytrain, learning_rate, num_iter)
        train_metrics = train_metrics + np.array(test(xtrain, ytrain, w, b))

        # Accumulate into test_metrics itself; adding to train_metrics here
        # would discard earlier folds and mix train scores into the test result.
        test_metrics = test_metrics + np.array(test(xtest, ytest, w, b))

    return train_metrics/k, test_metrics/k
        
# Run the cross-validation for the unregularized model and print the averaged
# train and test metrics (order: MSE, MPE, R2, adjusted R2).
train_met ,test_met = cross_validation_LR()
print("Train data metrics for Linear Regression without regularization")
print(f"MSE : {train_met[0]} , MPE : {train_met[1]} , R2 : {train_met[2]} , R2 adjusted : {train_met[3]}")
print("Test data metrics for Linear Regression without regularization")
print(f"MSE : {test_met[0]} , MPE : {test_met[1]} , R2 : {test_met[2]} , R2 adjusted : {test_met[3]}")

In [97]:
def cross_validation_ridge(k = 10, learning_rate = 0.1, num_iter = 100000, Lambda = 0.1):
    """Run k-fold cross-validation of the ridge model on the global X, Y.

    The first ``k * (len(X) // k)`` rows are cut into ``k`` contiguous folds.
    Each fold is held out once while ``fit_ridge`` trains on all remaining
    rows; rows past the last fold are only ever used for training.

    Parameters
    ----------
    k : int
        Number of folds.
    learning_rate : float
        Step size forwarded to ``fit_ridge``.
    num_iter : int
        Number of gradient steps forwarded to ``fit_ridge``.
    Lambda : float
        Regularisation strength forwarded to ``fit_ridge``.

    Returns
    -------
    tuple of np.ndarray
        ``(train_metrics, test_metrics)``: the fold averages of the four
        values returned by ``test`` (MSE, MPE, R2, adjusted R2).
    """
    foldsize = len(X) // k

    train_metrics = np.zeros(4)
    test_metrics = np.zeros(4)

    for i in range(k):
        start, stop = i*foldsize, (i+1)*foldsize
        xtest, ytest = X[start:stop], Y[start:stop]
        xtrain = np.concatenate((X[:start], X[stop:]))
        ytrain = np.concatenate((Y[:start], Y[stop:]))

        w, b = fit_ridge(xtrain, ytrain, learning_rate, num_iter, Lambda)
        train_metrics = train_metrics + np.array(test(xtrain, ytrain, w, b))

        # Accumulate into test_metrics itself; adding to train_metrics here
        # would discard earlier folds and mix train scores into the test result.
        test_metrics = test_metrics + np.array(test(xtest, ytest, w, b))

    return train_metrics/k, test_metrics/k
        
# Run the cross-validation for the ridge model and print the averaged
# train and test metrics (order: MSE, MPE, R2, adjusted R2).
train_met ,test_met = cross_validation_ridge()
print("Train data metrics for Linear Regression with ridge regularization")
print(f"MSE : {train_met[0]} , MPE : {train_met[1]} , R2 : {train_met[2]} , R2 adjusted : {train_met[3]}")
print("Test data metrics for Linear Regression with ridge regularization")
print(f"MSE : {test_met[0]} , MPE : {test_met[1]} , R2 : {test_met[2]} , R2 adjusted : {test_met[3]}")

In [98]:
def cross_validation_lasso(k = 10, learning_rate = 0.1, num_iter = 100000, Lambda = 0.1):
    """Run k-fold cross-validation of the lasso model on the global X, Y.

    The first ``k * (len(X) // k)`` rows are cut into ``k`` contiguous folds.
    Each fold is held out once while ``fit_lasso`` trains on all remaining
    rows; rows past the last fold are only ever used for training.

    Parameters
    ----------
    k : int
        Number of folds.
    learning_rate : float
        Step size forwarded to ``fit_lasso``.
    num_iter : int
        Number of gradient steps forwarded to ``fit_lasso``.
    Lambda : float
        Regularisation strength forwarded to ``fit_lasso``.

    Returns
    -------
    tuple of np.ndarray
        ``(train_metrics, test_metrics)``: the fold averages of the four
        values returned by ``test`` (MSE, MPE, R2, adjusted R2).
    """
    foldsize = len(X) // k

    train_metrics = np.zeros(4)
    test_metrics = np.zeros(4)

    for i in range(k):
        start, stop = i*foldsize, (i+1)*foldsize
        xtest, ytest = X[start:stop], Y[start:stop]
        xtrain = np.concatenate((X[:start], X[stop:]))
        ytrain = np.concatenate((Y[:start], Y[stop:]))

        w, b = fit_lasso(xtrain, ytrain, learning_rate, num_iter, Lambda)
        train_metrics = train_metrics + np.array(test(xtrain, ytrain, w, b))

        # Accumulate into test_metrics itself; adding to train_metrics here
        # would discard earlier folds and mix train scores into the test result.
        test_metrics = test_metrics + np.array(test(xtest, ytest, w, b))

    return train_metrics/k, test_metrics/k
        
# Run the cross-validation for the lasso model and print the averaged
# train and test metrics (order: MSE, MPE, R2, adjusted R2).
train_met ,test_met = cross_validation_lasso()
print("Train data metrics for Linear Regression with lasso regularization")
print(f"MSE : {train_met[0]} , MPE : {train_met[1]} , R2 : {train_met[2]} , R2 adjusted : {train_met[3]}")
print("Test data metrics for Linear Regression with lasso regularization")
print(f"MSE : {test_met[0]} , MPE : {test_met[1]} , R2 : {test_met[2]} , R2 adjusted : {test_met[3]}")

The mean percentage error (MPE) is higher on the test data than on the training data, which indicates overfitting. Regularization reduced the MPE on the test data, and ridge regression performed best overall.