In [17]:
import pandas as pd
import numpy as np
from numpy.linalg import inv

In [18]:
def trainTestSplit(df,splitPercentage):
    '''test train splitting and converting it to numpy array'''
    df_test = df[df.index <= len(df)*splitPercentage]
    df_train = df[df.index > len(df)*splitPercentage]
    y_test = df_test[df.columns[-1]].to_numpy().reshape(len(df_test),1)
    y_train = df_train[df.columns[-1]].to_numpy().reshape(len(df_train),1)
    X_test = df_test.iloc[:,0:len(df.columns)-1]
    X_train = df_train.iloc[:,0:len(df.columns)-1]
    return X_train,X_test,y_train,y_test

In [19]:
def addX_0(X):
    '''adding X_0 which would be 1'''
    a = np.ones((X.shape[0],1))
    X = np.append(a,X,axis=1)
    return(X)

In [20]:
def getTheta(X,y):
    '''calculate theta values using normal equation'''
    X_transpose = X.T
    theta = inv(X_transpose.dot(X)).dot(X_transpose).dot(y)
    return theta

In [21]:
def mse(y_test,y_pred):
    '''calculate MSE'''
    arr_len = y_test.shape[0]
    mse_val = 0
    for i in range(arr_len):
        mse_val += (y_pred[i] - y_test[i])**2
    return mse_val

In [22]:
def pipeline(df):
    '''Pipeline for regression using Normal equation'''
    X_train,X_test,y_train,y_test = trainTestSplit(df,0.30)
    X_train = addX_0(X_train)
    X_test = addX_0(X_test)
    theta = getTheta(X_train,y_train)
    y_pred = X_test.dot(theta)
    return(mse(y_test,y_pred))

In [23]:
if __name__ == "__main__":
    df = pd.read_csv('real_estate_data.csv')
    mse = pipeline(df)
    print(mse)

[9327.49647662]
