In [1]:
import pandas as pd
import numpy as np

In [2]:
def addIntercept(df):
    intercept = pd.DataFrame(np.ones((len(df),1)))
    df = pd.concat((intercept,df),axis=1)
    return df

In [3]:
def trainTestSplit(df,splitPercentage):
    df_train = df[df.index > len(df)*splitPercentage]
    df_test = df[df.index <= len(df)*splitPercentage]
    y_train = df_train.iloc[:,-1].values.reshape((len(df_train),1))
    y_test = df_test.iloc[:,-1].values.reshape((len(df_test),1))
    X_train = df_train.iloc[:,:-1].values
    X_test = df_test.iloc[:,:-1].values
    return X_train,X_test,y_train,y_test

In [4]:
def initializeTheta(df_col_len):
    theta = np.zeros((df_col_len-1,1))
    return theta

In [5]:
def costFunction(X,y,theta):
    m = y.shape[0]
    h_theta = X.dot(theta)
    cost = (1/2*m)*(np.sum(np.square(h_theta - y)))
    return cost

In [12]:
def gradientDescent(learning_rate,theta,X_train,y_train,epochs):
    m = X_train.shape[0]
    cost = float('inf')
    for i in range(epochs):
        for j in range(m):
            h_theta = X_train.dot(theta)
            theta = theta - (1/m)*learning_rate*(X_train.T.dot((h_theta-y_train)))
            cost += costFunction(X_train,y_train,theta)
    return theta

In [13]:
def pipeline(df):
    df = addIntercept(df)
    X_train,X_test,y_train,y_test = trainTestSplit(df,0.30)
    theta = initializeTheta(len(df.columns))
    theta = gradientDescent(1e-7,theta,X_train,y_train,10000)
    mse = costFunction(X_test,y_test,theta)
    return(mse)
    

In [14]:
if __name__ == "__main__":
    df = pd.read_csv('real_estate_data.csv')
    mse = pipeline(df)
    print(mse)

1990318.7853002988
