In [37]:
# Import libraries
# Any missing library can be installed with pip, e.g. `pip install numpy`.
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Download the raw listings CSV from the project repository.
url = "https://raw.githubusercontent.com/leroymusa/EECS3401-Final-Project/main/house_prices.csv"
housing_prices = pd.read_csv(url, sep=',')

# Keep an independent backup of the full dataset. `.copy()` matters here:
# plain assignment would only bind a second name to the same DataFrame, so any
# later in-place modification would silently change the "backup" as well.
housing_backup = housing_prices.copy()

# Continue with the first 50 rows only; the full data remains in housing_backup.
housing_prices = housing_prices.head(50)
housing_prices

Unnamed: 0.1,Unnamed: 0,property_type,price,location,city,baths,purpose,bedrooms,Area_in_Marla
0,0,Flat,10000000,G-10,Islamabad,2,For Sale,2,4.0
1,1,Flat,6900000,E-11,Islamabad,3,For Sale,3,5.6
2,2,House,16500000,G-15,Islamabad,6,For Sale,5,8.0
3,3,House,43500000,Bani Gala,Islamabad,4,For Sale,4,40.0
4,4,House,7000000,DHA Defence,Islamabad,3,For Sale,3,8.0
5,7,Flat,7800000,E-11,Islamabad,2,For Sale,2,6.2
6,9,Penthouse,40000000,F-11,Islamabad,5,For Sale,5,20.0
7,10,Flat,35000000,Diplomatic Enclave,Islamabad,3,For Sale,3,7.1
8,13,Flat,13500000,DHA Defence,Islamabad,5,For Sale,3,10.0
9,14,Flat,3600000,E-11,Islamabad,1,For Sale,1,3.1


In [38]:
# Clean the working sample in one chain.
# Order matters: 'Unnamed: 0' holds a 0 in the first row, so it has to be
# dropped BEFORE the zero filter, otherwise that row would be discarded too.
housing_prices = (
    housing_prices
    # errors='ignore' keeps this cell re-runnable once the column is already gone
    .drop(columns='Unnamed: 0', errors='ignore')
    # keep only the rows that have no zero value in any column
    .loc[lambda frame: (frame != 0).all(axis=1)]
    .drop_duplicates()
)

In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# housing_prices = housing_prices.drop('location', axis = 1)
# Partition the columns by dtype: numeric columns are imputed and scaled,
# every other column is imputed and one-hot encoded.
num_cols = list(housing_prices.select_dtypes(include='number').columns)
cat_cols = list(housing_prices.select_dtypes(exclude='number').columns)

# "price" is taken out of the numeric feature list; it is not listed in any
# transformer below, so it reaches the output via remainder='passthrough'.
num_cols.remove("price")

# Numeric branch: fill gaps with the column mean, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# sparse_output=False makes the encoder return a dense array; per the original
# note, leaving the output sparse caused an error.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False),
)

# Route each column group to its pipeline; columns not listed are passed through unchanged.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='passthrough',
)

preprocessing

In [40]:
# Fit the preprocessing on the cleaned sample and transform it in the same call.
prepared_array = preprocessing.fit_transform(housing_prices)

# Wrap the result in a DataFrame labelled with the transformer's output names
# (prefixed "num__", "cat__" and "remainder__").
feature_names = preprocessing.get_feature_names_out()
housing_prepared = pd.DataFrame(data=prepared_array, columns=feature_names)

housing_prepared


Unnamed: 0,num__baths,num__bedrooms,num__Area_in_Marla,cat__property_type_Flat,cat__property_type_House,cat__property_type_Lower Portion,cat__property_type_Penthouse,cat__location_Allama Iqbal Town,cat__location_Askari,cat__location_B-17,...,cat__location_Koral Town,cat__location_Military Accounts Housing Society,cat__location_Multan Road,cat__location_Pakistan Town,cat__location_Soan Garden,cat__city_Islamabad,cat__city_Lahore,cat__purpose_For Rent,cat__purpose_For Sale,remainder__price
0,-1.285008,-1.231042,-0.88097,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,10000000.0
1,-0.607202,-0.527589,-0.68174,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6900000.0
2,1.426218,0.879316,-0.382895,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,16500000.0
3,0.070605,0.175863,3.601706,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,43500000.0
4,-0.607202,-0.527589,-0.382895,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,7000000.0
5,-1.285008,-1.231042,-0.607029,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,7800000.0
6,0.748411,0.879316,1.11133,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,40000000.0
7,-0.607202,-0.527589,-0.494962,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,35000000.0
8,0.748411,-0.527589,-0.133858,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,13500000.0
9,-1.962815,-1.934494,-0.993037,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3600000.0


In [41]:
from sklearn.model_selection import train_test_split

# Separate the target (the passed-through price column) from the model inputs.
y = housing_prepared["remainder__price"]
x = housing_prepared.drop(columns=["remainder__price"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

(38, 42) (38,) (10, 42) (10,)


In [42]:
# Model 1
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression on the training split.
# NOTE(review): the shapes printed for the split show 38 training rows against
# 42 feature columns - confirm that an unregularised fit is meaningful here.
lr_model = LinearRegression().fit(x_train, y_train)
lr_model

In [43]:
# Predict prices for the held-out test rows with the fitted linear model.
y_pred = lr_model.predict(x_test)
# Display the raw predictions.
y_pred

array([ 1.60216347e+18,  2.36398720e+07, -3.65285760e+07,  3.72805760e+07,
        2.87223052e+18,  2.62408320e+07, -5.73064451e+18,  6.41926400e+06,
        1.61848960e+07, -1.27006711e+18])

In [44]:
# Display the actual target values of the test split.
y_test

27       17000.0
40    21200000.0
26      175000.0
43    40000000.0
24       33000.0
37    27500000.0
12    26900000.0
19     6300000.0
4      7000000.0
25       45000.0
Name: remainder__price, dtype: float64

In [45]:
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [46]:
def get_mse(model, x=None, y=None):
    """Return the mean squared error of ``model`` on a feature/target pair.

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``predict``.
    x : optional
        Features to predict on. Defaults to the notebook-level ``x_test``.
    y : optional
        True target values matching ``x``. Defaults to the notebook-level
        ``y_test``.

    Returns
    -------
    The value returned by ``mean_squared_error(y, model.predict(x))``.
    """
    # Fall back to the notebook's test split so existing get_mse(model) calls
    # behave exactly as before.
    features = x_test if x is None else x
    targets = y_test if y is None else y
    predictions = model.predict(features)
    return mean_squared_error(targets, predictions)

In [47]:
# Assemble the comparison table: one row per model, one column per metric.
# The metric variables are expected to be defined by earlier cells.
columns = ["MSE", "MAE", "RMSE", "R2"]
labels = ['Linear Regression', "Decision Tree Regression", "Random Forest Regression"]
rows = [
    [mse_lr, mae_lr, rmse_lr, r2_lr],
    [mse_dtr, mae_dtr, rmse_dtr, r2_dtr],
    [mse_rfr, mae_rfr, rmse_rfr, r2_rfr],
]
pd.DataFrame(data=rows, index=labels, columns=columns)

LR mean standard error: 4.526999286614878e+36
