In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [2]:
#load data frame with cleaned data, convert zipcode and MLS# columns to strings
df = pd.read_csv("C:/Users/jklenarz/Downloads/cleaned_redfin_dakota_county_3_15.csv")
df['ZIP OR POSTAL CODE'] = df['ZIP OR POSTAL CODE'].astype(str)
df['MLS#'] = df['MLS#'].astype(str)
df.head()

Unnamed: 0,PROPERTY TYPE,ADDRESS,CITY,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,HOA/MONTH,MLS#,LATITUDE,LONGITUDE
0,Townhouse,845 Wescott Sq,Eagan,55123,349000,3,3,Eagan,1546,5445.0,1994,5,250.0,6159156,44.833087,-93.129735
1,Townhouse,18314 Gladden Ln,Lakeville,55044,425000,4,4,Lakeville,2290,1785.0,2019,19,205.0,6153258,44.683365,-93.220157
2,Single Family Residential,17904 Greenwich Way,Lakeville,55044,574900,4,3,Summers Creek,3194,4486.0,2022,1,160.0,6164143,44.691318,-93.226815
3,Single Family Residential,17908 Greenwich Way,Lakeville,55044,679900,5,3,Summers Creek,3611,4486.0,2022,1,160.0,6163589,44.691269,-93.226611
4,Townhouse,4837 Bisset Ln #8105,Inver Grove Heights,55076,280000,2,2,Inver Grove Heights,1524,26264.151515,2005,1,336.0,6164131,44.878561,-93.055997


In [3]:
#process data one-hot-encoding for location, property type; run regression on bed,bath,sqfeet,lotsize,proptype,location,HOA/month,?yearbuilt,?daysonmar
newdf = pd.get_dummies(df,columns=['LOCATION','PROPERTY TYPE'])
newdf.head()

Unnamed: 0,ADDRESS,CITY,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,...,LOCATION_The Reserve at Twin Lakes,LOCATION_Vita Attiva At South Creek,LOCATION_Vita Attiva at South Creek,LOCATION_Welch,LOCATION_West Saint Paul,LOCATION_Wood Haven,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Multi-Family (2-4 Unit),PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse
0,845 Wescott Sq,Eagan,55123,349000,3,3,1546,5445.0,1994,5,...,0,0,0,0,0,0,0,0,0,1
1,18314 Gladden Ln,Lakeville,55044,425000,4,4,2290,1785.0,2019,19,...,0,0,0,0,0,0,0,0,0,1
2,17904 Greenwich Way,Lakeville,55044,574900,4,3,3194,4486.0,2022,1,...,0,0,0,0,0,0,0,0,1,0
3,17908 Greenwich Way,Lakeville,55044,679900,5,3,3611,4486.0,2022,1,...,0,0,0,0,0,0,0,0,1,0
4,4837 Bisset Ln #8105,Inver Grove Heights,55076,280000,2,2,1524,26264.151515,2005,1,...,0,0,0,0,0,0,0,0,0,1


In [4]:
#create features set
X = newdf.drop(['ADDRESS','CITY','ZIP OR POSTAL CODE','PRICE','MLS#','LATITUDE','LONGITUDE'],axis=1)
X.head()

Unnamed: 0,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,HOA/MONTH,LOCATION_AMBERWOOD,LOCATION_Apple Valley,LOCATION_Ardan Place,...,LOCATION_The Reserve at Twin Lakes,LOCATION_Vita Attiva At South Creek,LOCATION_Vita Attiva at South Creek,LOCATION_Welch,LOCATION_West Saint Paul,LOCATION_Wood Haven,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Multi-Family (2-4 Unit),PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse
0,3,3,1546,5445.0,1994,5,250.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,4,4,2290,1785.0,2019,19,205.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4,3,3194,4486.0,2022,1,160.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5,3,3611,4486.0,2022,1,160.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2,2,1524,26264.151515,2005,1,336.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
#create target vector
Y = newdf['PRICE'].to_numpy()

In [6]:
#split data into a training and testing set
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=2)

In [8]:
#fit a linear model to the training set and check accuracy on the test set
regr = linear_model.LinearRegression()
model = regr.fit(X_train,Y_train)

Yhat= regr.predict(X_test)
score=r2_score(Y_test,Yhat)
print("R2 score is: %.2f"%score)
print("Mean squared error is: %.2f"%mean_squared_error(Y_test,Yhat))
print("Root mean squared error of is: %.2f"%np.sqrt(mean_squared_error(Y_test,Yhat)))

R2 score is: 0.56
Mean squared error is: 32090544095.68
Root mean squared error of is: 179138.34


In [9]:
feature_names = list(X.columns)

coefs = pd.DataFrame(regr.coef_,columns=["Coefficients"],index=feature_names,)

coefs

Unnamed: 0,Coefficients
BEDS,-58802.849597
BATHS,36520.748113
SQUARE FEET,144.433326
LOT SIZE,1.115496
YEAR BUILT,1618.002767
...,...
LOCATION_Wood Haven,0.000000
PROPERTY TYPE_Condo/Co-op,-72561.470316
PROPERTY TYPE_Multi-Family (2-4 Unit),-3329.722391
PROPERTY TYPE_Single Family Residential,91509.568502


In [10]:
newdf['PRICE'].mean()

532898.0515759313

Re-run without days on market and year built


In [11]:
#create features set
X2 = newdf.drop(['ADDRESS','CITY','ZIP OR POSTAL CODE','PRICE','YEAR BUILT','DAYS ON MARKET','MLS#','LATITUDE','LONGITUDE'],axis=1)
X2.head()

Unnamed: 0,BEDS,BATHS,SQUARE FEET,LOT SIZE,HOA/MONTH,LOCATION_AMBERWOOD,LOCATION_Apple Valley,LOCATION_Ardan Place,LOCATION_Aspen Grove,LOCATION_Avebury Place,...,LOCATION_The Reserve at Twin Lakes,LOCATION_Vita Attiva At South Creek,LOCATION_Vita Attiva at South Creek,LOCATION_Welch,LOCATION_West Saint Paul,LOCATION_Wood Haven,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Multi-Family (2-4 Unit),PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse
0,3,3,1546,5445.0,250.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,4,4,2290,1785.0,205.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4,3,3194,4486.0,160.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5,3,3611,4486.0,160.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2,2,1524,26264.151515,336.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
#split data into a training and testing set
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2,Y,test_size=0.2,random_state=2)

In [14]:
#fit a linear model to the training set and check accuracy on the test set
regr2 = linear_model.LinearRegression()
model = regr2.fit(X2_train,Y2_train)

Yhat2= regr2.predict(X2_test)
score=r2_score(Y2_test,Yhat2)
print("R2 score is: %.2f"%score)
print("Mean squared error is: %.2f"%mean_squared_error(Y2_test,Yhat2))
print("Root mean squared error of is: %.2f"%np.sqrt(mean_squared_error(Y2_test,Yhat2)))

R2 score is: 0.50
Mean squared error is: 35893624017.58
Root mean squared error of is: 189456.13
