In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
# Read the listings from sf_clean.csv into a DataFrame and preview the first rows
csv_path = "sf_clean.csv"
housedDF = pd.read_csv(csv_path)
housedDF.head(5)

Unnamed: 0,price,sqft,beds,bath,laundry,pets,housing_type,parking,hood_district
0,6800,1600.0,2.0,2.0,(a) in-unit,(d) no pets,(c) multi,(b) protected,7.0
1,3500,550.0,1.0,1.0,(a) in-unit,(a) both,(c) multi,(b) protected,7.0
2,5100,1300.0,2.0,1.0,(a) in-unit,(a) both,(c) multi,(d) no parking,7.0
3,9000,3500.0,3.0,2.5,(a) in-unit,(d) no pets,(c) multi,(b) protected,7.0
4,3100,561.0,1.0,1.0,(c) no laundry,(a) both,(c) multi,(d) no parking,7.0


In [3]:
# Print each column's dtype and non-null count to check for missing values
housedDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          989 non-null    int64  
 1   sqft           989 non-null    float64
 2   beds           989 non-null    float64
 3   bath           989 non-null    float64
 4   laundry        989 non-null    object 
 5   pets           989 non-null    object 
 6   housing_type   989 non-null    object 
 7   parking        989 non-null    object 
 8   hood_district  989 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 69.7+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
housedDF.describe()

Unnamed: 0,price,sqft,beds,bath,hood_district
count,989.0,989.0,989.0,989.0,989.0
mean,3595.035389,976.76542,1.679474,1.390293,7.052578
std,1546.22267,474.629798,1.07671,0.562714,2.404716
min,750.0,150.0,0.0,1.0,1.0
25%,2650.0,650.0,1.0,1.0,6.0
50%,3300.0,900.0,2.0,1.0,8.0
75%,4242.0,1200.0,2.0,2.0,9.0
max,19000.0,3500.0,6.0,4.0,10.0


In [5]:
# Show the column labels currently in the DataFrame
housedDF.columns

Index(['price', 'sqft', 'beds', 'bath', 'laundry', 'pets', 'housing_type',
       'parking', 'hood_district'],
      dtype='object')

In [6]:
# Collect the names of the categorical (object-dtype) columns, in column order
df_cat = [col for col, dtype in housedDF.dtypes.items() if dtype == "object"]

In [7]:
# One-hot encode the categorical columns.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=False` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, so passing it raises TypeError on current releases, while
# .toarray() works on every version that provides get_feature_names_out.
enc = OneHotEncoder()
encoded_values = enc.fit_transform(housedDF[df_cat]).toarray()

# Reuse housedDF's index so every encoded row stays aligned with the row it was
# derived from when the two frames are combined by index.
df_encode = pd.DataFrame(
    encoded_values,
    columns=enc.get_feature_names_out(df_cat),
    index=housedDF.index,
)
df_encode.head()

Unnamed: 0,laundry_(a) in-unit,laundry_(b) on-site,laundry_(c) no laundry,pets_(a) both,pets_(b) dogs,pets_(c) cats,pets_(d) no pets,housing_type_(a) single,housing_type_(b) double,housing_type_(c) multi,parking_(a) valet,parking_(b) protected,parking_(c) off-street,parking_(d) no parking
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [8]:
# Attach the one-hot columns by row index, then remove the original
# categorical columns that they replace
housedDF = (
    housedDF
    .merge(df_encode, left_index=True, right_index=True)
    .drop(columns=df_cat)
)
housedDF.head()

Unnamed: 0,price,sqft,beds,bath,hood_district,laundry_(a) in-unit,laundry_(b) on-site,laundry_(c) no laundry,pets_(a) both,pets_(b) dogs,pets_(c) cats,pets_(d) no pets,housing_type_(a) single,housing_type_(b) double,housing_type_(c) multi,parking_(a) valet,parking_(b) protected,parking_(c) off-street,parking_(d) no parking
0,6800,1600.0,2.0,2.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,3500,550.0,1.0,1.0,7.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,5100,1300.0,2.0,1.0,7.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,9000,3500.0,3.0,2.5,7.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,3100,561.0,1.0,1.0,7.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [9]:
# Remove the hood_district column from the frame
housedDF = housedDF.drop(columns=["hood_district"])
housedDF.head()

Unnamed: 0,price,sqft,beds,bath,laundry_(a) in-unit,laundry_(b) on-site,laundry_(c) no laundry,pets_(a) both,pets_(b) dogs,pets_(c) cats,pets_(d) no pets,housing_type_(a) single,housing_type_(b) double,housing_type_(c) multi,parking_(a) valet,parking_(b) protected,parking_(c) off-street,parking_(d) no parking
0,6800,1600.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,3500,550.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,5100,1300.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,9000,3500.0,3.0,2.5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,3100,561.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [10]:
#sns.pairplot(housedDF)

In [11]:
#sns.heatmap(housedDF.corr(), annot=True)

In [12]:
# Separate the preprocessed data into the feature matrix (X) and the target (y)
feature_columns = [
    "sqft",
    "beds",
    "bath",
    "laundry_(a) in-unit",
    "laundry_(b) on-site",
    "laundry_(c) no laundry",
    "pets_(a) both",
    "pets_(b) dogs",
    "pets_(c) cats",
    "pets_(d) no pets",
    "housing_type_(a) single",
    "housing_type_(b) double",
    "housing_type_(c) multi",
    "parking_(a) valet",
    "parking_(b) protected",
    "parking_(c) off-street",
    "parking_(d) no parking",
]
X = housedDF[feature_columns]

# price is the value the model is trained to predict
y = housedDF["price"]

In [13]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [14]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same fitted transform to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Create a linear regression estimator with its default settings (not yet fitted)
model = LinearRegression()

In [17]:
# Train on the training split only. Fitting on the full X/y lets the model see
# the held-out rows, so any evaluation on X_test/y_test would be a leaked,
# optimistic estimate. The unscaled features are used on purpose: every later
# predict() call in this notebook passes unscaled X / X_test, and the model must
# be fitted in the same feature space it is asked to predict in.
model.fit(X_train, y_train)

LinearRegression()

In [18]:
# Predict a price for every row of X and print the shape of the result
y_pred = model.predict(X)
print(y_pred.shape)

(989,)


In [None]:
# Show the shape of the target y
y.shape

In [None]:
# Plot actual and predicted prices against square footage.
# X holds many feature columns, so plt.scatter(X, y) fails because x and y must
# have the same size; a single feature (sqft) is used for the horizontal axis.
fig, ax = plt.subplots()
ax.scatter(X["sqft"], y, alpha=0.5, label="Actual")
# Predictions depend on every feature, so they do not fall on one line when
# drawn against sqft alone; draw them as points instead of a connected line.
ax.scatter(X["sqft"], y_pred, color="red", s=10, label="Predicted")
ax.set_xlabel("sqft")
ax.set_ylabel("price")
ax.set_title("Actual vs. predicted price by square footage")
ax.legend()
plt.show()

In [None]:
# Pair each fitted coefficient with the name of its feature.
# The regression estimator is bound to `model`; no `lm` is defined in the
# visible cells, so referencing it raises NameError on a fresh kernel.
coeff_df = pd.DataFrame(model.coef_, index=X.columns, columns=["Coefficient"])

In [None]:
# Display the table of per-feature coefficients
coeff_df

In [None]:
# Predict prices for the held-out test features and place them next to the
# true prices. The fitted estimator is `model`; no `lm` is defined in the
# visible cells, so referencing it raises NameError on a fresh kernel.
predictions = model.predict(X_test)
results = pd.DataFrame({
   "Prediction": predictions,
   "Actual": y_test
}).reset_index(drop=True)  # discard the shuffled row labels inherited from y_test
results.head()

In [None]:
# Compare predicted prices with the actual prices on the test set.
# Axis labels are set so it is clear which axis is which, and plt.show()
# keeps the artist repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.set_xlabel("Actual price")
ax.set_ylabel("Predicted price")
ax.set_title("Predicted vs. actual price (test set)")
plt.show()

In [None]:
# Distribution of the residuals (actual minus predicted) on the test set.
# `sns` is not imported in the visible cells and seaborn's distplot is
# deprecated, so the histogram is drawn with matplotlib (already imported).
# density=True keeps the vertical axis on a density scale.
residuals = y_test - predictions
fig, ax = plt.subplots()
ax.hist(residuals, bins=50, density=True)
ax.set_xlabel("Residual (actual - predicted)")
ax.set_ylabel("Density")
ax.set_title("Residual distribution (test set)")
plt.show()

In [None]:
# Assess the model on the test set.
# accuracy_score is a classification metric and raises ValueError for a
# continuous target such as price, so regression metrics are reported instead:
# R^2 (share of variance explained) and mean absolute error (in price units).
from sklearn.metrics import mean_absolute_error, r2_score

pd.Series(
    {
        "r2": r2_score(y_test, predictions),
        "mean_absolute_error": mean_absolute_error(y_test, predictions),
    },
    name="test_set",
)