In [50]:
#import dependencies 
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import psycopg2
from config import db_password

In [51]:
#Setup database
db_string = f"postgresql://postgres:{db_password}@housing-prices.ch2ctomvepex.us-east-1.rds.amazonaws.com:5432/housing-prices"
engine = create_engine(db_string)

#Reflect
Base = automap_base()
#Reflect the tables
Base.prepare(engine, reflect=True)

In [52]:
#Create Session
session = Session(engine)

In [53]:
# Example - load data from RDS into pandas DataFrame

sf_db = pd.read_sql_table('sanfran', engine)
sf_db.head()



Unnamed: 0,price,bedroom,sqft,neighborhood,bathroom,amenity_count,cityid
0,3985,2,920.0,cole valley / ashbury hts,1.0,5,2
1,3961,1,745.0,SOMA / south beach,1.0,9,2
2,7435,3,1146.0,SOMA / south beach,2.0,9,2
3,2785,2,1000.0,alamo square / nopa,1.5,7,2
4,7250,3,1146.0,SOMA / south beach,2.0,9,2


In [54]:
#importing csv
housedDF = pd.read_csv("sf_clean.csv")
housedDF.head()

Unnamed: 0,price,sqft,beds,bath,laundry,pets,housing_type,parking,hood_district
0,6800,1600.0,2.0,2.0,(a) in-unit,(d) no pets,(c) multi,(b) protected,7.0
1,3500,550.0,1.0,1.0,(a) in-unit,(a) both,(c) multi,(b) protected,7.0
2,5100,1300.0,2.0,1.0,(a) in-unit,(a) both,(c) multi,(d) no parking,7.0
3,9000,3500.0,3.0,2.5,(a) in-unit,(d) no pets,(c) multi,(b) protected,7.0
4,3100,561.0,1.0,1.0,(c) no laundry,(a) both,(c) multi,(d) no parking,7.0


In [55]:
housedDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          989 non-null    int64  
 1   sqft           989 non-null    float64
 2   beds           989 non-null    float64
 3   bath           989 non-null    float64
 4   laundry        989 non-null    object 
 5   pets           989 non-null    object 
 6   housing_type   989 non-null    object 
 7   parking        989 non-null    object 
 8   hood_district  989 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 69.7+ KB


In [57]:
#dropping hood_distric and pets column
housedDF = housedDF.drop(["pets","hood_district"],axis=1)


KeyError: "['pets' 'hood_district'] not found in axis"

In [58]:
housedDF.head()


Unnamed: 0,price,sqft,beds,bath,laundry,housing_type,parking
0,6800,1600.0,2.0,2.0,(a) in-unit,(c) multi,(b) protected
1,3500,550.0,1.0,1.0,(a) in-unit,(c) multi,(b) protected
2,5100,1300.0,2.0,1.0,(a) in-unit,(c) multi,(d) no parking
3,9000,3500.0,3.0,2.5,(a) in-unit,(c) multi,(b) protected
4,3100,561.0,1.0,1.0,(c) no laundry,(c) multi,(d) no parking


In [59]:
#encoding features and dropping the original un-encoded columns
housedDF=housedDF.join(pd.get_dummies(housedDF.laundry)).drop(["laundry"],axis=1)
housedDF= housedDF.join(pd.get_dummies(housedDF.housing_type)).drop(["housing_type"],axis=1)
housedDF = housedDF.join(pd.get_dummies(housedDF.parking)).drop(["parking"],axis=1)

In [61]:
housedDF.head()

Unnamed: 0,price,sqft,beds,bath,(a) in-unit,(b) on-site,(c) no laundry,(a) single,(b) double,(c) multi,(a) valet,(b) protected,(c) off-street,(d) no parking
0,6800,1600.0,2.0,2.0,1,0,0,0,0,1,0,1,0,0
1,3500,550.0,1.0,1.0,1,0,0,0,0,1,0,1,0,0
2,5100,1300.0,2.0,1.0,1,0,0,0,0,1,0,0,0,1
3,9000,3500.0,3.0,2.5,1,0,0,0,0,1,0,1,0,0
4,3100,561.0,1.0,1.0,0,0,1,0,0,1,0,0,0,1


In [62]:
#split the target variable 
from sklearn.model_selection import train_test_split

X= housedDF.drop(["price"],axis=1)
y= housedDF["price"]

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)

In [64]:
train_data = X_train.join(y_train)

In [65]:
y_train

421    2500
492    3895
930    4195
633    5495
796    3495
       ... 
713    3995
517    3898
559    3900
48     2600
389    3500
Name: price, Length: 791, dtype: int64

In [72]:
#scaling the data
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train, y_train = housedDF.drop(["price"],axis=1), housedDF["price"]

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
reg = LinearRegression()

reg.fit(X_train_scaled,y_train)

LinearRegression()

In [73]:
test_data = X_test.join(y_test)

test_data

Unnamed: 0,sqft,beds,bath,(a) in-unit,(b) on-site,(c) no laundry,(a) single,(b) double,(c) multi,(a) valet,(b) protected,(c) off-street,(d) no parking,price
827,550.0,1.0,1.0,0,0,1,0,0,1,0,0,0,1,1890
902,600.0,1.0,1.0,0,0,1,0,0,1,0,1,0,0,2300
811,656.0,1.0,1.0,0,1,0,0,0,1,0,0,0,1,1674
904,900.0,1.0,1.0,0,1,0,0,0,1,0,1,0,0,3250
347,650.0,1.0,1.0,0,1,0,0,0,1,0,1,0,0,2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,1800.0,4.0,2.0,0,1,0,0,0,1,0,0,0,1,3995
800,400.0,0.0,1.0,0,1,0,0,0,1,0,0,0,1,1600
770,1050.0,2.0,1.0,0,1,0,0,0,1,0,1,0,0,4100
894,1075.0,2.0,1.0,0,1,0,0,0,1,0,0,0,1,3595


In [75]:
#machine learning accuracy results
reg.score(X_test_scaled,y_test)

0.7520804354895171

In [76]:
#that was linear regression model
# now will use a random forest model

In [77]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()

forest.fit(X_train_scaled,y_train)

RandomForestRegressor()

In [78]:
forest.score(X_test_scaled,y_test)

0.9353719265238171

In [79]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [3,10,30],
    "max_features": [2,4,6,8]      
}
grid_search = GridSearchCV(forest, param_grid, cv=5,
                          scoring="neg_mean_squared_error",
                          return_train_score=True)

grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': [2, 4, 6, 8],
                         'n_estimators': [3, 10, 30]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [80]:
best_forest= grid_search.best_estimator_

In [81]:
best_forest.score(X_test_scaled,y_test)

0.942124440909403