In [1]:
#import dependencies 
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import psycopg2
from config import db_password

In [2]:
#Setup database
db_string = f"postgresql://postgres:{db_password}@housing-prices.ch2ctomvepex.us-east-1.rds.amazonaws.com:5432/housing-prices"
engine = create_engine(db_string)

#Reflect
Base = automap_base()
#Reflect the tables
Base.prepare(engine, reflect=True)

In [21]:
#loading csv files into one dataframe
#did not add Pittsburg.csv as it amenity count column was named something else. too much work for only 86 values
housing_df = pd.concat(map(pd.read_csv, ["sf_clean_data.csv","austin_clean_data.csv",
                                        "boston_clean_data.csv","chandler_clean_data.csv",
                                        "chicago_clean_data.csv","la_clean_data.csv",
                                        "spokane_clean_data.csv","houston_clean_data.csv"]))
housing_df

Unnamed: 0,price,bedroom,sqft,neighborhood,bathroom,amenity_count,cityID
0,3985,2,920.0,cole valley / ashbury hts,1.0,5,2.0
1,3961,1,745.0,SOMA / south beach,1.0,9,2.0
2,7435,3,1146.0,SOMA / south beach,2.0,9,2.0
3,2785,2,1000.0,alamo square / nopa,1.5,7,2.0
4,7250,3,1146.0,SOMA / south beach,2.0,9,2.0
...,...,...,...,...,...,...,...
60,1142,2,1076.0,"7000 Fonvilla St, Houston, TX",1.0,7,
61,999,1,716.0,Houston,1.0,5,
62,864,1,700.0,Houston,1.0,5,
63,1100,2,900.0,"9550 Long Point Road, Houston, TX",1.0,8,


In [22]:
#dropping columns for the ML model
housing_df = housing_df.drop(['neighborhood','cityID'],axis=1)
housing_df

Unnamed: 0,price,bedroom,sqft,bathroom,amenity_count
0,3985,2,920.0,1.0,5
1,3961,1,745.0,1.0,9
2,7435,3,1146.0,2.0,9
3,2785,2,1000.0,1.5,7
4,7250,3,1146.0,2.0,9
...,...,...,...,...,...
60,1142,2,1076.0,1.0,7
61,999,1,716.0,1.0,5
62,864,1,700.0,1.0,5
63,1100,2,900.0,1.0,8


In [24]:
#split the target variable 
from sklearn.model_selection import train_test_split

X= housing_df.drop(["price"],axis=1)
y= housing_df["price"]

In [25]:
#splitting the data into training and testing sets. Set the testing set to 20% 
#so that means 70% of the data would be used to train the model

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)

In [27]:
#scaling the data
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train, y_train = housing_df.drop(["price"],axis=1), housing_df["price"]

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
reg = LinearRegression()

reg.fit(X_train_scaled,y_train)

LinearRegression()

In [28]:
#machine learning accuracy results
reg.score(X_test_scaled,y_test)

0.2986399565618376

In [29]:
#that was linear regression model
# now will use a random forest model

In [30]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()

forest.fit(X_train_scaled,y_train)

RandomForestRegressor()

In [31]:
#accuracy results
forest.score(X_test_scaled,y_test)

0.9215823762994333

In [32]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [3,10,30],
    "max_features": [2,4,6,8]      
}
grid_search = GridSearchCV(forest, param_grid, cv=5,
                          scoring="neg_mean_squared_error",
                          return_train_score=True)

grid_search.fit(X_train_scaled, y_train)

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/victoralvarado/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/victoralvarado/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/sklearn/ensemble/_forest.py", line 459, in fit
    for i, t in enumerate(trees)
  File "/Users/victoralvarado/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/victoralvarado/opt/anaconda3/envs/mle

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': [2, 4, 6, 8],
                         'n_estimators': [3, 10, 30]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [33]:
best_forest= grid_search.best_estimator_

In [34]:
best_forest.score(X_test_scaled,y_test)

0.9235716660073271