In [1]:
#Ignore warnings
import warnings
warnings.filterwarnings("ignore")

#imports for user defined functions
from env import host, user, password, get_db_url

# Imports for arithmetic calculations and data frame manipulation
import math
import numpy as np
import pandas as pd

#imports for clustering and feature scaling
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

#Imports for creating visualizations
import matplotlib.pyplot as plt 
import seaborn as sns

import viz

from pydataset import data

import wrangle as w
import explore as ex

## Acquire and Preprocess

In [2]:
# Acquire the prepared Zillow data, already divided into train / validate / test.
train, validate, test = w.wrangle_zillow()

# Add the age column.
# Every split is measured against the SAME reference year, taken from train
# only. Using each split's own max(year_built) would let an identical build
# year map to different ages depending on which split the row landed in.
# NOTE(review): a validate/test home built after the newest train home will
# get a negative age with this reference -- confirm that is acceptable.
reference_year = train.year_built.max()

# assign() returns new frames, so nothing handed back by the wrangle step is
# mutated in place.
train = train.assign(age=reference_year - train.year_built)
validate = validate.assign(age=reference_year - validate.year_built)
test = test.assign(age=reference_year - test.year_built)

train.head()

Using cached csv...


Unnamed: 0,bathrooms,bedrooms,building_quality,home_sqft,fips,latitude,longitude,lot_sqft,regionidcity,regionidcounty,...,assessed_value,land_tax_value,tax_amount,censustractandblock,logerror,transactiondate,heating_system,land_use,county,age
67577,2.0,2.0,6.0,815.0,6059.0,33747500.0,-117863000.0,7140.0,47568.0,1286.0,...,167958.0,71579.0,2215.2,60590750022024.0,0.230662,2017-08-17,,Condominium,Orange,27.0
5673,2.0,2.0,8.0,1117.0,6037.0,33820500.0,-118342000.0,986080.0,54722.0,3101.0,...,95425.0,65215.0,1209.14,60376511022012.0,-0.002682,2017-01-26,Central,Condominium,Los Angeles,53.0
16346,3.0,4.0,6.0,1469.0,6037.0,33805856.0,-118124873.0,6145.0,46298.0,3101.0,...,443207.0,310660.0,5404.6,60375742012000.0,0.067708,2017-03-13,Floor/Wall,Single Family Residential,Los Angeles,64.0
24690,3.0,4.0,6.0,2030.0,6059.0,33606164.0,-117670415.0,8662.0,12773.0,1286.0,...,412426.0,243784.0,4196.14,60590320031009.0,0.002729,2017-04-10,,Single Family Residential,Orange,48.0
29587,1.0,2.0,6.0,864.0,6059.0,33844336.0,-117962041.0,7200.0,16764.0,1286.0,...,444000.0,389732.0,5314.98,60590868022006.0,0.017012,2017-04-25,,Single Family Residential,Orange,68.0


In [3]:
# Categorical columns to hand to the project's dummy-encoding helper.
cols = ['heating_system', 'land_use', 'county']

# Encode the three splits in one pass, in train -> validate -> test order.
# NOTE(review): each split is encoded on its own; if a category is absent from
# validate or test, the resulting columns may not line up with train -- confirm
# how w.get_dummies handles that.
train, validate, test = (
    w.get_dummies(split, cols) for split in (train, validate, test)
)

train.head()

Unnamed: 0,bathrooms,bedrooms,building_quality,home_sqft,fips,latitude,longitude,lot_sqft,regionidcity,regionidcounty,...,heating_system_Gravity,heating_system_None,heating_system_Radiant,heating_system_Solar,heating_system_Yes,land_use_Mobile Home,land_use_Single Family Residential,land_use_Townhouse,county_Orange,county_Ventura
67577,2.0,2.0,6.0,815.0,6059.0,33747500.0,-117863000.0,7140.0,47568.0,1286.0,...,0,1,0,0,0,0,0,0,1,0
5673,2.0,2.0,8.0,1117.0,6037.0,33820500.0,-118342000.0,986080.0,54722.0,3101.0,...,0,0,0,0,0,0,0,0,0,0
16346,3.0,4.0,6.0,1469.0,6037.0,33805856.0,-118124873.0,6145.0,46298.0,3101.0,...,0,0,0,0,0,0,1,0,0,0
24690,3.0,4.0,6.0,2030.0,6059.0,33606164.0,-117670415.0,8662.0,12773.0,1286.0,...,0,1,0,0,0,0,1,0,1,0
29587,1.0,2.0,6.0,864.0,6059.0,33844336.0,-117962041.0,7200.0,16764.0,1286.0,...,0,1,0,0,0,0,1,0,1,0


In [4]:
## Build scaled versions of the train, validate and test splits before clustering.
## A copy of the scaled train frame is used later to create the clusters.

# Every float64 column in train is passed to the scaler.
# NOTE(review): the target `logerror` appears to be float64 as well, so it
# would be scaled along with the features -- confirm that is intended.
float_columns = train.select_dtypes(include=['float64']).columns
columns_to_scale = float_columns.tolist()

train_scaled, validate_scaled, test_scaled = w.scale_data(
    train,
    validate,
    test,
    columns_to_scale,
)

In [5]:
# Display the column index of the scaled training frame (sanity check before selecting features)
train_scaled.columns

Index(['bathrooms', 'bedrooms', 'building_quality', 'home_sqft', 'fips',
       'latitude', 'longitude', 'lot_sqft', 'regionidcity', 'regionidcounty',
       'regionidzip', 'rooms', 'year_built', 'structure_tax_value',
       'assessed_value', 'land_tax_value', 'tax_amount', 'censustractandblock',
       'logerror', 'transactiondate', 'age', 'heating_system_Central',
       'heating_system_Floor/Wall', 'heating_system_Forced air',
       'heating_system_Gravity', 'heating_system_None',
       'heating_system_Radiant', 'heating_system_Solar', 'heating_system_Yes',
       'land_use_Mobile Home', 'land_use_Single Family Residential',
       'land_use_Townhouse', 'county_Orange', 'county_Ventura'],
      dtype='object')

## Modeling without Clusters

In [6]:
# Variables selected for this modeling pass.
# Continuous features:
quant_vars = [
    'latitude',
    'longitude',
    'assessed_value',
    'age',
    'home_sqft',
    'bathrooms',
    'bedrooms',
    'building_quality',
]
# Categorical features (names as they were before dummy encoding):
cat_vars = ['county', 'fips', 'heating_system']
# Column the models will predict:
target = ['logerror']

# MVP feature set: every continuous feature plus the two county dummy columns.
mvp = quant_vars + ['county_Orange', 'county_Ventura']
mvp

['latitude',
 'longitude',
 'assessed_value',
 'age',
 'home_sqft',
 'bathrooms',
 'bedrooms',
 'building_quality',
 'county_Orange',
 'county_Ventura']

In [8]:
# First iteration: split each scaled frame into features (X) and target (y),
# keeping only the MVP feature columns.
# The target is `logerror` for EVERY split. Validate and test previously took
# `assessed_value`, which is also one of the features, so out-of-sample scores
# were computed against the wrong column.
X_train, y_train = train_scaled[mvp], train_scaled.logerror
X_validate, y_validate = validate_scaled[mvp], validate_scaled.logerror
X_test, y_test = test_scaled[mvp], test_scaled.logerror

In [14]:
# Mean of logerror on the train frame that was passed into the scaler, for reference
train['logerror'].mean()


0.015491782190204017

### Calculate our baseline
- We will calculate a baseline from both the mean and the median, then keep whichever has the lower RMSE as the baseline to beat.

In [11]:
# Promote each target to a DataFrame so baseline columns can sit beside it.
y_train, y_validate, y_test = (
    pd.DataFrame(target_values) for target_values in (y_train, y_validate, y_test)
)

# Both baselines come from the training target only; the same two constants
# are then applied to every split.
baseline_mean = y_train.logerror.mean()
baseline_median = y_train.logerror.median()

# One predictions frame per split: the target column (renamed to 'actual' when
# it is called 'logerror') followed by the mean and median baseline columns.
predictions_train, predictions_validate, predictions_test = (
    target_frame
    .rename(columns={'logerror': 'actual'})
    .assign(baseline_mean=baseline_mean, baseline_median=baseline_median)
    for target_frame in (y_train, y_validate, y_test)
)

# Sanity check
predictions_train.head()

Unnamed: 0,actual,baseline_mean,baseline_median
67577,0.629947,0.602206,0.60092
5673,0.599863,0.602206,0.60092
16346,0.608938,0.602206,0.60092
24690,0.600561,0.602206,0.60092
29587,0.602402,0.602206,0.60092
