In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from prepare_zillow import fill_nulls
from prepare_zillow import wrangle_zillow
from prepare_zillow import null_dropper
from prepare_zillow import handle_outliers
from prepare_zillow import rename_cols
from prepare_zillow import dummy_var
from prepare_zillow import home_age
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors


# Render floats with thousands separators and two decimals,
# right-aligned in a 20-character field.
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Suppress all warnings from this point on.
import warnings
warnings.filterwarnings(action="ignore")

# Wrangling
from sklearn.model_selection import train_test_split
from env import host, user, password
import acquire_zillow
from functions import split

In [2]:
# Load the Zillow data through the project's acquire_zillow helper and preview the first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column — presumably an index written into a
# cached CSV; confirm in acquire_zillow and drop it there if so.
df = acquire_zillow.get_zillow_data()
df.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,,3.5,4.0,,,3.5,,...,60590630072012.0,0.03,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,,1.0,2.0,,,1.0,,...,61110010023006.0,0.06,2017-01-01,,,,,Single Family Residential,,
2,14186244,,,,2.0,3.0,,,2.0,,...,60590218022012.0,0.01,2017-01-01,,,,,Single Family Residential,,
3,12177905,,,,3.0,4.0,,8.0,3.0,,...,60373001001006.0,-0.1,2017-01-01,,,,Central,Single Family Residential,,
4,12095076,1.0,,,3.0,4.0,,9.0,3.0,,...,60374608001014.0,-0.0,2017-01-01,Central,,,Central,Single Family Residential,,


In [None]:
# Translate the FIPS code into a county label: 6037 -> 'LA', 6059 -> 'Orange',
# and every other value (including missing ones) -> 'Ventura'.
is_la = df.fips == 6037
is_orange = df.fips == 6059
df['county'] = np.select([is_la, is_orange], ['LA', 'Orange'], default='Ventura')

In [None]:
# Run the project's wrangle_zillow pipeline and preview the result.
# `df` is rebound to the wrangled frame, so re-running this cell feeds already-wrangled
# data back into wrangle_zillow; restart from the acquire cell instead.
df = wrangle_zillow(df)
df.head()

In [None]:
# Preview the wrangled frame (same preview as the end of the previous cell).
df.head()

In [None]:
# Columns removed before modelling; everything else is carried into `df_new`.
cols_to_drop = [
    'parcelid', 'tax_amount', 'sqft', 'year_built', 'fips', 'bed_bath',
    'lot_size', 'regionidcity', 'regionidzip', 'property_desc',
    'rawcensustractandblock', 'structuretaxvaluedollarcnt',
    'censustractandblock', 'transaction_date', 'roomcnt', 'hashottuborspa',
    'assessment_year', 'propertylandusetypeid', 'fullbathcnt',
]
df_new = df.drop(columns=cols_to_drop)



In [None]:
# Preview the reduced frame after the column drop.
df_new.head()

In [None]:
# Count the missing values remaining in each column.
df_new.isnull().sum()

In [None]:
# List the columns that survived the drop.
df_new.columns

split dataframe

In [None]:
# Split with the project's `split` helper, using `logerror` as the target.
# Only `train` is bound as a whole frame here; validate and test are available
# solely as their X_/y_ pieces.
# NOTE(review): presumably X_* are the features without `logerror` and y_* the target — confirm in functions.split.
train, X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_new, target_var='logerror')


In [None]:
# Preview the training split.
train.head()

In [None]:
# Histogram of every numeric column in `train`, one panel per column.
# The grid is sized from the number of numeric columns so that none is silently
# skipped (a fixed 1x3 grid combined with zip() only ever drew the first three).
numeric_cols = train.select_dtypes('number').columns
n_plot_cols = 3
n_plot_rows = max(1, -(-len(numeric_cols) // n_plot_cols))  # ceiling division

fig, axs_grid = plt.subplots(n_plot_rows, n_plot_cols,
                             figsize=(15, 5 * n_plot_rows), squeeze=False)
axs = axs_grid.ravel()  # flat view so panels can be paired with columns

for ax, col in zip(axs, numeric_cols):
    train[col].plot.hist(ax=ax, title=col, ec='black')

# Hide the unused panels left over in the last row.
for ax in axs[len(numeric_cols):]:
    ax.set_visible(False)

fig.tight_layout()

In [None]:
import seaborn as sns

# Distribution of the target (log error) across the whole wrangled frame.
# NOTE(review): `distplot` is deprecated in recent seaborn releases — consider
# `histplot`/`displot` once the installed seaborn version is confirmed.
ax = sns.distplot(df.logerror)
ax.set_title('Log Error Distribution', fontsize=20)
ax.set_xlabel('Log Error')

plt.show()

Log error looks approximately normally distributed.

In [None]:

# Correlation heatmap of the numeric columns in `train` (target included).
# Non-numeric columns such as `county` are excluded explicitly: recent pandas
# releases raise when DataFrame.corr() is given string columns.
numeric_train = train.select_dtypes('number')

plt.figure(figsize=(14, 10))
sns.heatmap(numeric_train.corr(), cmap='Greens', annot=True)
plt.title('Correlation Heatmap of All Features Including the Target', fontsize=20)
plt.show()

In [None]:
# Mean log error per bathroom count on the training split.
fig, ax = plt.subplots(figsize=(14, 8))
with sns.color_palette('Blues'):
    sns.barplot(data=train, x='bathroom', y='logerror', ax=ax)
ax.set(
    xlabel='Bathroom Count',
    ylabel='Log Error',
    title='Does bathroom count impact log error?',
)
plt.show()

Homes with fewer bathrooms tend to have a smaller log error.

In [None]:
# Mean log error per bedroom count on the training split.
fig, ax = plt.subplots(figsize=(14, 8))
with sns.color_palette("Blues"):
    sns.barplot(data=train, x='bedroom', y='logerror', ax=ax)
ax.set_xlabel('Bedroom Count')
ax.set_ylabel('Log Error')
ax.set_title('Does bedroom count impact log error?')
plt.show()

Log error increases at higher bedroom counts.

In [None]:
# Re-check the available column names before the next plots.
train.head()

In [None]:
# Log error against finished square footage; points are semi-transparent so
# dense regions stay readable.
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=train, x='finished_sqft', y='logerror', alpha=.4, ax=ax)
ax.set(
    xlabel='Finished Square Feet',
    ylabel='Log Error',
    title='Does sqft of a home impact log error?',
)
plt.show()

In [None]:
# Home value (y) against log error (x) on the training split.
fig, ax = plt.subplots()
ax.scatter(train.logerror, train.home_value)
ax.set_xlabel('Logerror')
ax.set_ylabel('Home Value')
ax.set_title('Logerror vs Home Value')

Does county have an effect on logerror?

In [None]:
# Mean log error per county, drawn from the full wrangled frame.
sns.barplot(data=df, y='logerror', x='county')

In [None]:
from scipy import stats

# Significance level reused by the hypothesis tests in later cells.
alpha = 0.05

# One-way ANOVA: does mean logerror differ between counties?
# The samples are built from whatever labels are actually present in
# `train.county`. Hard-coded names ('Los_Angeles', 'Orange_county',
# 'Ventura_county') do not match the 'LA' / 'Orange' / 'Ventura' labels assigned
# when the column is created, which would leave every sample empty.
# NOTE(review): confirm whether wrangle_zillow rewrites `county`; grouping on the
# data works either way.
county_samples = [grp.logerror for _, grp in train.groupby('county')]
f_county, p_county = stats.f_oneway(*county_samples)

f_county, p_county

In [None]:
# Report the ANOVA decision at the chosen significance level.
anova_verdict = ('We reject the null hypothesis.' if p_county < alpha
                 else 'We fail to reject the null hypothesis.')
print(anova_verdict)

County does affect logerror.

In [None]:
# Re-check the training columns before the next comparisons.
train.head()

In [None]:
# Mean home age per county, drawn from the full wrangled frame.
sns.barplot(data=df, y='age', x='county')

In [None]:
# Pearson correlation test: is bedroom count linearly related to bathroom count?
x = train.bedroom
y = train.bathroom

alternative_hypothesis = 'bedroom count is related to bathroom count'

corr, p = stats.pearsonr(x, y)

if p < alpha:
    print("We reject the null hypothesis")
    print("We can say that", alternative_hypothesis)
else:
    print("We fail to reject the null")

# Keep the statistic as the cell's last expression: a bare `corr, p` in the
# middle of a cell is evaluated but never displayed.
corr, p
In [None]:
# Pearson correlation test: is home value linearly related to logerror?
x = train.home_value
y = train.logerror

alternative_hypothesis = 'house value is related to logerror'

corr, p = stats.pearsonr(x, y)

if p < alpha:
    print("We reject the null hypothesis")
    print("We can say that", alternative_hypothesis)
else:
    print("We fail to reject the null")

# Show both the coefficient and the p-value as the cell output; the mid-cell
# `corr, p` expression was never displayed and only `p` was shown.
corr, p

In [None]:
# Column dtypes and non-null counts of the training split, checked before scaling.
train.info()

In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# Scale the features with MinMaxScaler (default range [0, 1]).
#
# The split cell binds `train` plus the X_/y_ pieces only — there are no
# `validate` or `test` frames — so the scaler works on the X_* feature frames.
# It is fitted on X_train alone and then applied unchanged to validate/test.
# NOTE(review): assumes X_train / X_validate / X_test share the same columns and
# exclude the target — confirm in functions.split.

# MinMaxScaler only accepts numeric input, so string columns (e.g. `county`,
# if still present) are left out.
numeric_cols = X_train.select_dtypes('number').columns

scaler = MinMaxScaler()
scaler.fit(X_train[numeric_cols])

train_scaled = scaler.transform(X_train[numeric_cols])
validate_scaled = scaler.transform(X_validate[numeric_cols])
test_scaled = scaler.transform(X_test[numeric_cols])

train_scaled