In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

#libraries
import pandas as pd
import numpy as np
import seaborn as sns

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import learning_curve

# Display floats right-aligned in a 20-character field, with thousands
# separators and two decimal places (this is why outputs show e.g. 0.03).
pd.options.display.float_format = '{:20,.2f}'.format

#my libraries
import prepare
import acquire
import env
from explore import explore_univariate, exp_bivariate_categorical, exp_bivariate_continuous, exp_multivariate, exp_bivariate_categorical

#library imports
from datetime import date
from sklearn.model_selection import train_test_split

# Statistical Tests
import scipy.stats as stats

In [2]:
# Load the raw frame from the project's acquire module and preview two rows.
# NOTE(review): where get_zillow_data reads from (database vs. cached file) is
# not visible in this notebook — confirm in acquire.py.
df = acquire.get_zillow_data()
df.head(2)

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,261.0,,,,,1727539,,...,0,0.03,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,261.0,,,,,1387261,,...,1,0.06,2017-01-01,,,,,Single Family Residential,,


In [None]:
# (rows, columns) of the frame as acquired, before any wrangling
df.shape

In [3]:
def handle_missing_values(df, prop_required_column = .6, prop_required_row = .75):
    """Drop sparse columns, then sparse rows, and return the trimmed frame.

    Parameters
    ----------
    df : DataFrame to trim. It is not modified; a new frame is returned.
    prop_required_column : fraction of rows that must be non-null for a
        column to be kept.
    prop_required_row : fraction of the *remaining* columns that must be
        non-null for a row to be kept.

    Both fractions are converted to an absolute count with ``round`` before
    being handed to ``dropna(thresh=...)``.
    """
    # Columns first: the row threshold below is based on the surviving columns.
    min_non_null_per_column = int(round(prop_required_column * df.shape[0], 0))
    dense_columns = df.dropna(axis=1, thresh=min_non_null_per_column)

    min_non_null_per_row = int(round(prop_required_row * dense_columns.shape[1], 0))
    return dense_columns.dropna(axis=0, thresh=min_non_null_per_row)

In [4]:
def remove_columns(df, cols_to_remove):
    """Return a new frame without the columns named in ``cols_to_remove``.

    The input frame is left untouched. A name that is not a column of ``df``
    raises ``KeyError`` (pandas' default for ``drop``).
    """
    return df.drop(labels=cols_to_remove, axis='columns')

In [5]:

def wrangle_zillow(df):
    """Filter, trim and impute the acquired Zillow frame; return a null-free frame.

    Steps, in order:
      1. keep rows whose ``propertylandusetypeid`` is one of the single-use codes;
      2. keep rows with bedrooms > 0, bathrooms > 0, more than 350 finished
         square feet, and a unit count that is <= 1 or missing;
      3. drop sparse columns and rows via ``handle_missing_values`` (defaults);
      4. add a ``county`` label derived from ``fips``;
      5. drop identifier / redundant columns;
      6. fill nulls in ``unitcnt``, ``heatingorsystemdesc``,
         ``lotsizesquarefeet`` and ``buildingqualitytypeid``;
      7. keep rows with ``taxvaluedollarcnt`` < 5,000,000 and
         ``calculatedfinishedsquarefeet`` < 8000;
      8. drop every row that still contains a null.

    The input frame is not modified. Column writes go through ``assign``
    rather than ``df.col.fillna(..., inplace=True)``: the in-place form is a
    chained assignment that does not update the frame when pandas
    Copy-on-Write is active.
    """
    # Restrict df to only properties that meet single use criteria
    single_use = [261, 262, 263, 264, 266, 268, 273, 276, 279]
    df = df[df.propertylandusetypeid.isin(single_use)]

    # Restrict df to only those properties with at least 1 bath & bed and 350 sqft area
    has_rooms = (df.bedroomcnt > 0) & (df.bathroomcnt > 0)
    single_unit = (df.unitcnt <= 1) | df.unitcnt.isnull()
    big_enough = df.calculatedfinishedsquarefeet > 350
    df = df[has_rooms & single_unit & big_enough]

    # Handle missing values i.e. drop columns and rows based on a threshold
    df = handle_missing_values(df)

    # Add column for counties. Any fips other than the two listed (including a
    # missing one) is labelled 'Ventura', as in the original lambda.
    county_by_fips = {6037: 'Los Angeles', 6059: 'Orange'}
    df = df.assign(
        county=df['fips'].apply(lambda fips: county_by_fips.get(fips, 'Ventura'))
    )

    # drop unnecessary columns
    dropcols = ['parcelid',
         'calculatedbathnbr',
         'finishedsquarefeet12',
         'fullbathcnt',
         'heatingorsystemtypeid',
         'propertycountylandusecode',
         'propertylandusetypeid',
         'propertyzoningdesc',
         'censustractandblock',
         'propertylandusedesc']

    df = remove_columns(df, dropcols)

    # Impute nulls:
    #  - unitcnt: null is treated as a single unit
    #  - heatingorsystemdesc: assume that since this is Southern CA, null means 'None'
    #  - lotsizesquarefeet / buildingqualitytypeid: hard-coded fill values
    #    NOTE(review): the original comment calls 7313 and 6.0 medians; they are
    #    not computed here — confirm they still match the data.
    df = df.assign(
        unitcnt=df['unitcnt'].fillna(1),
        heatingorsystemdesc=df['heatingorsystemdesc'].fillna('None'),
        lotsizesquarefeet=df['lotsizesquarefeet'].fillna(7313),
        buildingqualitytypeid=df['buildingqualitytypeid'].fillna(6.0),
    )

    # Trim extreme values (rows with a null in either column are dropped too,
    # because the comparison is False for NaN)
    df = df[df.taxvaluedollarcnt < 5_000_000]
    df = df[df.calculatedfinishedsquarefeet < 8000]

    # Just to be sure we caught all nulls, drop them here
    df = df.dropna()

    return df

In [6]:
# Run the wrangling pipeline and rebind df to the result, then preview it.
# Re-running this cell alone feeds the already-wrangled frame back in.
df = wrangle_zillow(df)
df.head()

Unnamed: 0,id,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,rawcensustractandblock,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,id.1,logerror,transactiondate,heatingorsystemdesc,county
3,2288172,3.0,4.0,8.0,2376.0,6037.0,34245180.0,-118240722.0,13038.0,60373001.0,...,108918.0,145143.0,2016.0,36225.0,1777.51,3,-0.1,2017-01-01,Central,Los Angeles
4,1970746,3.0,3.0,8.0,1312.0,6037.0,34185120.0,-118414640.0,278581.0,60371236.01,...,73681.0,119407.0,2016.0,45726.0,1533.89,4,0.01,2017-01-01,Central,Los Angeles
6,781532,3.0,4.0,9.0,2962.0,6037.0,34145202.0,-118179824.0,63000.0,60374608.0,...,276684.0,773303.0,2016.0,496619.0,9516.26,6,-0.0,2017-01-01,Central,Los Angeles
8,1246926,3.0,4.0,9.0,3039.0,6037.0,33960230.0,-118006914.0,20028.0,60375002.02,...,177527.0,220583.0,2016.0,43056.0,3104.19,8,-0.04,2017-01-02,Central,Los Angeles
9,1585097,3.0,2.0,8.0,1290.0,6037.0,33998800.0,-118416000.0,54048.0,60372751.02,...,151303.0,371361.0,2016.0,220058.0,4557.32,9,-0.04,2017-01-02,Central,Los Angeles


In [7]:
# (rows, columns) after wrangling
df.shape

(41994, 26)

In [8]:
# Column dtypes and non-null counts after wrangling
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41994 entries, 3 to 77578
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            41994 non-null  int64  
 1   bathroomcnt                   41994 non-null  float64
 2   bedroomcnt                    41994 non-null  float64
 3   buildingqualitytypeid         41994 non-null  float64
 4   calculatedfinishedsquarefeet  41994 non-null  float64
 5   fips                          41994 non-null  float64
 6   latitude                      41994 non-null  float64
 7   longitude                     41994 non-null  float64
 8   lotsizesquarefeet             41994 non-null  float64
 9   rawcensustractandblock        41994 non-null  float64
 10  regionidcity                  41994 non-null  float64
 11  regionidcounty                41994 non-null  float64
 12  regionidzip                   41994 non-null  float64
 13  r

In [None]:
# Apply the project's prepare.clean_data step (defined outside this notebook).
# NOTE(review): later cells use columns such as 'bathrooms', 'tax_value' and
# 'log_error' — presumably created/renamed here; confirm in prepare.py.
df = prepare.clean_data(df)
df.head(2)

In [None]:
# Column dtypes and non-null counts after prepare.clean_data
df.info()

In [None]:
# Project helper; presumably draws histograms of df's columns — see prepare.py
prepare.get_hist(df)

In [None]:
# Project helper; presumably draws box plots of df's columns — see prepare.py
prepare.get_box(df)

In [None]:
# Columns screened for outliers by the project's prepare.remove_outliers helper.
# NOTE(review): k is presumably an IQR multiplier — confirm in prepare.py.
outlier_columns = [
    'bathrooms',
    'bedrooms',
    'tax_value',
    'tax_rate',
    'square_feet',
    'age',
    'log_error',
]
df = prepare.remove_outliers(df, k=1.5, col_list=outlier_columns)
df.head()

In [None]:
# Same helper as before, now on the frame returned by prepare.remove_outliers
prepare.get_hist(df)

In [None]:
# Same helper as before, now on the frame returned by prepare.remove_outliers
prepare.get_box(df)

In [None]:
# Split into three frames using the project helper.
# NOTE(review): split proportions and random seed live in prepare.py — confirm
# the split is seeded so results are reproducible.
train, validate, test = prepare.train_validate_test_split(df)

In [None]:
# Report the number of rows in each split. DataFrame.size would return
# rows x columns (total cells), which is not the observation count.
print("train observations: ", len(train))
print("validate observations: ", len(validate))
print("test observations: ", len(test))

# Explore
- We are not going to explore the scaled data at this time, but it is important that the data is scaled before moving into clustering.

- Target Variable: 'log_error'

In [None]:
# Distribution of every column in train, one small histogram per column
for column_name in train.columns:
    fig, ax = plt.subplots(figsize=(4, 2))
    ax.hist(train[column_name])
    ax.set_title(column_name)
    plt.show()

##### Takeaways
- right skewed tax_value, square_feet, and tax_rate
- bit of a left skew on age
- log_error normally distributed

In [None]:
# Does log error differ across counties?
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='county_code', y='log_error', data=train)
plt.title("Is there a difference in log_error across counties")
plt.show()

In [None]:
# Is there a relationship between log_error and bathrooms?
# The question is printed rather than set as a title because jointplot draws a
# multi-axes grid (joint plot plus marginals).
print("Is there a relationship between log_error and bathrooms?")
sns.jointplot(x = 'bathrooms', y = 'log_error', data=train)
# Labels are applied through pyplot, i.e. to whichever axes is current after
# jointplot returns.
plt.xlabel("Bathrooms")
plt.ylabel("Log Error")
plt.show()

In [None]:
# Is there a relationship between log_error and bedrooms?
# Scatter of log_error (y) against bedrooms (x) on the train split
plt.scatter(train.bedrooms, train.log_error)
plt.xlabel("bedrooms")
plt.ylabel("Log Error")
plt.title("Is there a relationship between log_error and bedrooms?")
plt.show()

In [None]:
# Is there a relationship between log_error and square feet?
# Scatter of log_error (y) against square_feet (x) on the train split
plt.scatter(train.square_feet, train.log_error)
plt.xlabel("Square Footage")
plt.ylabel("Log Error")
# Title previously said "bedrooms" (copy-paste from the cell above)
plt.title("Is there a relationship between log_error and square feet?")
plt.show()

In [None]:
# Is there a relationship between log_error and tax_value?
# Scatter of log_error (y) against tax_value (x) on the train split
plt.scatter(train.tax_value, train.log_error)
plt.xlabel("Home Value")
plt.ylabel("Log Error")
# Title previously said "bedrooms" (copy-paste from an earlier cell)
plt.title("Is there a relationship between log_error and tax_value?")
plt.show()

In [None]:
# Bin log_error into 4 equal-width intervals so it can be used as a
# categorical grouping/hue variable in the plots below.
# Note: this adds a column to train only; validate and test are untouched.
train['le_bin'] = pd.cut(train.log_error, 4)

train.head()

In [None]:
# Box plot of tax_value within each log_error bin.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='le_bin', y='tax_value', data=train)
plt.title("tax_value by log_error bin")
plt.show()

In [None]:
# Box plot of square_feet within each log_error bin.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='le_bin', y='square_feet', data=train)
plt.title("square_feet by log_error bin")
plt.show()
        

In [None]:
# Box plot of age within each log_error bin.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='le_bin', y='age', data=train)
plt.title("age by log_error bin")
plt.show()

In [None]:
# Pairwise scatter/density grid of train, coloured by log_error bin.
# The caption previously described a "younger vs older (cutoff=40)" split,
# which this plot does not use; the hue is le_bin.
print("Interaction of variables, colored by log_error bin (le_bin)")
sns.pairplot(train, hue='le_bin')
plt.show()