In [36]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from datetime import date

import acq
import env

# Exploring
import scipy.stats as stats
from sklearn.model_selection import train_test_split

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

In [2]:
df = acq.get_zillow_data()
df.head(2)

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,261.0,,,,,1727539,,...,0,0.03,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,261.0,,,,,1387261,,...,1,0.06,2017-01-01,,,,,Single Family Residential,,


In [3]:
df = df.sample(5000)
df.head(2)

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
1113,11871192,,,266.0,2.0,,,1.0,2361444,,...,1115,0.01,2017-01-05,Central,,,Central,Condominium,,
75345,14442954,,,261.0,,,,,877590,,...,75379,0.03,2017-09-11,,,,,Single Family Residential,,


In [4]:
df.shape

(5000, 69)

In [5]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .70):
#function that will drop rows or columns based on the percent of values that are missing:\
#handle_missing_values(df, prop_required_column, prop_required_row
    threshold = int(round(prop_required_column*len(df.index),0))
    df = df.dropna(axis=1, thresh=threshold)
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df

In [6]:
df = handle_missing_values(df, prop_required_column = .5, prop_required_row = .70)
df.head()

Unnamed: 0,parcelid,propertylandusetypeid,heatingorsystemtypeid,id,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,id.1,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
1113,11871192,266.0,2.0,2361444,1.0,1.0,8.0,1.0,795.0,795.0,...,282954.0,2016.0,159196.0,3478.4,60372013022000.0,1115,0.01,2017-01-05,Central,Condominium
75345,14442954,261.0,,877590,2.5,3.0,,2.5,1825.0,1825.0,...,845908.0,2016.0,637793.0,9695.64,60590423392002.0,75379,0.03,2017-09-11,,Single Family Residential
10529,14667466,266.0,,788639,1.0,1.0,,1.0,678.0,678.0,...,294422.0,2016.0,218957.0,3732.56,60590756054012.0,10538,-0.04,2017-02-16,,Condominium
26337,11488432,261.0,2.0,1077439,3.0,4.0,9.0,3.0,2891.0,2891.0,...,1293500.0,2016.0,969700.0,15250.74,60376205222002.0,26350,0.01,2017-04-14,Central,Single Family Residential
76037,12682581,266.0,2.0,192882,4.0,4.0,11.0,4.0,2520.0,2520.0,...,870000.0,2016.0,348400.0,9890.2,60376513021011.0,76071,0.06,2017-09-13,Central,Condominium


In [7]:
df.shape

(4998, 35)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 1113 to 41622
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      4998 non-null   int64  
 1   propertylandusetypeid         4998 non-null   float64
 2   heatingorsystemtypeid         3162 non-null   float64
 3   id                            4998 non-null   int64  
 4   bathroomcnt                   4998 non-null   float64
 5   bedroomcnt                    4998 non-null   float64
 6   buildingqualitytypeid         3181 non-null   float64
 7   calculatedbathnbr             4961 non-null   float64
 8   calculatedfinishedsquarefeet  4983 non-null   float64
 9   finishedsquarefeet12          4758 non-null   float64
 10  fips                          4998 non-null   float64
 11  fullbathcnt                   4961 non-null   float64
 12  latitude                      4998 non-null   float64
 13 

In [9]:
 df['heatingorsystemtypeid'] = df.heatingorsystemtypeid.fillna(value = df['heatingorsystemtypeid'].mean())

In [10]:
 df['buildingqualitytypeid'] = df.buildingqualitytypeid.fillna(value = df['buildingqualitytypeid'].mean())

In [11]:
 df['calculatedbathnbr'] = df.calculatedbathnbr.fillna(value = df['calculatedbathnbr'].mean())

In [12]:
 df['calculatedfinishedsquarefeet'] = df.calculatedfinishedsquarefeet.fillna(value = df['calculatedfinishedsquarefeet'].mean())

In [13]:
 df['finishedsquarefeet12'] = df.finishedsquarefeet12.fillna(value = df['finishedsquarefeet12'].mean())

In [14]:
 df['fullbathcnt'] = df.fullbathcnt.fillna(value = df['fullbathcnt'].mean())

In [15]:
 df['lotsizesquarefeet'] = df.lotsizesquarefeet.fillna(value = df['lotsizesquarefeet'].mean())

In [16]:
df = df.drop(columns=['propertyzoningdesc'])

In [17]:
df['regionidcity'] = df.regionidcity.fillna(value = df['regionidcity'].mode())

In [18]:
df['regionidzip'] = df.regionidzip.fillna(value = df['regionidzip'].mode())

In [19]:
df['unitcnt'] = df.unitcnt.fillna(value = df['unitcnt'].mean())

In [20]:
df['yearbuilt'] = df.yearbuilt.fillna(value = df['yearbuilt'].mode())

In [21]:
df['structuretaxvaluedollarcnt'] = df.structuretaxvaluedollarcnt.fillna(value = df['structuretaxvaluedollarcnt'].mean())

In [22]:
df['censustractandblock'] = df.censustractandblock.fillna(value = df['censustractandblock'].mean())

In [23]:
df = df.drop(columns=['heatingorsystemdesc'])

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 1113 to 41622
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      4998 non-null   int64  
 1   propertylandusetypeid         4998 non-null   float64
 2   heatingorsystemtypeid         4998 non-null   float64
 3   id                            4998 non-null   int64  
 4   bathroomcnt                   4998 non-null   float64
 5   bedroomcnt                    4998 non-null   float64
 6   buildingqualitytypeid         4998 non-null   float64
 7   calculatedbathnbr             4998 non-null   float64
 8   calculatedfinishedsquarefeet  4998 non-null   float64
 9   finishedsquarefeet12          4998 non-null   float64
 10  fips                          4998 non-null   float64
 11  fullbathcnt                   4998 non-null   float64
 12  latitude                      4998 non-null   float64
 13 

In [25]:
df = df.drop(columns=['regionidcity', 'regionidcounty', 'regionidzip'])

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 1113 to 41622
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      4998 non-null   int64  
 1   propertylandusetypeid         4998 non-null   float64
 2   heatingorsystemtypeid         4998 non-null   float64
 3   id                            4998 non-null   int64  
 4   bathroomcnt                   4998 non-null   float64
 5   bedroomcnt                    4998 non-null   float64
 6   buildingqualitytypeid         4998 non-null   float64
 7   calculatedbathnbr             4998 non-null   float64
 8   calculatedfinishedsquarefeet  4998 non-null   float64
 9   finishedsquarefeet12          4998 non-null   float64
 10  fips                          4998 non-null   float64
 11  fullbathcnt                   4998 non-null   float64
 12  latitude                      4998 non-null   float64
 13 

In [29]:
df['age'] = date.today().year - df.yearbuilt
df = df.drop(columns=['yearbuilt'])

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 1113 to 41622
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      4998 non-null   int64  
 1   propertylandusetypeid         4998 non-null   float64
 2   heatingorsystemtypeid         4998 non-null   float64
 3   id                            4998 non-null   int64  
 4   bathroomcnt                   4998 non-null   float64
 5   bedroomcnt                    4998 non-null   float64
 6   buildingqualitytypeid         4998 non-null   float64
 7   calculatedbathnbr             4998 non-null   float64
 8   calculatedfinishedsquarefeet  4998 non-null   float64
 9   finishedsquarefeet12          4998 non-null   float64
 10  fips                          4998 non-null   float64
 11  fullbathcnt                   4998 non-null   float64
 12  latitude                      4998 non-null   float64
 13 

In [31]:
df['age'] = df.age.fillna(value = df['age'].mean())

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 1113 to 41622
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      4998 non-null   int64  
 1   propertylandusetypeid         4998 non-null   float64
 2   heatingorsystemtypeid         4998 non-null   float64
 3   id                            4998 non-null   int64  
 4   bathroomcnt                   4998 non-null   float64
 5   bedroomcnt                    4998 non-null   float64
 6   buildingqualitytypeid         4998 non-null   float64
 7   calculatedbathnbr             4998 non-null   float64
 8   calculatedfinishedsquarefeet  4998 non-null   float64
 9   finishedsquarefeet12          4998 non-null   float64
 10  fips                          4998 non-null   float64
 11  fullbathcnt                   4998 non-null   float64
 12  latitude                      4998 non-null   float64
 13 

In [38]:
def train_validate_test_split(df):
    train_validate, test = train_test_split(df, test_size=0.2, random_state=123, stratify=df['logerror'])
    train, validate = train_test_split(train_validate, test_size=0.3, random_state=123, stratify=train_validate['logerror'])
    return train, validate, test

In [39]:
df = train_validate_test_split(df)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

1. Ask at least 5 questions about the data, keeping in mind that your target variable is logerror. e.g. Is logerror significantly different for properties in LA County vs Orange County vs Ventura County?

2. Answer those questions through a mix of statistical tests and visualizations.