In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# houses my function to connect to Codeup DB
import wrangle as wr
import explore as ex

Load in successful, awaiting commands...


**Exercises:**

- Only include properties with a transaction in 2017
    - include only the last transaction for each property 
    - zestimate error
    - date of transaction
- Only include properties that have a latitude and longitude value

**Domain Notes:**

1. Are there requirements for bedrooms for single family units?
* Yes, there is a minimum amount of bedrooms required for single family residential in Los Angeles County. According to the Los Angeles County Building Code, a single family dwelling must have at least one habitable room that is at least 120 square feet in area and has a minimum dimension of 7 feet in any direction. Additionally, the dwelling must have at least one bedroom for every two occupants, with a minimum of one bedroom per dwelling unit. 
    * Specifically, the requirements for habitable rooms and bedrooms can be found in Section 1208.4 and Section 1208.5, respectively. https://dpw.lacounty.gov/bsd/building-code/
    
2. Are there requirements for bathrooms for single family units?
* According to the Los Angeles County Building Code, a single family dwelling must have at least one bathroom that includes a toilet, sink, and bathtub or shower.
    * Specifically, the requirements for bathrooms can be found in Section 1208.2. Los angeles county adopted the california plumbing code requirements.
        * LA County: https://dpw.lacounty.gov/bsd/building-code/
        * CA Plumbing Code: 
    
3. Are there lot size requirements for a single family home?
* The lot size range is 1,500 - 5,000 square feet
    * https://planning.lacounty.gov/zoning-ordinance/
    
  

# Data Acquisition

**Acquire:**

* Data acquired from mySQL Codeup Server using env.py credentials
* It contained 52,442 rows and 7 columns before cleaning/encoding
* Each row represents a property
* Each column represents a feature


In [2]:
# set query to SQL using domain knowledge notes from above
query = """SELECT *
FROM properties_2017
JOIN predictions_2017 using (parcelid)
LEFT JOIN airconditioningtype using (airconditioningtypeid)
LEFT JOIN architecturalstyletype using (architecturalstyletypeid)
LEFT JOIN buildingclasstype using (buildingclasstypeid)
LEFT JOIN heatingorsystemtype using (heatingorsystemtypeid)
LEFT JOIN storytype using (storytypeid)
LEFT JOIN typeconstructiontype using (typeconstructiontypeid)
WHERE propertylandusetypeid like 260 or 261 or 262 or 263 or 264 or 265 or 266 or 268 or 269 or 270 or 275 or 276 or 279
and lotsizesquarefeet >= 1500
and bathroomcnt > 1
and bedroomcnt > 1
ORDER BY parcelid;"""

In [3]:
# acquire
df = wr.get_data('zillow', query)

CSV file found and loaded


In [4]:
# look at the data
df.head()

Unnamed: 0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,bathroomcnt,...,censustractandblock,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
0,,,2.0,,,,10711855,1087254,,2.0,...,60371130000000.0,55006,-0.007357,2017-07-07,,,,Central,,
1,,,2.0,,,1.0,10711877,1072280,,2.0,...,60371130000000.0,71382,0.021066,2017-08-29,Central,,,Central,,
2,,,2.0,,,1.0,10711888,1340933,,2.0,...,60371130000000.0,23209,0.077174,2017-04-04,Central,,,Central,,
3,,,2.0,,,,10711910,1878109,,2.0,...,60371130000000.0,18017,-0.041238,2017-03-17,,,,Central,,
4,,,2.0,,,,10711923,2190858,,2.0,...,60371130000000.0,20378,-0.009496,2017-03-24,,,,Central,,


In [5]:
# checking types and nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77614 entries, 0 to 77613
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   typeconstructiontypeid        223 non-null    float64
 1   storytypeid                   50 non-null     float64
 2   heatingorsystemtypeid         49572 non-null  float64
 3   buildingclasstypeid           15 non-null     float64
 4   architecturalstyletypeid      207 non-null    float64
 5   airconditioningtypeid         25007 non-null  float64
 6   parcelid                      77614 non-null  int64  
 7   id                            77614 non-null  int64  
 8   basementsqft                  50 non-null     float64
 9   bathroomcnt                   77580 non-null  float64
 10  bedroomcnt                    77580 non-null  float64
 11  buildingqualitytypeid         49810 non-null  float64
 12  calculatedbathnbr             76964 non-null  float64
 13  d

In [6]:
# shape of df
df.shape

(77614, 68)

In [7]:
# numerical descriptive stats transposed to see all columns
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
typeconstructiontypeid,223.0,6.040359,0.5560348,4.0,6.0,6.0,6.0,13.0
storytypeid,50.0,7.0,0.0,7.0,7.0,7.0,7.0,7.0
heatingorsystemtypeid,49572.0,3.921811,3.59477,1.0,2.0,2.0,7.0,24.0
buildingclasstypeid,15.0,3.933333,0.2581989,3.0,4.0,4.0,4.0,4.0
architecturalstyletypeid,207.0,7.386473,2.72803,2.0,7.0,7.0,7.0,21.0
airconditioningtypeid,25007.0,1.812013,2.965768,1.0,1.0,1.0,1.0,13.0
parcelid,77614.0,13007810.0,3518694.0,10711860.0,11538210.0,12530060.0,14211000.0,167689300.0
id,77614.0,1496056.0,861344.8,349.0,752595.2,1499186.0,2242084.0,2985182.0
basementsqft,50.0,679.72,689.7035,38.0,273.0,515.0,796.5,3560.0
bathroomcnt,77580.0,2.298492,0.9967259,0.0,2.0,2.0,3.0,18.0


**Acquire Notes:**

Wow. What a mess. 

* Remove buildingclasstypeid, finishedsquarefeet13, finishedsquarefeet15, buildingclassdesc off the bat because these columns contain no information at all.


**Exercise:**
- Only include properties with a transaction in 2017
    - include only the last transaction for each property
    - zestimate error
    - date of transaction
- Only include properties that have a latitude and longitude value

# Data Preperation

**Prepare:**
- Remove columns:
    * with no data
    * 'basementsqft'
    * 'typeconstructiontypeid',
    * 'storytypeid',
    * 'heatingorsystemtypeid',
    * 'architecturalstyletypeid',
    * 'airconditioningtypeid',
    
    
- Remove transactions from 2018
- Check duplicates
- Zestimate Error
- Date not null
- Lat and Long are populated

**Impute These Columns:**

'buildingqualitytypeid',
'calculatedbathnbr',
'decktypeid',
'finishedfloor1squarefeet',
'calculatedfinishedsquarefeet',
'finishedsquarefeet12',
'finishedsquarefeet50',
'finishedsquarefeet6',
'fips',
'fireplacecnt',
'fullbathcnt',
'garagecarcnt',
'garagetotalsqft',
'hashottuborspa',
'latitude',
'longitude',
'lotsizesquarefeet',
'poolcnt',
'poolsizesum',
'pooltypeid10',
'pooltypeid2',
'pooltypeid7',
'propertycountylandusecode',
'propertylandusetypeid',
'propertyzoningdesc',
'rawcensustractandblock',
'regionidcity',
'regionidcounty',
'regionidneighborhood',
'regionidzip',
'roomcnt',
'threequarterbathnbr',
'unitcnt',
'yardbuildingsqft17',
'yardbuildingsqft26',
'yearbuilt',
'numberofstories',
'fireplaceflag',
'structuretaxvaluedollarcnt',
'taxvaluedollarcnt',
'assessmentyear',
'landtaxvaluedollarcnt',
'taxamount',
'taxdelinquencyflag',
'taxdelinquencyyear',
'censustractandblock',
'id',
'logerror',
'transactiondate',
'airconditioningdesc',
'architecturalstyledesc',
'heatingorsystemdesc',
'storydesc',
'typeconstructiondesc'


In [8]:
# remove columns with no data
df.drop(columns=['buildingclasstypeid','finishedsquarefeet13','finishedsquarefeet15','buildingclassdesc'], inplace=True)
df.head()

Unnamed: 0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,bathroomcnt,bedroomcnt,...,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,heatingorsystemdesc,storydesc,typeconstructiondesc
0,,,2.0,,,10711855,1087254,,2.0,3.0,...,,60371130000000.0,55006,-0.007357,2017-07-07,,,Central,,
1,,,2.0,,1.0,10711877,1072280,,2.0,4.0,...,,60371130000000.0,71382,0.021066,2017-08-29,Central,,Central,,
2,,,2.0,,1.0,10711888,1340933,,2.0,4.0,...,,60371130000000.0,23209,0.077174,2017-04-04,Central,,Central,,
3,,,2.0,,,10711910,1878109,,2.0,3.0,...,,60371130000000.0,18017,-0.041238,2017-03-17,,,Central,,
4,,,2.0,,,10711923,2190858,,2.0,4.0,...,,60371130000000.0,20378,-0.009496,2017-03-24,,,Central,,


In [9]:
# remove transactions from 2018 - there are none
df[df.transactiondate == '%2018%']

Unnamed: 0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,bathroomcnt,bedroomcnt,...,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,heatingorsystemdesc,storydesc,typeconstructiondesc


In [10]:
# check duplicates for parcelid - there are 122!
df[df.parcelid.duplicated()]

Unnamed: 0,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,bathroomcnt,bedroomcnt,...,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,heatingorsystemdesc,storydesc,typeconstructiondesc
294,,,2.0,,,10722858,16179,,2.0,4.0,...,,6.037135e+13,14034,-0.172843,2017-07-28,,,Central,,
540,,,2.0,,,10732347,1836115,,2.0,4.0,...,,6.037137e+13,13914,-0.221145,2017-07-25,,,Central,,
722,,,2.0,,1.0,10739478,2119208,,3.0,4.0,...,,6.037800e+13,2905,-0.262967,2017-03-31,Central,,Central,,
844,,,2.0,,,10744507,1836165,,3.0,4.0,...,,6.037800e+13,28611,-0.050062,2017-08-31,,,Central,,
1088,,,2.0,,1.0,10753427,1403445,,2.0,2.0,...,,6.037800e+13,3540,-0.145781,2017-03-17,Central,,Central,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76227,,,,,,17251843,1731266,,2.0,2.0,...,,6.111008e+13,29466,-2.922919,2017-06-22,,,,,
76946,,,,,,17280166,2595690,,3.0,4.0,...,,6.111007e+13,38851,-0.116808,2017-06-15,,,,,
77004,,,,,,17282392,2938730,,2.0,3.0,...,,6.111007e+13,64278,0.901074,2017-08-25,,,,,
77418,,,,,,17295416,2506407,,2.0,2.0,...,,6.111006e+13,4530,-0.189044,2017-05-16,,,,,


In [11]:
# keeping the last duplicates for parcelid
df[df.parcelid.duplicated()].sort_values(by='parcelid')
df.drop_duplicates(subset=['parcelid'], keep='last', inplace=True)

In [12]:
# verifying drop of dupes
df.parcelid.duplicated().sum()

0

In [13]:
# look at shape
df.shape

(77414, 64)

In [14]:
# check zestimate error has values
df.logerror.isnull().sum()

0

In [15]:
# check that transaction date has values
df.transactiondate.isnull().sum()

0

In [16]:
# check that lat/long has values - 33 nulls
df.longitude.isnull().sum()

33

In [17]:
# longitude nulls - drop all
df.longitude.isnull().sum()

33

In [18]:
# use the function to look at nulls per row 

# set variables first

num_missing = df.isnull().sum(axis=1)
pct_miss = ((num_missing / df.shape[1]) * 100).round(0)

In [19]:
# making it readable by using a dataframe
rows_missing = pd.DataFrame({'parcelid':df.parcelid,'num_cols_missing': num_missing, 'percent_cols_missing': pct_miss})

In [20]:
# look at the info
rows_missing.sort_values(by="num_cols_missing", ascending=False)

Unnamed: 0,parcelid,num_cols_missing,percent_cols_missing
28369,12006414,59,92.0
18797,11510663,59,92.0
26460,11905737,59,92.0
26461,11905738,59,92.0
29212,12038488,59,92.0
...,...,...,...
63838,14466342,20,31.0
56263,14128839,20,31.0
57754,14193288,20,31.0
58315,14215623,19,30.0


In [21]:
# make new features for outdoors then drop those columns
df['outdoor_features'] = np.where(((df.decktypeid.notna()) | (df.hashottuborspa.notna())\
    | (df.poolcnt.notna()) | (df.pooltypeid10.notna()) | (df.pooltypeid2.notna())\
    | (df.pooltypeid7.notna()) | (df.garagecarcnt.notna()) | ((df.garagetotalsqft > 0)\
    & (df.garagetotalsqft < 1000) | (df.yardbuildingsqft17.notna())\
    | (df.yardbuildingsqft26.notna()))), 1, 0)

In [22]:
df.outdoor_features.value_counts(ascending=False)

0    40199
1    37215
Name: outdoor_features, dtype: int64

In [23]:
# drop the columns used to make the outdoor_features column
df = df.drop(columns={'decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10',\
            'pooltypeid2','pooltypeid7', 'garagecarcnt','garagetotalsqft',\
            'yardbuildingsqft17', 'yardbuildingsqft26'})

In [24]:
# drop all relative columns to the target variable of assessed worth
df = df.drop(columns={'structuretaxvaluedollarcnt',
 'assessmentyear',
 'landtaxvaluedollarcnt',
 'taxamount',
 'taxdelinquencyflag',
 'taxdelinquencyyear'})

In [25]:
# 7 indicates a basement
df.storytypeid.notna().sum() 

50

In [26]:
# matches with the basement sqft, can combine into one column, has basement
df.basementsqft.notna().sum()

50

In [27]:
# create new column for has basement
df['basement'] = np.where(((df.basementsqft.notna()) & (df.storytypeid.notna()))\
    | (df.basementsqft.notna()) | (df.storytypeid.notna()), 1, 0)

In [28]:
# drop the columns used to make basement column
df = df.drop(columns={'basementsqft','storytypeid','basementsqft','storytypeid','storydesc'})

In [29]:
# create new column for heat 
df['heat'] = np.where((df.heatingorsystemtypeid.notna()) \
                & (df.heatingorsystemdesc != 'None'), 1, 0)

In [30]:
# delete the columns used for heat column
df = df.drop(columns={'heatingorsystemtypeid','heatingorsystemdesc'})

In [31]:
# create new column for AC
df['ac'] = np.where((df.airconditioningdesc != 'None')\
                    & (df.airconditioningtypeid.notna()), 1, 0)

In [32]:
# drop columns used to make AC column
df = df.drop(columns={'airconditioningdesc','airconditioningtypeid'})

In [33]:
# make new column for indoor features
df['indoor_features'] = np.where((df.fireplacecnt > 0) |(df.fireplaceflag.notna()), 1, 0)

In [34]:
# delete columns used to make indoor features column
df = df.drop(columns={'fireplacecnt', 'fireplaceflag'})

In [35]:
# defines the BEST home quality
df.buildingqualitytypeid.min()

1.0

In [36]:
# defines the WORST home quality
df.buildingqualitytypeid.max()

12.0

In [37]:
# make into a new column called quality
df['quality'] = (np.arange(0, len(df)))

In [38]:
# fill new quality column with values
df['quality'] = df['buildingqualitytypeid'].apply(ex.assign_quality)

In [39]:
# drop columns used to create new column
df = df.drop(columns={'buildingqualitytypeid'})

In [40]:
# make new column for stories (one story or greater)
df['one_story'] = np.where((df.numberofstories == 1) |\
                           (df.numberofstories.isnull() == True), 1, 0)

In [41]:
# drop the column used to make new column 
df = df.drop(columns={'numberofstories'})

In [46]:
# # mass removal of columns
# def remove_columns(df, cols_to_remove):
#     """
#     This function will:
#     - take in a df and list of columns
#     - drop the listed columns
#     - return the new df
#     """
#     df = df.drop(columns=cols_to_remove)
#     return df

In [43]:
# these columns are being dropped for various reasons: contain no info, are duplicates, made
# redundant by new features.
df = df.drop(columns={'typeconstructiondesc','architecturalstyletypeid','architecturalstyledesc',\
            'id','finishedsquarefeet12','finishedsquarefeet50','finishedsquarefeet6',\
                'poolsizesum','roomcnt', 'finishedfloor1squarefeet', 'typeconstructiontypeid',\
                'propertylandusetypeid', 'propertycountylandusecode',\
                'propertyzoningdesc'})

In [44]:
# these are the columns left after initial clean up. Now to fill the nulls OR drop them
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77414 entries, 0 to 77613
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      77414 non-null  int64  
 1   bathroomcnt                   77381 non-null  float64
 2   bedroomcnt                    77381 non-null  float64
 3   calculatedbathnbr             76772 non-null  float64
 4   calculatedfinishedsquarefeet  77185 non-null  float64
 5   fips                          77381 non-null  float64
 6   fullbathcnt                   76772 non-null  float64
 7   latitude                      77381 non-null  float64
 8   longitude                     77381 non-null  float64
 9   lotsizesquarefeet             69142 non-null  float64
 10  rawcensustractandblock        77381 non-null  float64
 11  regionidcity                  75910 non-null  float64
 12  regionidcounty                77381 non-null  float64
 13  r

In [48]:
# handles large df with defaults to remove if over threshold
def handle_missing_values(df, prop_required_columns=0.5, prop_required_rows=0.75):
    """
    This function will:
    - take in: 
        - a dataframe
        - column threshold (defaulted to 0.5)
        - row threshold (defaulted to 0.75)
    - calculates the minimum number of non-missing values required for each column/row to be retained
    - drops columns/rows with a high proportion of missing values.
    - returns the new df
    """
    
    column_threshold = int(round(prop_required_columns * len(df.index), 0))
    df = df.dropna(axis=1, thresh=column_threshold)
    
    row_threshold = int(round(prop_required_rows * len(df.columns), 0))
    df = df.dropna(axis=0, thresh=row_threshold)
    
    return df

In [52]:
df_clean = handle_missing_values(df)

In [53]:
df_clean.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,fullbathcnt,latitude,longitude,lotsizesquarefeet,...,id.1,logerror,transactiondate,outdoor_features,basement,heat,ac,indoor_features,quality,one_story
0,10711855,2.0,3.0,2.0,2107.0,6037.0,2.0,34222559.0,-118617387.0,9158.0,...,55006,-0.007357,2017-07-07,1,0,1,0,0,medium,1
1,10711877,2.0,4.0,2.0,1882.0,6037.0,2.0,34220261.0,-118616409.0,9035.0,...,71382,0.021066,2017-08-29,1,0,1,1,0,medium,1
2,10711888,2.0,4.0,2.0,1882.0,6037.0,2.0,34222491.0,-118616854.0,9800.0,...,23209,0.077174,2017-04-04,0,0,1,1,0,medium,1
3,10711910,2.0,3.0,2.0,1477.0,6037.0,2.0,34221864.0,-118615739.0,11285.0,...,18017,-0.041238,2017-03-17,1,0,1,0,0,medium,1
4,10711923,2.0,4.0,2.0,1918.0,6037.0,2.0,34220619.0,-118615253.0,11239.0,...,20378,-0.009496,2017-03-24,1,0,1,0,0,medium,1


In [54]:
# # not used here but will use later
# def data_prep(df, col_to_remove=[], prop_required_columns=0.5, prop_required_rows=0.75):
#     """
#     This function will:
#     - take in: 
#         - a dataframe
#         - list of columns
#         - column threshold (defaulted to 0.5)
#         - row threshold (defaulted to 0.75)
#     - removes unwanted columns
#     - remove rows and columns that contain a high proportion of missing values
#     - returns cleaned df
#     """
#     df = remove_columns(df, col_to_remove)
#     df = handle_missing_values(df, prop_required_columns, prop_required_rows)
#     return df