In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Acquire data from mySQL using the python module to connect and query. You will want to end with a single dataframe. Make sure to include: the logerror, all fields related to the properties that are available. You will end up using all the tables in the database.

- Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.
- Only include properties with a transaction in 2017, and include only the last transaction for each property (so no duplicate property ID's), along with zestimate error and date of transaction.
- Only include properties that include a latitude and longitude value.

In [2]:
# import env file for hostname, username, password, and db_name
from env import host, user, password, db_name

In [3]:
# Pass env file authentication to container 'url'
url = f'mysql+pymysql://{user}:{password}@{host}/{db_name}'

In [4]:
# define sql search for all records from all tables in Zillow database
sql = """
SELECT prop.*,
       pred.logerror, 
       pred.transactiondate, 
       air.airconditioningdesc, 
       arch.architecturalstyledesc, 
       build.buildingclassdesc, 
       heat.heatingorsystemdesc, 
       landuse.propertylandusedesc, 
       story.storydesc, 
       construct.typeconstructiondesc 
FROM   properties_2017 prop  
JOIN (SELECT parcelid,
				  logerror,
				  Max(transactiondate) transactiondate 
		   FROM   predictions_2017 
		   GROUP  BY parcelid, logerror) pred
	   USING (parcelid)
JOIN propertylandusetype USING (propertylandusetypeid)
LEFT JOIN airconditioningtype air USING (airconditioningtypeid) 
LEFT JOIN architecturalstyletype arch USING (architecturalstyletypeid) 
LEFT JOIN buildingclasstype build USING (buildingclasstypeid) 
LEFT JOIN heatingorsystemtype heat USING (heatingorsystemtypeid) 
LEFT JOIN propertylandusetype landuse USING (propertylandusetypeid) 
LEFT JOIN storytype story USING (storytypeid) 
LEFT JOIN typeconstructiontype construct USING (typeconstructiontypeid) 
WHERE  prop.latitude IS NOT NULL 
AND prop.longitude IS NOT NULL
AND transactiondate < '2018-01-01' 
AND propertylandusetypeid = 261 
"""

In [5]:
# load zillow data from saved csv or pull from sql server and save to csv
import os
file = 'zillow_data.csv'
if os.path.isfile(file):
    df = pd.read_csv(file, index_col=0)
else:
    df = pd.read_sql(sql,url)
    df.to_csv(file)

In [6]:
df.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,censustractandblock,propertylandusedesc,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
0,14297519,,,,,,,261.0,1727539,,...,60590630000000.0,Single Family Residential,0.025595,2017-01-01,,,,,,
1,17052889,,,,,,,261.0,1387261,,...,61110010000000.0,Single Family Residential,0.055619,2017-01-01,,,,,,
2,14186244,,,,,,,261.0,11677,,...,60590220000000.0,Single Family Residential,0.005383,2017-01-01,,,,,,
3,12177905,,,2.0,,,,261.0,2288172,,...,60373000000000.0,Single Family Residential,-0.10341,2017-01-01,,,,Central,,
4,12095076,,,2.0,,,1.0,261.0,781532,,...,60374610000000.0,Single Family Residential,-0.001011,2017-01-01,Central,,,Central,,


In [7]:
# df shape
df.shape

(52438, 68)

In [8]:
df.parcelid.duplicated().sum()

118

In [9]:
# Sort df by parcelid and then by transaction date to group by parcelid and to make sure transaction date is last
df = df.sort_values(by=['parcelid','transactiondate'])
df.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,censustractandblock,propertylandusedesc,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
37223,10711855,,,2.0,,,,261.0,1087254,,...,60371130000000.0,Single Family Residential,-0.007357,2017-07-07,,,,Central,,
48246,10711877,,,2.0,,,1.0,261.0,1072280,,...,60371130000000.0,Single Family Residential,0.021066,2017-08-29,Central,,,Central,,
15536,10711888,,,2.0,,,1.0,261.0,1340933,,...,60371130000000.0,Single Family Residential,0.077174,2017-04-04,Central,,,Central,,
12106,10711910,,,2.0,,,,261.0,1878109,,...,60371130000000.0,Single Family Residential,-0.041238,2017-03-17,,,,Central,,
13666,10711923,,,2.0,,,,261.0,2190858,,...,60371130000000.0,Single Family Residential,-0.009496,2017-03-24,,,,Central,,


In [10]:
df[df.duplicated(subset='parcelid', keep=False)].head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,censustractandblock,propertylandusedesc,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
9384,10722858,,,2.0,,,,261.0,16179,,...,60371350000000.0,Single Family Residential,0.095171,2017-03-02,,,,Central,,
9385,10722858,,,2.0,,,,261.0,16179,,...,60371350000000.0,Single Family Residential,-0.172843,2017-07-28,,,,Central,,
9306,10732347,,,2.0,,,,261.0,1836115,,...,60371370000000.0,Single Family Residential,0.077198,2017-03-01,,,,Central,,
9307,10732347,,,2.0,,,,261.0,1836115,,...,60371370000000.0,Single Family Residential,-0.221145,2017-07-25,,,,Central,,
1940,10739478,,,2.0,,,1.0,261.0,2119208,,...,60378000000000.0,Single Family Residential,0.08328,2017-01-13,Central,,,Central,,


In [11]:
df = df.drop_duplicates(subset='parcelid', keep='last')

In [12]:
df.parcelid.duplicated().sum()

0

In [13]:
df.shape

(52320, 68)

In [14]:
# Replace blank values with NaN
df = df.replace('',np.nan)

In [15]:
# check if long/lat columns have nulls
df.longitude.isnull().sum(), df.latitude.isnull().sum()

(0, 0)

***

### 2. Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)

In [16]:
# Describe without scientific notation
df.describe().apply(lambda s: s.apply(lambda x: format(x, 'g')))

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,logerror,buildingclassdesc
count,52320.0,76.0,47,33850.0,0.0,70.0,13615.0,52320,52320.0,47.0,...,81,52238.0,52319.0,52320,52319.0,52316.0,2068.0,52199.0,52320.0,0.0
mean,12996800.0,5.97368,7,3.96561,,7.1,2.43959,261,1496910.0,678.979,...,1,196636.0,529824.0,2016,333492.0,6454.74,14.1011,60502400000000.0,0.0181378,
std,3350920.0,0.229416,0,2.56266,,2.66567,3.84793,0,859433.0,711.825,...,0,254286.0,751830.0,0,570511.0,8752.48,2.4004,1861130000000.0,0.176903,
min,10711900.0,4.0,7,1.0,,2.0,1.0,261,349.0,38.0,...,1,129.0,1000.0,2016,161.0,49.18,4.0,60371000000000.0,-4.65542,
25%,11510200.0,6.0,7,2.0,,7.0,1.0,261,757614.0,263.5,...,1,77159.0,194033.0,2016,76194.0,2660.98,14.0,60374000000000.0,-0.0247003,
50%,12578300.0,6.0,7,2.0,,7.0,1.0,261,1500130.0,512.0,...,1,131905.0,374006.0,2016,218079.0,4650.57,15.0,60376200000000.0,0.00694008,
75%,14130400.0,6.0,7,7.0,,7.0,1.0,261,2241330.0,809.5,...,1,226453.0,619354.0,2016,408777.0,7379.27,15.0,60590400000000.0,0.0406021,
max,167688000.0,6.0,7,24.0,,21.0,13.0,261,2982270.0,3560.0,...,1,9164900.0,49061200.0,2016,48952200.0,586639.0,99.0,483030000000000.0,5.263,


In [17]:
df.shape

(52320, 68)

In [18]:
# df.hist()
# plt.tight_layout();

In [19]:
# for col in df.columns:
#     df[col].hist()

In [20]:
# for col in df.columns:
#     print(f'_-------------- {col} ---------------')
#     print(df[col].value_counts())

In [21]:
#df.info(verbose=True)

***

### 3. Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an atttribute name, the first column is the number of rows with missing values for that attribute, and the second column is percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [22]:
# show nulls per column
df.isna().sum()

parcelid                      0
typeconstructiontypeid    52244
storytypeid               52273
heatingorsystemtypeid     18470
buildingclasstypeid       52320
                          ...  
architecturalstyledesc    52250
buildingclassdesc         52320
heatingorsystemdesc       18470
storydesc                 52273
typeconstructiondesc      52244
Length: 68, dtype: int64

In [23]:
# show average nulls per column
df.isna().mean()

parcelid                  0.000000
typeconstructiontypeid    0.998547
storytypeid               0.999102
heatingorsystemtypeid     0.353020
buildingclasstypeid       1.000000
                            ...   
architecturalstyledesc    0.998662
buildingclassdesc         1.000000
heatingorsystemdesc       0.353020
storydesc                 0.999102
typeconstructiondesc      0.998547
Length: 68, dtype: float64

In [24]:
# create dataframe that has column name as first column
col_nulls = pd.DataFrame()
col_nulls['columns_name'] = df.isna().sum().index
col_nulls

Unnamed: 0,columns_name
0,parcelid
1,typeconstructiontypeid
2,storytypeid
3,heatingorsystemtypeid
4,buildingclasstypeid
...,...
63,architecturalstyledesc
64,buildingclassdesc
65,heatingorsystemdesc
66,storydesc


In [25]:
# create new column that hold the sum of nulls from each column
col_nulls['row_null_count'] = df.isna().sum().values
col_nulls

Unnamed: 0,columns_name,row_null_count
0,parcelid,0
1,typeconstructiontypeid,52244
2,storytypeid,52273
3,heatingorsystemtypeid,18470
4,buildingclasstypeid,52320
...,...,...
63,architecturalstyledesc,52250
64,buildingclassdesc,52320
65,heatingorsystemdesc,18470
66,storydesc,52273


In [26]:
# create new column that hold the average of nulls from each column
col_nulls['row_null_percent'] = df.isna().mean().values
col_nulls

Unnamed: 0,columns_name,row_null_count,row_null_percent
0,parcelid,0,0.000000
1,typeconstructiontypeid,52244,0.998547
2,storytypeid,52273,0.999102
3,heatingorsystemtypeid,18470,0.353020
4,buildingclasstypeid,52320,1.000000
...,...,...,...
63,architecturalstyledesc,52250,0.998662
64,buildingclassdesc,52320,1.000000
65,heatingorsystemdesc,18470,0.353020
66,storydesc,52273,0.999102


In [27]:
# sort values by percent
col_nulls = col_nulls.sort_values(by=['row_null_percent'], ascending=False)
col_nulls

Unnamed: 0,columns_name,row_null_count,row_null_percent
19,finishedsquarefeet15,52320,1.000000
4,buildingclasstypeid,52320,1.000000
64,buildingclassdesc,52320,1.000000
18,finishedsquarefeet13,52320,1.000000
2,storytypeid,52273,0.999102
...,...,...,...
11,bedroomcnt,0,0.000000
29,longitude,0,0.000000
28,latitude,0,0.000000
22,fips,0,0.000000


1. Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, and number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

In [28]:
# Ravinder Exmaple
df2 = pd.DataFrame(df.isnull().sum(axis =1), columns = ['num_cols_missing']).reset_index().groupby('num_cols_missing')\
.count().reset_index().rename(columns = {'index': 'num_rows'})

df2['pct_cols_missing'] = df2.num_cols_missing/df.shape[1]
df2.head()

Unnamed: 0,num_cols_missing,num_rows,pct_cols_missing
0,23,2,0.338235
1,24,12,0.352941
2,25,11,0.367647
3,26,30,0.382353
4,27,177,0.397059


In [29]:
# shows # of null col per row
df.isnull().sum(axis=1)

37223    31
48246    29
15536    31
12106    31
13666    31
         ..
42306    35
31148    36
23911    36
9001     48
28113    48
Length: 52320, dtype: int64

In [30]:
# shows percent of null cols per row
df.isnull().mean(axis=1)

37223    0.455882
48246    0.426471
15536    0.455882
12106    0.455882
13666    0.455882
           ...   
42306    0.514706
31148    0.529412
23911    0.529412
9001     0.705882
28113    0.705882
Length: 52320, dtype: float64

In [31]:
# show # of rows with specific number of null cols
df.isna().sum(axis=1).value_counts()

33    11967
34    11158
32     8885
31     5989
36     4138
35     3469
29     2527
30     2199
37     1020
28      390
38      223
27      177
44       46
26       30
39       29
40       15
24       12
25       11
43       10
42        7
45        6
41        6
23        2
48        2
46        1
47        1
dtype: int64

In [32]:
# shows column nulls in each row
dft = pd.DataFrame(df.isnull().sum(axis=1), columns=['col_nulls'])
dft.head()

Unnamed: 0,col_nulls
37223,31
48246,29
15536,31
12106,31
13666,31


In [33]:
# shows index as id
dft = dft.reset_index()
dft.head()

Unnamed: 0,index,col_nulls
0,37223,31
1,48246,29
2,15536,31
3,12106,31
4,13666,31


In [34]:
# sbow # rows that have a specific number of null columns
dft.col_nulls.value_counts()

33    11967
34    11158
32     8885
31     5989
36     4138
35     3469
29     2527
30     2199
37     1020
28      390
38      223
27      177
44       46
26       30
39       29
40       15
24       12
25       11
43       10
42        7
45        6
41        6
23        2
48        2
46        1
47        1
Name: col_nulls, dtype: int64

In [35]:
# groupby 'null count' to display # of rows have the same # of null cols
dft = dft.groupby('col_nulls').count()
dft.head()

Unnamed: 0_level_0,index
col_nulls,Unnamed: 1_level_1
23,2
24,12
25,11
26,30
27,177


In [36]:
dft  = dft.reset_index()
dft.head()

Unnamed: 0,col_nulls,index
0,23,2
1,24,12
2,25,11
3,26,30
4,27,177


In [37]:
dft.rename(columns = {'index': 'num_row_missing_cols' }, inplace=True)
dft.head()

Unnamed: 0,col_nulls,num_row_missing_cols
0,23,2
1,24,12
2,25,11
3,26,30
4,27,177


In [38]:
dft['col_null_percent'] = df.isnull().mean(axis=1)
dft.head()

Unnamed: 0,col_nulls,num_row_missing_cols,col_null_percent
0,23,2,0.529412
1,24,12,0.485294
2,25,11,0.5
3,26,30,0.470588
4,27,177,0.426471


In [39]:
# Create df with number of rows with a specific number of null columns
row_nulls = pd.DataFrame(df.isna().sum(axis=1).value_counts(), columns=['num_rows_with_n_null_cols'])
row_nulls.head()

Unnamed: 0,num_rows_with_n_null_cols
33,11967
34,11158
32,8885
31,5989
36,4138


In [40]:
# make first columnb the number of nulls
row_nulls = row_nulls.reset_index()
row_nulls.head()

Unnamed: 0,index,num_rows_with_n_null_cols
0,33,11967
1,34,11158
2,32,8885
3,31,5989
4,36,4138


In [41]:
# rename index to match values
row_nulls = row_nulls.rename(columns={'index':'n_null_cols'})
row_nulls.head()

Unnamed: 0,n_null_cols,num_rows_with_n_null_cols
0,33,11967
1,34,11158
2,32,8885
3,31,5989
4,36,4138


In [42]:
# create columsn for percent of null cols
row_nulls['percent_null_cols'] = row_nulls.n_null_cols / df.shape[1]
row_nulls.head()

Unnamed: 0,n_null_cols,num_rows_with_n_null_cols,percent_null_cols
0,33,11967,0.485294
1,34,11158,0.5
2,32,8885,0.470588
3,31,5989,0.455882
4,36,4138,0.529412


In [43]:
# sort df by percentn of null cols
row_nulls = row_nulls.sort_values(by=['percent_null_cols'], ascending=False)
row_nulls

Unnamed: 0,n_null_cols,num_rows_with_n_null_cols,percent_null_cols
23,48,2,0.705882
25,47,1,0.691176
24,46,1,0.676471
20,45,6,0.661765
12,44,46,0.647059
18,43,10,0.632353
19,42,7,0.617647
21,41,6,0.602941
15,40,15,0.588235
14,39,29,0.573529


***

## Prepare

### 1a Remove any properties that are likely to be something other than single unit properties. 

In [44]:
# show value counts for each column, does not include nulls
for col in df.columns:
    print(f'-------------{col}-----------------')
    print(df[df[col].notna()][col].value_counts())

-------------parcelid-----------------
10913809    1
14004381    1
12888190    1
14243968    1
11227265    1
           ..
14147054    1
13096431    1
12660208    1
12789233    1
12853244    1
Name: parcelid, Length: 52320, dtype: int64
-------------typeconstructiontypeid-----------------
6.0    75
4.0     1
Name: typeconstructiontypeid, dtype: int64
-------------storytypeid-----------------
7.0    47
Name: storytypeid, dtype: int64
-------------heatingorsystemtypeid-----------------
2.0     20689
7.0     12527
6.0       517
20.0       85
13.0       16
1.0         7
18.0        6
10.0        2
24.0        1
Name: heatingorsystemtypeid, dtype: int64
-------------buildingclasstypeid-----------------
Series([], Name: buildingclasstypeid, dtype: int64)
-------------architecturalstyletypeid-----------------
7.0     62
3.0      3
2.0      2
21.0     2
8.0      1
Name: architecturalstyletypeid, dtype: int64
-------------airconditioningtypeid-----------------
1.0     11873
13.0     1567
5.0   

Name: typeconstructiondesc, dtype: int64


In [45]:
df.shape

(52320, 68)

In [46]:
df.unitcnt.isna().sum()

18558

In [47]:
# remove propeties that have no bedrooms and no bathrooms and too small of an area
df = df[(df.bedroomcnt > 0) & (df.bathroomcnt > 0) & (df.unitcnt <= 1) | df.unitcnt.isna() & (df.calculatedfinishedsquarefeet > 500)]

In [48]:
df.shape

(52147, 68)

***

### Create a function that will drop rows or columns based on the percent of values that are missing: handle_missing_values(df, prop_required_column, prop_required_row)

In [49]:
# Ravinder's Example
# def handle_missing_values(df, prop_required_column = .5, prop_required_row = .70):
#     threshold = int(round(prop_required_column*len(df.index),0))
#     df.dropna(axis=1, thresh=threshold, inplace=True)
#     threshold = int(round(prop_required_row*len(df.columns),0))
#     df.dropna(axis=0, thresh=threshold, inplace=True)
#     return df

In [50]:
df.index

Int64Index([37223, 48246, 15536, 12106, 13666,  4381, 36294, 10462, 42662,
             7623,
            ...
            50786,  6576,  8832,  6809, 49219, 37075,  2794, 42306, 31148,
            23911],
           dtype='int64', length=52147)

In [51]:
df.shape

(52147, 68)

In [52]:
len(df.index)

52147

In [53]:
len(df.columns)

68

In [54]:
dft = df.copy()
dft.shape

(52147, 68)

In [55]:
thresh_col = .5 * dft.shape[0]
thresh_col

26073.5

In [56]:
dft = dft.dropna(axis=1, thresh=thresh_col)
dft.shape

(52147, 34)

In [57]:
thresh_row = .7 * dft.shape[1]
thresh_row

23.799999999999997

In [58]:
dft.dropna(axis=0, thresh=thresh_row).shape

(52147, 34)

In [59]:
def handle_missing_values(df, percent_required_cols = .5, percent_required_rows = .7):
    
    # set threshold for min of values in columns for dropping
    thresh_col = int(round(percent_required_cols * df.shape[0]))
    
    # drop columns that don't meed threshhold for non-null values (rows without nulls)
    df = df.dropna(axis=1, thresh=thresh_col)
    
    # set threshold for min non-null values for rows (cols without nulls)
    thresh_row = int(round(percent_required_rows * df.shape[1]))
    
    # drop rows with don't meet threshold for non-null values for columns
    df = df.dropna(axis=0, thresh=thresh_row)
    
    return df, thresh_col, thresh_row,

In [60]:
df, thresh_col, thresh_row  = handle_missing_values(df, percent_required_cols = .5, percent_required_rows = .5)

In [61]:
df.shape, thresh_col, thresh_row

((52147, 34), 26074, 17)

***

### Deal with the rest of the Nulls

In [62]:
df.shape

(52147, 34)

In [63]:
df.isna().sum()

parcelid                            0
heatingorsystemtypeid           18334
propertylandusetypeid               0
id                                  0
bathroomcnt                         0
bedroomcnt                          0
buildingqualitytypeid           18536
calculatedbathnbr                  38
calculatedfinishedsquarefeet        0
finishedsquarefeet12              159
fips                                0
fullbathcnt                        38
latitude                            0
longitude                           0
lotsizesquarefeet                 350
propertycountylandusecode           0
propertyzoningdesc              18471
rawcensustractandblock              0
regionidcity                     1030
regionidcounty                      0
regionidzip                        23
roomcnt                             0
unitcnt                         18456
yearbuilt                          37
structuretaxvaluedollarcnt         69
taxvaluedollarcnt                   1
assessmentye

In [64]:
# remove columns that are not useful
df = df.drop(columns=['parcelid',
                'id',
                 #Description of the allowed land uses (zoning) for that property
                 'propertyzoningdesc', 
                 # Finished living area
                 'finishedsquarefeet12',
                 #  Census tract and block ID combined - also contains blockgroup assignment by extension
                 'censustractandblock',
                      # Type of land use the property is zoned for
                      'propertylandusetypeid',
                      #  Type of home heating system
                      'heatingorsystemtypeid',
                      
                 
                ])

In [65]:
df.shape

(52147, 27)

In [66]:
df.isna().sum()

bathroomcnt                         0
bedroomcnt                          0
buildingqualitytypeid           18536
calculatedbathnbr                  38
calculatedfinishedsquarefeet        0
fips                                0
fullbathcnt                        38
latitude                            0
longitude                           0
lotsizesquarefeet                 350
propertycountylandusecode           0
rawcensustractandblock              0
regionidcity                     1030
regionidcounty                      0
regionidzip                        23
roomcnt                             0
unitcnt                         18456
yearbuilt                          37
structuretaxvaluedollarcnt         69
taxvaluedollarcnt                   1
assessmentyear                      0
landtaxvaluedollarcnt               1
taxamount                           4
propertylandusedesc                 0
logerror                            0
transactiondate                     0
heatingorsys

In [67]:
df.heatingorsystemdesc.value_counts()

Central       20671
Floor/Wall    12508
Forced air      517
Solar            85
None             16
Baseboard         7
Radiant           6
Gravity           2
Yes               1
Name: heatingorsystemdesc, dtype: int64

In [68]:
df[df.fips == 6037].heatingorsystemdesc.value_counts(dropna=False)

Central       20671
Floor/Wall    12479
NaN             503
Solar            85
Name: heatingorsystemdesc, dtype: int64

In [69]:
df[df.fips == 6059].heatingorsystemdesc.value_counts(dropna=False)

NaN           13469
Forced air      517
Floor/Wall       29
None             16
Baseboard         7
Radiant           6
Gravity           2
Yes               1
Name: heatingorsystemdesc, dtype: int64

In [70]:
df[df.fips == 6111].heatingorsystemdesc.value_counts(dropna=False)

NaN    4362
Name: heatingorsystemdesc, dtype: int64

In [71]:
# relacing nulls with 'None'
df.heatingorsystemdesc.fillna('None', inplace=True)

In [72]:
df.isna().sum()

bathroomcnt                         0
bedroomcnt                          0
buildingqualitytypeid           18536
calculatedbathnbr                  38
calculatedfinishedsquarefeet        0
fips                                0
fullbathcnt                        38
latitude                            0
longitude                           0
lotsizesquarefeet                 350
propertycountylandusecode           0
rawcensustractandblock              0
regionidcity                     1030
regionidcounty                      0
regionidzip                        23
roomcnt                             0
unitcnt                         18456
yearbuilt                          37
structuretaxvaluedollarcnt         69
taxvaluedollarcnt                   1
assessmentyear                      0
landtaxvaluedollarcnt               1
taxamount                           4
propertylandusedesc                 0
logerror                            0
transactiondate                     0
heatingorsys

In [73]:
df.buildingqualitytypeid.value_counts(dropna=False)

NaN     18536
6.0     10250
8.0      8219
4.0      8126
7.0      3420
5.0      1497
9.0      1118
11.0      514
10.0      229
3.0       147
12.0       80
1.0         8
2.0         3
Name: buildingqualitytypeid, dtype: int64

In [74]:
df[df.fips == 6037].buildingqualitytypeid.value_counts(dropna=False)

6.0     10250
8.0      8219
4.0      8126
7.0      3420
5.0      1497
9.0      1118
11.0      514
10.0      229
3.0       147
NaN       127
12.0       80
1.0         8
2.0         3
Name: buildingqualitytypeid, dtype: int64

In [75]:
df[df.fips == 6059].buildingqualitytypeid.value_counts(dropna=False)

NaN    14047
Name: buildingqualitytypeid, dtype: int64

In [76]:
df[df.fips == 6111].buildingqualitytypeid.value_counts(dropna=False)

NaN    4362
Name: buildingqualitytypeid, dtype: int64

In [77]:
# droping buildingqualitytypeid because they are null for Ventura and Orange
df.drop(columns=['buildingqualitytypeid'], inplace=True)

In [78]:
df.isna().sum()

bathroomcnt                         0
bedroomcnt                          0
calculatedbathnbr                  38
calculatedfinishedsquarefeet        0
fips                                0
fullbathcnt                        38
latitude                            0
longitude                           0
lotsizesquarefeet                 350
propertycountylandusecode           0
rawcensustractandblock              0
regionidcity                     1030
regionidcounty                      0
regionidzip                        23
roomcnt                             0
unitcnt                         18456
yearbuilt                          37
structuretaxvaluedollarcnt         69
taxvaluedollarcnt                   1
assessmentyear                      0
landtaxvaluedollarcnt               1
taxamount                           4
propertylandusedesc                 0
logerror                            0
transactiondate                     0
heatingorsystemdesc                 0
dtype: int64

In [79]:
df.unitcnt.value_counts(dropna=False)

1.0    33691
NaN    18456
Name: unitcnt, dtype: int64

In [80]:
df[df.fips == 6037].unitcnt.value_counts(dropna=False)

1.0    33686
NaN       52
Name: unitcnt, dtype: int64

In [81]:
df[df.fips == 6059].unitcnt.value_counts(dropna=False)

NaN    14042
1.0        5
Name: unitcnt, dtype: int64

In [82]:
df[df.fips == 6111].unitcnt.value_counts(dropna=False)

NaN    4362
Name: unitcnt, dtype: int64

In [83]:
# drop unitcnt, no longer needed
df.drop(columns=['unitcnt'], inplace=True)

In [84]:
df.isna().sum()

bathroomcnt                        0
bedroomcnt                         0
calculatedbathnbr                 38
calculatedfinishedsquarefeet       0
fips                               0
fullbathcnt                       38
latitude                           0
longitude                          0
lotsizesquarefeet                350
propertycountylandusecode          0
rawcensustractandblock             0
regionidcity                    1030
regionidcounty                     0
regionidzip                       23
roomcnt                            0
yearbuilt                         37
structuretaxvaluedollarcnt        69
taxvaluedollarcnt                  1
assessmentyear                     0
landtaxvaluedollarcnt              1
taxamount                          4
propertylandusedesc                0
logerror                           0
transactiondate                    0
heatingorsystemdesc                0
dtype: int64

In [85]:
# dropping the rest of the nulls
df.dropna(inplace=True)

In [86]:
df.isna().sum()

bathroomcnt                     0
bedroomcnt                      0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
fips                            0
fullbathcnt                     0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
propertylandusedesc             0
logerror                        0
transactiondate                 0
heatingorsystemdesc             0
dtype: int64

***

In [87]:
import wrangle

In [88]:
df = wrangle.wrangle_zillow()
df.shape

AttributeError: 'tuple' object has no attribute 'shape'

In [1]:
df.logerror

NameError: name 'df' is not defined

In [None]:
def remove_outliers(df):

    # prep for outlier removal: not including categories fips, pools, zipcode
    cols_list = df.drop(columns=[
    'heatingorsystemtypeid',
     
     'calculatedbathnbr',
     'calculatedfinishedsquarefeet',
     'fips',
     'fullbathcnt',
     
  
     'propertycountylandusecode',
     'rawcensustractandblock',
     'regionidcity',
     'regionidcounty',
     'regionidzip',
     'roomcnt',
     'yearbuilt',
 
     
     'assessmentyear',
     'landtaxvaluedollarcnt',
    
     'propertylandusedesc',
     'logerror',
     'transactiondate'])


    # remove outliers from each column in cols_list
    for col in cols_list:

        q1, q3 = df[col].quantile([.25, .75])  # get quartiles

        iqr = q3 - q1   # calculate interquartile range

        upper_bound = q3 + 2 * iqr   # get upper bound
        lower_bound = q1 - 2 * iqr   # get lower bound

        # return dataframe without outliers

        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
        
    return df

In [None]:
df = remove_outliers(df)

In [None]:
df.shape