In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Acquire data from mySQL using the python module to connect and query. You will want to end with a single dataframe. Make sure to include: the logerror, all fields related to the properties that are available. You will end up using all the tables in the database.

- Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.
- Only include properties with a transaction in 2017, and include only the last transaction for each property (so no duplicate property ID's), along with zestimate error and date of transaction.
- Only include properties that include a latitude and longitude value.

In [2]:
# import env file for hostname, username, password, and db_name
from env import host, user, password, db_name

In [3]:
# Pass env file authentication to container 'url'
url = f'mysql+pymysql://{user}:{password}@{host}/{db_name}'

In [1]:
# define sql search for all records from all tables in Zillow database
sql = """
SELECT prop.*,
       pred.logerror, 
       pred.transactiondate, 
       air.airconditioningdesc, 
       arch.architecturalstyledesc, 
       build.buildingclassdesc, 
       heat.heatingorsystemdesc, 
       landuse.propertylandusedesc, 
       story.storydesc, 
       construct.typeconstructiondesc 

FROM   properties_2017 prop  
JOIN (SELECT parcelid,
				  logerror,
				  Max(transactiondate) transactiondate 
		   FROM   predictions_2017 
		   GROUP  BY parcelid, logerror) pred
	   USING (parcelid) 
LEFT JOIN airconditioningtype air USING (airconditioningtypeid) 
LEFT JOIN architecturalstyletype arch USING (architecturalstyletypeid) 
LEFT JOIN buildingclasstype build USING (buildingclasstypeid) 
LEFT JOIN heatingorsystemtype heat USING (heatingorsystemtypeid) 
LEFT JOIN propertylandusetype landuse USING (propertylandusetypeid) 
LEFT JOIN storytype story USING (storytypeid) 
LEFT JOIN typeconstructiontype construct USING (typeconstructiontypeid) 
WHERE  prop.latitude IS NOT NULL 
AND prop.longitude IS NOT NULL
AND transactiondate < '2018-01-01' 
"""

In [32]:
# load zillow data from saved csv or pull from sql server and save to csv
import os
file = 'zillow_data.csv'
if os.path.isfile(file):
    df = pd.read_csv(file, index_col=0)
else:
    df = pd.read_sql(sql,url)
    df.to_csv(file)

In [34]:
df.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,censustractandblock,propertylandusedesc,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
0,14297519,,,,,,,261.0,1727539,,...,60590630000000.0,Single Family Residential,0.025595,2017-01-01,,,,,,
1,17052889,,,,,,,261.0,1387261,,...,61110010000000.0,Single Family Residential,0.055619,2017-01-01,,,,,,
2,14186244,,,,,,,261.0,11677,,...,60590220000000.0,Single Family Residential,0.005383,2017-01-01,,,,,,
3,12177905,,,2.0,,,,261.0,2288172,,...,60373000000000.0,Single Family Residential,-0.10341,2017-01-01,,,,Central,,
4,12095076,,,2.0,,,1.0,261.0,781532,,...,60374610000000.0,Single Family Residential,-0.001011,2017-01-01,Central,,,Central,,


In [33]:
# df_3 shape
df.shape

(52438, 68)

In [7]:
df.parcelid.duplicated().sum()

118

In [8]:
# Sort df by parcelid and then by transaction date to group by parcelid and to make sure transaction date is last
df = df.sort_values(by=['parcelid','transactiondate'])
df.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,censustractandblock,propertylandusedesc,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
37223,10711855,,,2.0,,,,261.0,1087254,,...,60371130000000.0,Single Family Residential,-0.007357,2017-07-07,,,,Central,,
48246,10711877,,,2.0,,,1.0,261.0,1072280,,...,60371130000000.0,Single Family Residential,0.021066,2017-08-29,Central,,,Central,,
15536,10711888,,,2.0,,,1.0,261.0,1340933,,...,60371130000000.0,Single Family Residential,0.077174,2017-04-04,Central,,,Central,,
12106,10711910,,,2.0,,,,261.0,1878109,,...,60371130000000.0,Single Family Residential,-0.041238,2017-03-17,,,,Central,,
13666,10711923,,,2.0,,,,261.0,2190858,,...,60371130000000.0,Single Family Residential,-0.009496,2017-03-24,,,,Central,,


In [9]:
df = df.drop_duplicates(subset='parcelid', keep='last')

In [10]:
df.parcelid.duplicated().sum()

0

In [11]:
df.shape

(52320, 68)

In [12]:
# Replace blank values with NaN
df = df.replace('',np.nan)

In [13]:
# check if long/lat columns have nulls
df.longitude.isnull().sum(), df.latitude.isnull().sum()

(0, 0)

***

### 2. Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)

In [15]:
# Describe without scientific notation
df.describe().apply(lambda s: s.apply(lambda x: format(x, 'g')))

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,logerror,buildingclassdesc
count,52320.0,76.0,47,33850.0,0.0,70.0,13615.0,52320,52320.0,47.0,...,81,52238.0,52319.0,52320,52319.0,52316.0,2068.0,52199.0,52320.0,0.0
mean,12996800.0,5.97368,7,3.96561,,7.1,2.43959,261,1496910.0,678.979,...,1,196636.0,529824.0,2016,333492.0,6454.74,14.1011,60502400000000.0,0.0181378,
std,3350920.0,0.229416,0,2.56266,,2.66567,3.84793,0,859433.0,711.825,...,0,254286.0,751830.0,0,570511.0,8752.48,2.4004,1861130000000.0,0.176903,
min,10711900.0,4.0,7,1.0,,2.0,1.0,261,349.0,38.0,...,1,129.0,1000.0,2016,161.0,49.18,4.0,60371000000000.0,-4.65542,
25%,11510200.0,6.0,7,2.0,,7.0,1.0,261,757614.0,263.5,...,1,77159.0,194033.0,2016,76194.0,2660.98,14.0,60374000000000.0,-0.0247003,
50%,12578300.0,6.0,7,2.0,,7.0,1.0,261,1500130.0,512.0,...,1,131905.0,374006.0,2016,218079.0,4650.57,15.0,60376200000000.0,0.00694008,
75%,14130400.0,6.0,7,7.0,,7.0,1.0,261,2241330.0,809.5,...,1,226453.0,619354.0,2016,408777.0,7379.27,15.0,60590400000000.0,0.0406021,
max,167688000.0,6.0,7,24.0,,21.0,13.0,261,2982270.0,3560.0,...,1,9164900.0,49061200.0,2016,48952200.0,586639.0,99.0,483030000000000.0,5.263,


In [16]:
df.shape

(52320, 68)

In [17]:
# df.hist()
# plt.tight_layout();

In [18]:
# for col in df.columns:
#     df[col].hist()

In [19]:
# for col in df.columns:
#     print(f'_-------------- {col} ---------------')
#     print(df[col].value_counts())

In [20]:
#df.info(verbose=True)

***

### 3. Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an atttribute name, the first column is the number of rows with missing values for that attribute, and the second column is percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [21]:
# show nulls per column
df.isna().sum()

parcelid                      0
typeconstructiontypeid    52244
storytypeid               52273
heatingorsystemtypeid     18470
buildingclasstypeid       52320
                          ...  
architecturalstyledesc    52250
buildingclassdesc         52320
heatingorsystemdesc       18470
storydesc                 52273
typeconstructiondesc      52244
Length: 68, dtype: int64

In [40]:
# show nulls per column
df.isna().mean()

parcelid                  0.000000
typeconstructiontypeid    0.998551
storytypeid               0.999104
heatingorsystemtypeid     0.352874
buildingclasstypeid       1.000000
                            ...   
architecturalstyledesc    0.998665
buildingclassdesc         1.000000
heatingorsystemdesc       0.352874
storydesc                 0.999104
typeconstructiondesc      0.998551
Length: 68, dtype: float64

In [43]:
col_null = pd.DataFrame()
col_null['columns_name'] = df.isna().sum().index
col_null

Unnamed: 0,columns_name
0,parcelid
1,typeconstructiontypeid
2,storytypeid
3,heatingorsystemtypeid
4,buildingclasstypeid
...,...
64,buildingclassdesc
65,heatingorsystemdesc
66,storydesc
67,typeconstructiondesc


In [44]:
col_null['null_count'] = df.isna().sum().values
col_null

In [45]:
col_null['null_percent'] = df.isna().mean().values
col_null

Unnamed: 0,columns_name,null_count,null_percent
0,parcelid,0,0.000000
1,typeconstructiontypeid,52362,0.998551
2,storytypeid,52391,0.999104
3,heatingorsystemtypeid,18504,0.352874
4,buildingclasstypeid,52438,1.000000
...,...,...,...
64,buildingclassdesc,52438,1.000000
65,heatingorsystemdesc,18504,0.352874
66,storydesc,52391,0.999104
67,typeconstructiondesc,52362,0.998551


1. Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, and number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

In [48]:
# show the number of columns missing from each row
df.isna().sum(axis=1)

0        36
1        33
2        34
3        32
4        29
         ..
52433    34
52434    33
52435    32
52436    32
52437    34
Length: 52438, dtype: int64

In [49]:
# show the percent of columns missing from each row
df.isna().mean(axis=1)

0        0.521739
1        0.478261
2        0.492754
3        0.463768
4        0.420290
           ...   
52433    0.492754
52434    0.478261
52435    0.463768
52436    0.463768
52437    0.492754
Length: 52438, dtype: float64

In [52]:
# show the amount of rows missing specific number of column
df.isna().sum(axis=1).value_counts()

33    11995
34    11180
32     8901
31     6004
36     4147
35     3478
29     2531
30     2204
37     1023
28      392
38      227
27      177
44       47
26       30
39       29
40       15
24       12
25       11
43       10
42        7
45        6
41        6
23        2
48        2
46        1
47        1
dtype: int64

In [54]:
row_nulls = pd.DataFrame()
row_nulls['num_cols_missing'] = df.isna().sum().values
row_nulls

Unnamed: 0,num_cols_missing
0,0
1,52362
2,52391
3,18504
4,52438
...,...
64,52438
65,18504
66,52391
67,52362


In [55]:
row_nulls['percent_cols_missing'] = df.isna().mean().values
row_nulls

Unnamed: 0,num_cols_missing,percent_cols_missing
0,0,0.000000
1,52362,0.998551
2,52391,0.999104
3,18504,0.352874
4,52438,1.000000
...,...,...
64,52438,1.000000
65,18504,0.352874
66,52391,0.999104
67,52362,0.998551


In [60]:
df.isna().sum(axis=1).value_counts().values

array([11995, 11180,  8901,  6004,  4147,  3478,  2531,  2204,  1023,
         392,   227,   177,    47,    30,    29,    15,    12,    11,
          10,     7,     6,     6,     2,     2,     1,     1])

In [57]:
df.isna().sum(axis=1).value_counts()

33    11995
34    11180
32     8901
31     6004
36     4147
35     3478
29     2531
30     2204
37     1023
28      392
38      227
27      177
44       47
26       30
39       29
40       15
24       12
25       11
43       10
42        7
45        6
41        6
23        2
48        2
46        1
47        1
dtype: int64

In [56]:
row_nulls['num_row'] = df.isna().sum(axis=1).value_counts().values
row_nulls

ValueError: Length of values (26) does not match length of index (69)

In [53]:
row_nulls = pd.DataFrame()
row_nulls['num_cols_missing'] = df.isna().sum().values
row_nulls['percent_cols_missing'] = df.isna().mean().values
row_nulls['num_row'] = df.isna().sum(axis=1).value_counts().values
row_nulls

ValueError: Length of values (26) does not match length of index (69)

In [23]:
# transpose to make each row an attribute name
df = df.T
df.head()

Unnamed: 0,37223,48246,15536,12106,13666,4381,36294,10462,42662,7623,...,8832,6809,49219,37075,2794,42306,31148,23911,9001,28113
parcelid,10711855.0,10711877.0,10711888.0,10711910.0,10711923.0,10711945.0,10711956.0,10711995.0,10712005.0,10712007.0,...,167636496.0,167636512.0,167636726.0,167636727.0,167637054.0,167637371.0,167638003.0,167639152.0,167686999.0,167687839.0
typeconstructiontypeid,,,,,,,,,,,...,,,,,,,,,,
storytypeid,,,,,,,,,,,...,,,,,,,,,,
heatingorsystemtypeid,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,7.0,2.0,2.0,2.0,,
buildingclasstypeid,,,,,,,,,,,...,,,,,,,,,,


In [24]:
#df.info(verbose=True)

In [25]:
# Add new column to of NaN sums to column position 0
df.insert(0, 'null_count', df.isnull().sum(axis=1))

In [26]:
df.head()

Unnamed: 0,null_count,37223,48246,15536,12106,13666,4381,36294,10462,42662,...,8832,6809,49219,37075,2794,42306,31148,23911,9001,28113
parcelid,0,10711855.0,10711877.0,10711888.0,10711910.0,10711923.0,10711945.0,10711956.0,10711995.0,10712005.0,...,167636496.0,167636512.0,167636726.0,167636727.0,167637054.0,167637371.0,167638003.0,167639152.0,167686999.0,167687839.0
typeconstructiontypeid,52244,,,,,,,,,,...,,,,,,,,,,
storytypeid,52273,,,,,,,,,,...,,,,,,,,,,
heatingorsystemtypeid,18470,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,7.0,2.0,2.0,2.0,,
buildingclasstypeid,52320,,,,,,,,,,...,,,,,,,,,,


In [28]:
# Make second column as percent of total rows that have missing values for that attribute.
df.insert(1, 'null_percent', round((df.isnull().sum(axis=1) / df.shape[1])*100))
df.head()

Unnamed: 0,null_count,null_percent,37223,48246,15536,12106,13666,4381,36294,10462,...,8832,6809,49219,37075,2794,42306,31148,23911,9001,28113
parcelid,0,0.0,10711855.0,10711877.0,10711888.0,10711910.0,10711923.0,10711945.0,10711956.0,10711995.0,...,167636496.0,167636512.0,167636726.0,167636727.0,167637054.0,167637371.0,167638003.0,167639152.0,167686999.0,167687839.0
typeconstructiontypeid,52244,100.0,,,,,,,,,...,,,,,,,,,,
storytypeid,52273,100.0,,,,,,,,,...,,,,,,,,,,
heatingorsystemtypeid,18470,35.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,7.0,2.0,2.0,2.0,,
buildingclasstypeid,52320,100.0,,,,,,,,,...,,,,,,,,,,


In [29]:
# show number of columsn with percent of nulls
df.null_percent.value_counts()

0.0      27
100.0    14
35.0      4
96.0      3
92.0      2
74.0      2
99.0      2
66.0      2
98.0      2
64.0      1
79.0      1
72.0      1
97.0      1
81.0      1
36.0      1
86.0      1
2.0       1
87.0      1
1.0       1
Name: null_percent, dtype: int64

### Notes:
- 14 columsn have all nulls

***

### 1. Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, and number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

***