In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Acquire data from MySQL using the Python module to connect and query. You will want to end with a single dataframe. Make sure to include: the logerror, all fields related to the properties that are available. You will end up using all the tables in the database.

- Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.
- Only include properties with a transaction in 2017, and include only the last transaction for each property (so no duplicate property IDs), along with zestimate error and date of transaction.
- Only include properties that include a latitude and longitude value.

In [2]:
# import env file for hostname, username, password, and db_name
from env import host, user, password, db_name

In [3]:
# Build the database connection URL (mysql dialect, pymysql driver) from the
# credentials imported from the env file.
# NOTE(review): user and password are interpolated verbatim; characters such as
# '@', ':' or '/' in them would presumably need URL-encoding -- confirm.
url = f'mysql+pymysql://{user}:{password}@{host}/{db_name}'

In [19]:
# SQL query that pulls the Zillow data into one result set: properties_2017 is
# inner-joined to its land-use lookup and LEFT JOINed to predictions_2017, to
# each descriptive lookup table and to unique_properties, so a missing type id
# (e.g. airconditioningtypeid) does not drop the property.
# The WHERE clause keeps rows with transactiondate before 2018-01-01 and
# propertylandusetypeid 261.
# NOTE(review): presumably transactiondate comes from predictions_2017; if so,
# filtering on it in WHERE removes properties without a prediction even though
# that table is LEFT JOINed -- confirm this is intended.
# NOTE(review): the query has no latitude/longitude filter and does not reduce
# the result to one transaction per parcelid.
sql ='''
SELECT *
FROM properties_2017 prop17
JOIN propertylandusetype USING (propertylandusetypeid)
LEFT JOIN predictions_2017 pred17 USING (parcelid)
LEFT JOIN airconditioningtype USING (airconditioningtypeid)
LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
LEFT JOIN buildingclasstype USING (buildingclasstypeid)
LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
LEFT JOIN storytype USING (storytypeid)
LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)
LEFT JOIN unique_properties USING (parcelid)
WHERE transactiondate < '2018-01-01' AND propertylandusetypeid='261'
    '''

In [76]:
# Reuse the local CSV cache when it exists; otherwise run the query once
# against the SQL server and write the result to the cache file.
import os
file = 'zillow_data_2.csv'
if not os.path.isfile(file):
    # no cache yet: query the database, then persist the result
    df = pd.read_sql(sql, url)
    df.to_csv(file)
else:
    # the first CSV column holds the index written by to_csv
    df = pd.read_csv(file, index_col=0)

In [77]:
# (rows, columns) of the loaded frame
df.shape

(52441, 69)

In [80]:
# parcelid values with repeats removed, keeping the last occurrence of each
df['parcelid'].drop_duplicates(keep='last')

0        14297519
1        17052889
2        14186244
3        12177905
4        12095076
           ...   
52436    12412492
52437    11000655
52438    17239384
52439    12773139
52440    12826780
Name: parcelid, Length: 52320, dtype: int64

In [74]:
# Shape after removing fully duplicated rows (last copy kept). Computed
# directly from df so this cell does not rely on df_no_dup, which is only
# assigned in a later cell and would raise NameError on a fresh
# top-to-bottom run.
df.drop_duplicates(keep='last').shape

(52441, 69)

In [69]:
# Remove rows that are exact copies of another row (keeping the last copy),
# then display every remaining row whose parcelid occurs more than once.
df_no_dup = df.drop_duplicates(keep='last')
df_no_dup.loc[df_no_dup['parcelid'].duplicated(keep=False)]

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,propertylandusedesc,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc
671,11721753,,,7.0,,,,261.0,616260,,...,Single Family Residential,1017,-0.011052,2017-01-05,,,,Floor/Wall,,
672,11721753,,,7.0,,,,261.0,616260,,...,Single Family Residential,1018,0.017785,2017-07-21,,,,Floor/Wall,,
834,11289917,,,2.0,,,1.0,261.0,2061546,,...,Single Family Residential,1247,0.227903,2017-01-06,Central,,,Central,,
835,11289917,,,2.0,,,1.0,261.0,2061546,,...,Single Family Residential,1248,-0.362001,2017-06-23,Central,,,Central,,
1195,11705026,,,2.0,,,,261.0,1834372,,...,Single Family Residential,1771,-0.034286,2017-01-10,,,,Central,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38621,14448410,,,,,,,261.0,2309078,,...,Single Family Residential,57015,1.648574,2017-08-31,,,,,,
40594,13066981,,,2.0,,,1.0,261.0,2008746,,...,Single Family Residential,60009,0.016734,2017-07-24,Central,,,Central,,
40595,13066981,,,2.0,,,1.0,261.0,2008746,,...,Single Family Residential,60010,-0.043423,2017-09-01,Central,,,Central,,
43477,17282392,,,,,,,261.0,2938730,,...,Single Family Residential,64277,-0.002968,2017-08-07,,,,,,


In [73]:
# Count rows that share both parcelid and transactiondate with another row,
# once exact duplicate rows have been removed.
(
    df.drop_duplicates(keep='last')
      .duplicated(subset=['parcelid', 'transactiondate'], keep=False)
      .sum()
)

0

In [81]:
# Boolean mask: True for every repeat of a parcelid after its first occurrence
df['parcelid'].duplicated(keep='first')

0        False
1        False
2        False
3        False
4        False
         ...  
52436    False
52437    False
52438    False
52439    False
52440    False
Name: parcelid, Length: 52441, dtype: bool

In [56]:
# Latest transaction for each parcel that appears more than once.
# duplicated(keep='first') alone would keep every non-first occurrence, so a
# parcel with three or more transactions would contribute several rows, and
# "last" would depend on row order rather than on the transaction date.
# Instead: take all rows of repeated parcels, order them by transactiondate
# (ISO 'YYYY-MM-DD' values order chronologically; mergesort keeps ties in
# their original row order), keep the final row per parcelid, then restore
# the original index order for display.
df_last = (
    df[df.parcelid.duplicated(keep=False)]
    .sort_values('transactiondate', kind='mergesort')
    .drop_duplicates(subset='parcelid', keep='last')
    .sort_index()
)
df_last[['parcelid','transactiondate']]

Unnamed: 0,parcelid,transactiondate
672,11721753,2017-07-21
835,11289917,2017-06-23
1196,11705026,2017-06-30
1381,14269464,2017-06-01
1796,11446756,2017-08-23
...,...,...
35442,12621730,2017-08-29
36901,10956664,2017-08-31
38621,14448410,2017-08-31
40595,13066981,2017-09-01


In [50]:
# Every row belonging to a parcelid that occurs more than once, shown with
# just the parcel id and transaction date.
df_1 = df.loc[df['parcelid'].duplicated(keep=False)]
df_1.loc[:, ['parcelid', 'transactiondate']]

Unnamed: 0,parcelid,transactiondate
671,11721753,2017-01-05
672,11721753,2017-07-21
834,11289917,2017-01-06
835,11289917,2017-06-23
1195,11705026,2017-01-10
...,...,...
38621,14448410,2017-08-31
40594,13066981,2017-07-24
40595,13066981,2017-09-01
43477,17282392,2017-08-07


In [18]:
# Rows that are exact copies (all columns equal) of an earlier row
df.loc[df.duplicated(keep='first')]

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,propertylandusetypeid,id,basementsqft,...,propertylandusedesc,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,storydesc,typeconstructiondesc


***

### 2. Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)

***

### 3. Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an attribute name, the first column is the number of rows with missing values for that attribute, and the second column is percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

***

### 1. Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, and number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

***