In [1]:
#basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import env
from os.path import exists

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Acquire and Summarize

You will want to end with a single dataframe. Include the logerror field and all other fields related to the properties that are available. You will end up using all the tables in the database.

- Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.  
- Only include properties with a transaction in 2017, and include only the last transaction for each property (so no duplicate property IDs), along with zestimate error and date of transaction. (Hint: read the docs for the .duplicated method)   
- Only include properties that have a latitude and longitude value.

In [2]:

def get_connection(db, user=env.user, host=env.host, password=env.password):
    '''
    Build the connection URL string for a MySQL database reached through pymysql.

    parameters:
        db: name of the database (schema) to connect to
        user, host, password: connection settings; each defaults to the
            attribute of the same name on the env module

    returns: a string of the form mysql+pymysql://user:password@host/db
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return 'mysql+pymysql://' + credentials + '@' + location

def get_zillow_data():
    '''
    Load the Zillow property data into a single Pandas DataFrame.

    If a file named 'zillow_data.csv' exists in the working directory it is read
    and returned; otherwise the query below is run against the 'zillow' database
    using the URL built by get_connection.

    The query left-joins properties_2017 to predictions_2017, unique_properties
    and the *type lookup tables, keeps rows whose propertylandusedesc is
    "Single Family Residential" or "Inferred Single Family Residential" and
    whose transactiondate contains 2017, and exposes properties_2017.id under
    the extra name property_id.

    parameters: None

    returns: a single Pandas DataFrame (no index column is set)
    '''
    # A local CSV takes priority. Note that this function only ever reads that
    # file; nothing here writes it.
    if exists('zillow_data.csv'):
        return pd.read_csv('zillow_data.csv')

    zillow_query = """
    SELECT *, properties_2017.id as property_id
    FROM properties_2017 
    LEFT JOIN propertylandusetype USING (propertylandusetypeid)
    LEFT JOIN predictions_2017 USING (parcelid)
    LEFT JOIN unique_properties USING (parcelid)
    LEFT JOIN airconditioningtype USING (airconditioningtypeid)
    LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
    LEFT JOIN buildingclasstype USING (buildingclasstypeid)
    LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
    LEFT JOIN storytype USING (storytypeid)
    LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)

    WHERE propertylandusedesc IN ("Single Family Residential", "Inferred Single Family Residential") 
        AND transactiondate like '%%2017%%'
    """
    return pd.read_sql(zillow_query, get_connection('zillow'))

In [3]:
df = get_zillow_data()

In [4]:
# Keep exactly one row per property: the one with the latest transaction.
# drop_duplicates(keep='last') follows row order, and the query has no ORDER BY,
# so sort by transactiondate first (stable sort, so ties keep their original
# relative order) and restore the original row order afterwards.
# NOTE(review): assumes transactiondate sorts chronologically (ISO-formatted
# strings or date objects) -- confirm against the loaded dtype.
df = (
    df.sort_values('transactiondate', kind='mergesort')
      .drop_duplicates(subset='property_id', keep='last')
      .sort_index()
)

In [5]:
#two brackets to return a dataframe of two columns
# print(df[df.duplicated(subset= 'property_id', keep=False)][['property_id','transactiondate']].sort_values(by=['property_id', 'transactiondate']))
# df[df.duplicated(subset= 'property_id', keep='last')][['property_id','transactiondate']].sort_values(by=['property_id', 'transactiondate'])

In [6]:
df.info() # --> no nulls in lat/longitude value

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52320 entries, 0 to 52440
Data columns (total 70 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   typeconstructiontypeid        76 non-null     float64
 1   storytypeid                   47 non-null     float64
 2   heatingorsystemtypeid         33850 non-null  float64
 3   buildingclasstypeid           0 non-null      object 
 4   architecturalstyletypeid      70 non-null     float64
 5   airconditioningtypeid         13615 non-null  float64
 6   parcelid                      52320 non-null  int64  
 7   propertylandusetypeid         52320 non-null  float64
 8   id                            52320 non-null  int64  
 9   basementsqft                  47 non-null     float64
 10  bathroomcnt                   52320 non-null  float64
 11  bedroomcnt                    52320 non-null  float64
 12  buildingqualitytypeid         33655 non-null  float64
 13  c

In [7]:
# Drop the 'id' column(s) pulled in by SELECT *; property_id (aliased in the
# query) already carries properties_2017.id.
# Columns are selected by name rather than by position 60 so that re-running
# this cell is a no-op instead of silently dropping whatever column now sits
# at that position. Names of the form 'id.<n>' are included because read_csv
# renames repeated headers that way when the data comes from the CSV copy.
surrogate_id_cols = {
    col for col in df.columns
    if col == 'id' or str(col).startswith('id.')
}
df = df.loc[:, ~df.columns.isin(surrogate_id_cols)]

Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)



In [8]:
# examine row by row basics:

def nulls_by_row(df):
    '''
    Summarize how many rows share each level of missingness.

    parameters:
        df: a Pandas DataFrame of observations (rows) and attributes (columns)

    returns: a Pandas DataFrame with one row per distinct missing-value count and
        the columns
            num_cols_missing     - number of null columns in a row
            percent_cols_missing - that count as a percent of all columns
            num_rows             - how many rows of df have exactly that count
    '''
    num_missing = df.isnull().sum(axis=1)
    pct_missing = num_missing / df.shape[1] * 100
    per_row = pd.DataFrame(
        {
            'num_cols_missing': num_missing,
            'percent_cols_missing': pct_missing
        }
    )
    # size() counts the rows in each group directly, so the result does not
    # depend on what the index column happens to be called after reset_index().
    rows_missing = (
        per_row
        .groupby(['num_cols_missing', 'percent_cols_missing'])
        .size()
        .reset_index(name='num_rows')
    )
    return rows_missing

In [9]:
def nulls_by_col(df):
    '''
    Report missing values per attribute.

    parameters:
        df: a Pandas DataFrame of observations (rows) and attributes (columns)

    returns: a Pandas DataFrame indexed by the column names of df with
        num_rows_missing     - count of null values in that column
        percent_rows_missing - that count as a percent of the rows in df
    '''
    null_counts = df.isna().sum()
    total_rows = len(df)
    return pd.DataFrame({
        'num_rows_missing': null_counts,
        'percent_rows_missing': null_counts.div(total_rows).mul(100),
    })

In [10]:
def summarize(df):
    '''
    Print a plain-text overview of a DataFrame.

    Sections, in order: head (as a markdown table), info, describe, nulls by
    column (nulls_by_col), nulls by row (nulls_by_row), then value_counts for
    every column. Object-dtype columns are counted by value; all other columns
    are counted in 10 equal-width bins.

    parameters:
        df: the Pandas DataFrame to summarize

    returns: None (everything is printed)
    '''
    print('DataFrame head:\n')
    print(df.head().to_markdown())
    print('-----')
    print('DataFrame info:\n')
    # info() prints its own report and returns None; wrapping it in print()
    # added a stray "None" line to the output.
    df.info()
    print('---')
    print('DataFrame describe:\n')
    print(df.describe())
    print('---')
    print('DataFrame null value assessment:\n')
    print('Nulls By Column:', nulls_by_col(df))
    print('----')
    print('Nulls By Row:', nulls_by_row(df))
    categorical_cols = set(df.select_dtypes(include='object').columns)
    print('value_counts: \n')
    for col in df.columns:
        print(f'Column Names: {col}')
        column = df[col]
        if col in categorical_cols:
            print(column.value_counts())
        elif column.notna().any():
            print(column.value_counts(bins=10, sort=False, dropna=False))
        else:
            # An all-null column has no finite min/max to build bins from,
            # so just report the null count.
            print(column.value_counts(dropna=False))
        print('---')
    print('Report Finished')
    return

In [11]:
summarize(df)

DaataFrame head:

|    |   typeconstructiontypeid |   storytypeid |   heatingorsystemtypeid | buildingclasstypeid   |   architecturalstyletypeid |   airconditioningtypeid |   parcelid |   propertylandusetypeid |   basementsqft |   bathroomcnt |   bedroomcnt |   buildingqualitytypeid |   calculatedbathnbr |   decktypeid |   finishedfloor1squarefeet |   calculatedfinishedsquarefeet |   finishedsquarefeet12 | finishedsquarefeet13   | finishedsquarefeet15   |   finishedsquarefeet50 |   finishedsquarefeet6 |   fips |   fireplacecnt |   fullbathcnt |   garagecarcnt |   garagetotalsqft |   hashottuborspa |    latitude |    longitude |   lotsizesquarefeet |   poolcnt |   poolsizesum |   pooltypeid10 |   pooltypeid2 |   pooltypeid7 |   propertycountylandusecode | propertyzoningdesc   |   rawcensustractandblock |   regionidcity |   regionidcounty |   regionidneighborhood |   regionidzip |   roomcnt |   threequarterbathnbr |   unitcnt |   yardbuildingsqft17 |   yardbuildingsqft26 |   yearbuilt | 

### Takeaways 
* 

Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an attribute name, the first column is the number of rows with missing values for that attribute, and the second column is the percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [12]:
cols_missing = nulls_by_col(df)

In [13]:
print(cols_missing)

                          num_rows_missing  percent_rows_missing
typeconstructiontypeid               52244             99.854740
storytypeid                          52273             99.910168
heatingorsystemtypeid                18470             35.301988
buildingclasstypeid                  52320            100.000000
architecturalstyletypeid             52250             99.866208
...                                    ...                   ...
buildingclassdesc                    52320            100.000000
heatingorsystemdesc                  18470             35.301988
storydesc                            52273             99.910168
typeconstructiondesc                 52244             99.854740
property_id                              0              0.000000

[68 rows x 2 columns]


### Takeaways 
* Can get rid of the id columns that were used for the joins: typeconstructiontypeid, storytypeid, etc.
* anything below 95% should be dropped
* actually after some research we are going to go with 64%
* want to investigate garagecarcnt, heatingorsystemdesc, airconditioningtypeid, airconditioningdesc


# Prepare

Remove any properties that are likely to be something other than single unit properties. (e.g. no duplexes, no land/lot, ...). There are multiple ways to estimate that a property is a single unit, and there is not a single "right" answer.



#### Takeaways 
* There do not appear to be any of those properties in the dataframe
* pulled in only single family residential or inferred single family residential

### Create a function that will drop rows or columns based on the percent of values that are missing: 
`handle_missing_values(df, prop_required_column, prop_required_row)`

> The input:
* A dataframe
* A number between 0 and 1 that represents the proportion, for each column, of rows with non-missing values required to keep the column.   
   - i.e. if prop_required_column = .6, then you are requiring a column to have at least 60% of values not-NA (no more than 40% missing).
* A number between 0 and 1 that represents the proportion, for each row, of columns/variables with non-missing values required to keep the row.   
   - For example, if prop_required_row = .75, then you are requiring a row to have at least 75% of variables with a non-missing value (no more than 25% missing).
> The output:
* The dataframe with the columns and rows dropped as indicated. 
   - Be sure to drop the columns prior to the rows in your function.
   - hint: Look up the dropna documentation.
> You will want to compute a threshold from your input values (prop_required) and total number of rows or columns.


In [77]:
dfb = df.copy(deep=True)

In [85]:
def handle_missing_values(df, prop_required_columns=0.60, prop_required_row=0.75):
    '''
    Drop sparse columns, then sparse rows, based on required non-null proportions.

    parameters:
        df: the Pandas DataFrame to filter (a filtered copy is returned)
        prop_required_columns: proportion (0 to 1) of rows that must be non-null
            for a column to be kept
        prop_required_row: proportion (0 to 1) of the remaining columns that
            must be non-null for a row to be kept

    returns: the DataFrame with the failing columns and rows removed
    '''
    # Columns go first so the row threshold is computed against the columns
    # that actually survive.
    min_rows_present = int(round(prop_required_columns * df.shape[0], 0))
    column_filtered = df.dropna(axis='columns', thresh=min_rows_present)

    min_cols_present = int(round(prop_required_row * column_filtered.shape[1], 0))
    return column_filtered.dropna(axis='index', thresh=min_cols_present)

In [86]:
df = handle_missing_values(df)

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52242 entries, 0 to 52440
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   heatingorsystemtypeid         33850 non-null  float64
 1   parcelid                      52242 non-null  int64  
 2   propertylandusetypeid         52242 non-null  float64
 3   bathroomcnt                   52242 non-null  float64
 4   bedroomcnt                    52242 non-null  float64
 5   buildingqualitytypeid         33654 non-null  float64
 6   calculatedbathnbr             52181 non-null  float64
 7   calculatedfinishedsquarefeet  52234 non-null  float64
 8   finishedsquarefeet12          52070 non-null  float64
 9   fips                          52242 non-null  float64
 10  fullbathcnt                   52181 non-null  float64
 11  latitude                      52242 non-null  float64
 12  longitude                     52242 non-null  float64
 13  l

Encapsulate your work inside of functions in a wrangle_zillow.py module.