## Scratchpad

In [1]:
import env
import zillow_wrangle as zw
import pandas as pd
import numpy as np
import os

In [2]:
def get_connection(db, user=env.username, host=env.hostname, password=env.password):
    connection_info = f'mysql+pymysql://{user}:{password}@{host}/{db}'
    return connection_info

In [3]:
def get_zillow_data():
    '''
    Function to retrieve the 2017 Zillow Property Data Set from CODEUP's mySQL Server
    '''
    if os.path.isfile('2017_zillow_hot_month_properties.csv'):
        df = pd.read_csv('2017_zillow_hot_month_properties.csv', index_col=0)  # If csv file exists read in data from csv file.
    else:
        sql = '''
                SELECT bedroomcnt, bathroomcnt, 
                    calculatedfinishedsquarefeet, 
                    taxvaluedollarcnt, yearbuilt, 
                    taxamount, fips, regionidzip 
                FROM properties_2017
                JOIN predictions_2017 USING(id)
                WHERE propertylandusetypeid = 261
                    AND transactiondate BETWEEN '2017-05-01' AND '2017-09-01';
                '''   # SQL query
                                                    
        db = 'zillow'                                   # Database name
        df = pd.read_sql(sql, get_connection(db))       # Pandas DataFrame
        df.to_csv('2017_zillow_hot_month_properties.csv')         # Cache Data
    return df

In [4]:
def get_zipcode_data():
    '''
    Function to retrieve the 2016 Zillow average zipcode Data from CODEUP's mySQL Server
    '''
    if os.path.isfile('2016_zillow_zipcodes.csv'):
        df = pd.read_csv('2016_zillow_zipcodes.csv', index_col=0)  # If csv file exists read in data from csv file.
    else:
        sql = '''
                SELECT COUNT(regionidzip) AS zipcode_count, 
                    regionidzip AS zipcode, 
                    ROUND(AVG(taxvaluedollarcnt),0) AS zipcode_avg_price
                FROM properties_2016
                WHERE propertylandusetypeid = 261
                GROUP BY regionidzip
                ORDER BY AVG(taxvaluedollarcnt) DESC;'''   # SQL query
                                                    
        db = 'zillow'                                   # Database name
        df = pd.read_sql(sql, get_connection(db))       # Pandas DataFrame
        df.to_csv('2016_zillow_zipcodes.csv')         # Cache Data
    return df

In [5]:
zillow = get_zillow_data()
zillow.head()

  mask |= (ar1 == a)


Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


In [6]:
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
dtypes: float64(7)
memory usage: 131.4 MB


In [7]:
zillow = zillow.replace(r'^\s*$', np.nan, regex=True)
zillow = zillow.dropna()
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2140235 entries, 4 to 2152862
Data columns (total 7 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
dtypes: float64(7)
memory usage: 130.6 MB


In [8]:
zips = get_zipcode_data()
zips.head()

Unnamed: 0,zipcode_count,zipcode,zipcode_avg_price
0,7103,96086.0,2847596.0
1,2856,96975.0,2768183.0
2,3102,96058.0,2607519.0
3,5625,96116.0,2186823.0
4,7490,96030.0,1822090.0


In [9]:
zips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399 entries, 0 to 398
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   zipcode_count      399 non-null    int64  
 1   zipcode            398 non-null    float64
 2   zipcode_avg_price  399 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 12.5 KB


In [10]:
zips.isnull().sum()

zipcode_count        0
zipcode              1
zipcode_avg_price    0
dtype: int64

In [11]:
zips = zips.replace(r'^\s*$', np.nan, regex=True)

In [12]:
zips.isnull().sum()

zipcode_count        0
zipcode              1
zipcode_avg_price    0
dtype: int64

In [13]:
zips = zips.dropna()
zips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 398
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   zipcode_count      398 non-null    int64  
 1   zipcode            398 non-null    float64
 2   zipcode_avg_price  398 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 12.4 KB


In [14]:
zillow = zillow.rename(columns = {'bedroomcnt': 'bedrooms',
                             'bathroomcnt': 'bathrooms',
                             'calculatedfinishedsquarefeet': 'sqft',
                             'taxvaluedollarcnt': 'tax_value',
                             'taxamount': 'tax_amount',
                             'yearbuilt': 'year_built',
                             'regionidzip': 'zipcode'})

zillow = zillow.replace(r'^\s*$', np.nan, regex=True) # Format nulls
zillow = zillow.dropna()    # drop nulls

# Change bedroom count, year built, calculated finished squarefeet, and fips value type to int
zillow.bedrooms = zillow.bedrooms.astype('int64')
zillow.sqft = zillow.sqft.astype('int64')
zillow.year_built = zillow.year_built.astype('int64')
zillow.fips = zillow.fips.astype('int64')
zillow.zipcode = zillow.zipcode.astype('int64')

AttributeError: 'DataFrame' object has no attribute 'zipcode'

In [None]:
zillow.head()

In [None]:
zips = zips.astype('int64')
zips.head()

In [None]:
# Combining DFs
zillow = zillow.merge(zips, left_on='zipcode', right_on='zipcode', how='outer', indicator=True)

In [None]:
zillow.info()

In [None]:
zillow = zillow.dropna()

In [None]:
zillow.info()

In [None]:
zillow_test = zw.wrangle_zillow()
zillow_test.info()

In [None]:
zillow.head(1)

In [None]:
zillow_test.head(1)