In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env

from sklearn.model_selection import train_test_split

In [2]:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

In [3]:
sql = '''
select parcelid, calculatedfinishedsquarefeet as square_feet, bedroomcnt as bedrooms, bathroomcnt as bathrooms, taxamount as taxes, taxvaluedollarcnt as tax_value, yearbuilt, regionidcounty as county, lotsizesquarefeet as lot_size
from properties_2017
join predictions_2017 using(parcelid)
where transactiondate between "2017-05-01" and "2017-06-30"
and unitcnt = 1;
'''

In [4]:
df = pd.read_sql(sql, get_connection('zillow'))

In [5]:
df.head()

Unnamed: 0,parcelid,square_feet,bedrooms,bathrooms,taxes,tax_value,yearbuilt,county,lot_size
0,11289917,1458.0,3.0,2.0,2319.9,136104.0,1970.0,3101.0,8284.0
1,11705026,1421.0,2.0,1.0,543.69,35606.0,1911.0,3101.0,6707.0
2,11389003,1650.0,3.0,2.0,7673.19,614000.0,1949.0,3101.0,7300.0
3,11967869,693.0,2.0,1.0,3267.47,274237.0,1921.0,3101.0,2908.0
4,12035176,812.0,1.0,1.0,2926.19,245906.0,1966.0,3101.0,93226.0


In [6]:
df.shape

(13309, 9)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13309 entries, 0 to 13308
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   parcelid     13309 non-null  int64  
 1   square_feet  13308 non-null  float64
 2   bedrooms     13309 non-null  float64
 3   bathrooms    13309 non-null  float64
 4   taxes        13308 non-null  float64
 5   tax_value    13309 non-null  float64
 6   yearbuilt    13303 non-null  float64
 7   county       13309 non-null  float64
 8   lot_size     12976 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 935.9 KB


In [9]:
print(df.isnull().sum())

parcelid         0
square_feet      1
bedrooms         0
bathrooms        0
taxes            1
tax_value        0
yearbuilt        6
county           0
lot_size       333
dtype: int64


In [10]:
print(df.columns[df.isnull().any()])

Index(['square_feet', 'taxes', 'yearbuilt', 'lot_size'], dtype='object')


In [8]:
df.to_csv("zillow.csv") 

In [11]:
df = df.dropna()

In [12]:
df.shape

(12968, 9)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12968 entries, 0 to 13308
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   parcelid     12968 non-null  int64  
 1   square_feet  12968 non-null  float64
 2   bedrooms     12968 non-null  float64
 3   bathrooms    12968 non-null  float64
 4   taxes        12968 non-null  float64
 5   tax_value    12968 non-null  float64
 6   yearbuilt    12968 non-null  float64
 7   county       12968 non-null  float64
 8   lot_size     12968 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 1013.1 KB


In [14]:
print(df.isnull().sum())

parcelid       0
square_feet    0
bedrooms       0
bathrooms      0
taxes          0
tax_value      0
yearbuilt      0
county         0
lot_size       0
dtype: int64


In [15]:
df.set_index("parcelid")

Unnamed: 0_level_0,square_feet,bedrooms,bathrooms,taxes,tax_value,yearbuilt,county,lot_size
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11289917,1458.0,3.0,2.0,2319.90,136104.0,1970.0,3101.0,8284.0
11705026,1421.0,2.0,1.0,543.69,35606.0,1911.0,3101.0,6707.0
11389003,1650.0,3.0,2.0,7673.19,614000.0,1949.0,3101.0,7300.0
11967869,693.0,2.0,1.0,3267.47,274237.0,1921.0,3101.0,2908.0
12035176,812.0,1.0,1.0,2926.19,245906.0,1966.0,3101.0,93226.0
...,...,...,...,...,...,...,...,...
12832732,1740.0,3.0,3.0,5192.45,436000.0,1978.0,3101.0,1733.0
12945108,1536.0,3.0,2.0,3519.78,297097.0,1955.0,3101.0,8818.0
11464823,2305.0,4.0,3.0,6996.21,579047.0,1949.0,3101.0,5742.0
11534364,1997.0,3.0,2.0,7587.79,630933.0,1966.0,3101.0,73857.0


In [16]:
def wrangle_zillow():
    data = pd.read_csv("zillow.csv")
    
    data = data.set_index("parcelid")
    
    data = data.dropna()
    # remove all NaN values
    
    return data

In [17]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name
    """
    
    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=319)
        train, validate = train_test_split(train, test_size=.3, random_state=319)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=319, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=319, stratify=train[stratify_by])
    
    return train, validate, test

In [18]:
df2 = wrangle_zillow()

In [19]:
df.head()

Unnamed: 0,parcelid,square_feet,bedrooms,bathrooms,taxes,tax_value,yearbuilt,county,lot_size
0,11289917,1458.0,3.0,2.0,2319.9,136104.0,1970.0,3101.0,8284.0
1,11705026,1421.0,2.0,1.0,543.69,35606.0,1911.0,3101.0,6707.0
2,11389003,1650.0,3.0,2.0,7673.19,614000.0,1949.0,3101.0,7300.0
3,11967869,693.0,2.0,1.0,3267.47,274237.0,1921.0,3101.0,2908.0
4,12035176,812.0,1.0,1.0,2926.19,245906.0,1966.0,3101.0,93226.0


In [21]:
df2.shape

(12968, 9)

In [22]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12968 entries, 11289917 to 12942897
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   12968 non-null  int64  
 1   square_feet  12968 non-null  float64
 2   bedrooms     12968 non-null  float64
 3   bathrooms    12968 non-null  float64
 4   taxes        12968 non-null  float64
 5   tax_value    12968 non-null  float64
 6   yearbuilt    12968 non-null  float64
 7   county       12968 non-null  float64
 8   lot_size     12968 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 1013.1 KB


In [23]:
df2.describe()

Unnamed: 0.1,Unnamed: 0,square_feet,bedrooms,bathrooms,taxes,tax_value,yearbuilt,county,lot_size
count,12968.0,12968.0,12968.0,12968.0,12968.0,12968.0,12968.0,12968.0,12968.0
mean,6649.067474,1716.235734,2.973782,2.24599,6072.324209,483162.6,1964.021823,3100.72008,37298.06
std,3839.958292,969.886945,1.009826,0.99607,8367.906469,714350.7,23.58551,22.539194,109163.0
min,0.0,242.0,0.0,0.0,120.84,10504.0,1878.0,1286.0,736.0
25%,3321.75,1144.0,2.0,2.0,2600.9025,186379.5,1950.0,3101.0,6000.0
50%,6653.5,1483.5,3.0,2.0,4303.09,330480.0,1961.0,3101.0,7620.0
75%,9967.25,1995.25,4.0,3.0,6774.1675,543744.8,1983.0,3101.0,17324.25
max,13308.0,35640.0,11.0,11.0,276797.83,23858370.0,2015.0,3101.0,3589145.0


In [24]:
train, validate, test = split(df)

In [25]:
train.shape

(7261, 9)

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7261 entries, 6196 to 11571
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   parcelid     7261 non-null   int64  
 1   square_feet  7261 non-null   float64
 2   bedrooms     7261 non-null   float64
 3   bathrooms    7261 non-null   float64
 4   taxes        7261 non-null   float64
 5   tax_value    7261 non-null   float64
 6   yearbuilt    7261 non-null   float64
 7   county       7261 non-null   float64
 8   lot_size     7261 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 567.3 KB


In [28]:
validate.shape

(3113, 9)

In [29]:
test.shape

(2594, 9)