In [1]:
# My created .py files for modularization
import env
import os
import wrangle as w
# Ignore Warning
import warnings
warnings.filterwarnings("ignore")
# Array and Dataframes
import numpy as np
import pandas as pd
# Imputer
from sklearn.impute import SimpleImputer
# Evaluation: Visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# Evaluation: Statistical Analysis
from scipy import stats
# Modeling
from sklearn.model_selection import GridSearchCV
# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Remove pandas' cap on displayed columns so wide frames render every column
pd.options.display.max_columns = None

In [3]:
# Load the Zillow data through the project-local wrangle module (imported as `w`).
# NOTE(review): w.zillow() is defined outside this notebook; presumably it handles
# acquisition and any caching -- confirm there. `zillow` is the frame explored below.
zillow = w.zillow()

## Data Previews & Insight

In [4]:
# Preview five randomly chosen rows. random_state pins the draw so the same rows
# are shown every time the notebook is re-run from a fresh kernel.
zillow.sample(5, random_state=42)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,propertylandusetypeid
898480,3.0,1.0,1067.0,295000.0,1941.0,3859.68,6037.0,261.0
1606503,4.0,3.0,2883.0,551000.0,1987.0,7730.2,6037.0,261.0
1322480,3.0,1.5,1010.0,262570.0,1957.0,3149.86,6059.0,261.0
1294748,3.0,1.0,1138.0,338600.0,1925.0,4885.49,6037.0,261.0
157555,3.0,2.0,1137.0,335288.0,1955.0,4091.45,6037.0,261.0


In [5]:
# Per-column overview from the project-local wrangle helper: number of unique
# values, the unique values themselves, null count, and dtype
w.summarize(zillow)

Unnamed: 0,Column Name,Number of Unique Values,Unique Values,Number of Null Values,dtype
7,propertylandusetypeid,1,[261.0],0,float64
6,fips,3,"[6037.0, 6059.0, 6111.0]",0,float64
0,bedroomcnt,19,"[0.0, 4.0, 3.0, 5.0, 2.0, 1.0, 6.0, 7.0, 8.0, ...",11,float64
1,bathroomcnt,38,"[0.0, 2.0, 4.0, 1.0, 2.5, 3.5, 3.0, 5.5, 4.5, ...",11,float64
4,yearbuilt,153,"[nan, 2005.0, 2011.0, 1926.0, 1972.0, 1973.0, ...",9337,float64
2,calculatedfinishedsquarefeet,10580,"[nan, 3633.0, 1620.0, 2077.0, 1200.0, 171.0, 2...",8484,float64
3,taxvaluedollarcnt,592269,"[27516.0, 10.0, 2108.0, 296425.0, 124.0, 84777...",493,float64
5,taxamount,918838,"[nan, 174.21, 6941.39, 10244.94, 7924.68, 8034...",4442,float64


In [7]:
# Structural overview of the frame: index range, column names, dtypes, memory usage
zillow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 8 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
 7   propertylandusetypeid         float64
dtypes: float64(8)
memory usage: 131.4 MB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column;
# the min/max rows are a quick way to spot implausible values
zillow.describe()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,propertylandusetypeid
count,2152852.0,2152852.0,2144379.0,2152370.0,2143526.0,2148421.0,2152863.0,2152863.0
mean,3.287196,2.230688,1862.855,461896.2,1960.95,5634.866,6048.377,261.0
std,0.9547544,0.9992796,1222.125,699676.0,22.1622,8178.91,20.43329,0.0
min,0.0,0.0,1.0,1.0,1801.0,1.85,6037.0,261.0
25%,3.0,2.0,1257.0,188170.2,1949.0,2534.98,6037.0,261.0
50%,3.0,2.0,1623.0,327671.0,1958.0,4108.95,6037.0,261.0
75%,4.0,3.0,2208.0,534527.0,1976.0,6414.32,6059.0,261.0
max,25.0,32.0,952576.0,98428910.0,2016.0,1337756.0,6111.0,261.0


In [26]:
# Inspect homes whose yearbuilt falls in 1800-1900 (both ends inclusive) to gauge
# how many unusually old properties the data contains
zillow.loc[zillow["yearbuilt"].between(1800, 1900)]

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,propertylandusetypeid
207,3.0,1.0,1319.0,114727.0,1890.0,1270.80,6111.0,261.0
3523,2.0,1.0,782.0,115971.0,1895.0,,6037.0,261.0
3549,2.0,1.0,840.0,393000.0,1885.0,4850.66,6037.0,261.0
3551,3.0,1.0,1674.0,418000.0,1887.0,5197.58,6037.0,261.0
3917,4.0,2.0,1488.0,226336.0,1900.0,2757.69,6037.0,261.0
...,...,...,...,...,...,...,...,...
2147922,2.0,2.0,1276.0,85584.0,1894.0,1459.52,6037.0,261.0
2148015,2.0,1.0,1270.0,284667.0,1895.0,3486.72,6037.0,261.0
2148041,3.0,2.0,1134.0,28737.0,1900.0,378.50,6037.0,261.0
2148840,3.0,1.0,1469.0,147329.0,1885.0,1974.96,6037.0,261.0


In [29]:
# Separate out the rows that have no bedroom count. Filtering builds new frames
# rather than dropping in place, so `zillow` stays intact and this cell can be
# re-run safely. (Calling .drop() with no labels raises ValueError, and an
# in-place drop on a filtered selection would never reach `zillow` anyway.)
missing_bedrooms = zillow.bedroomcnt.isnull()
zillow_with_bedrooms = zillow[~missing_bedrooms]

# Show the excluded rows for inspection
zillow[missing_bedrooms]

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,propertylandusetypeid
107763,,,,67366.0,1926.0,780.54,6059.0,261.0
118612,,,,43992.0,1946.0,541.64,6059.0,261.0
193993,,,1348.0,840698.0,1952.0,,6059.0,261.0
1141339,,,200.0,188972.0,,,6037.0,261.0
1324608,,,990.0,435000.0,1906.0,,6037.0,261.0
1442975,,,,273196.0,,,6037.0,261.0
1647346,,,400.0,28347.0,1954.0,,6037.0,261.0
1701026,,,,407930.0,1926.0,,6037.0,261.0
1722707,,,,477161.0,,,6037.0,261.0
1776422,,,,38855.0,,,6037.0,261.0
