In [1]:
# Math 
from math import sqrt
from scipy import stats
import statistics
import os

# General
import numpy as np
import pandas as pd
from pydataset import data

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer

# Sklearn Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score


# Sklearn Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.cluster import KMeans


# Visuals
import matplotlib.pyplot as plt
import seaborn as sns
from graphviz import Graph
from tabulate import tabulate
from sklearn.tree import export_graphviz

# Custom modules
from wrangle import wrangle_zillow
import functions

# Suppress every warning category for the rest of the kernel session so cell
# output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings from pandas/sklearn; consider narrowing it to the specific
# category that is noisy once that is known.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Display configuration.
#
# Show every column when a wide frame is rendered, but keep pandas' default
# row limit so a bare DataFrame never dumps tens of thousands of rows.
#
# Each option is handled by its own call on purpose: `pd.reset_option` accepts
# a single option pattern. Passing a second option name positionally does not
# reset it (older pandas treats it as the `silent` flag; versions without that
# parameter reject the extra argument), so `display.max_columns` would
# silently stay unlimited while only `display.max_rows` was reset.
pd.set_option("display.max_columns", None)
pd.reset_option("display.max_rows")

In [3]:
# Unpack the three objects returned by wrangle_zillow for the 'intial_explore'
# mode (train / validate / test splits, going by the variable names).
# NOTE(review): 'intial_explore' looks like a misspelling of 'initial_explore'.
# The string must match whatever wrangle.py compares against, so change both
# together or not at all.
# NOTE(review): 1.5 is an unexplained magic number, presumably an outlier
# (IQR) multiplier; confirm in wrangle.py and name it in a config cell.
train_i, validate_i, test_i = wrangle_zillow('intial_explore', 1.5)

In [4]:
# Print column names, dtypes, non-null counts and memory usage of train_i.
train_i.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29182 entries, 47145 to 43370
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   bath          29182 non-null  float64
 1   bed           29182 non-null  float64
 2   area          29182 non-null  float64
 3   lat           29182 non-null  float64
 4   long          29182 non-null  float64
 5   lot_size      29182 non-null  float64
 6   year          29182 non-null  float64
 7   tax_value     29182 non-null  float64
 8   tax_amount    29182 non-null  float64
 9   logerror      29182 non-null  float64
 10  heating_type  29182 non-null  object 
 11  county        29182 non-null  object 
dtypes: float64(10), object(2)
memory usage: 2.9+ MB


In [5]:
# Unpack the three objects returned by wrangle_zillow for the 'explore' mode
# (train / validate / test splits, going by the variable names).
# NOTE(review): 1.5 is an unexplained magic number, presumably an outlier
# (IQR) multiplier; confirm in wrangle.py.
train_e, validate_e, test_e = wrangle_zillow('explore', 1.5)
# Print column names, dtypes, non-null counts and memory usage of train_e.
# In the recorded run it has 22,404 rows versus 29,182 for train_i, so this
# mode presumably applies extra filtering; verify in wrangle.py.
train_e.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22404 entries, 67375 to 55455
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   bath          22404 non-null  float64
 1   bed           22404 non-null  float64
 2   area          22404 non-null  float64
 3   lat           22404 non-null  float64
 4   long          22404 non-null  float64
 5   lot_size      22404 non-null  float64
 6   year          22404 non-null  float64
 7   tax_value     22404 non-null  float64
 8   tax_amount    22404 non-null  float64
 9   logerror      22404 non-null  float64
 10  heating_type  22404 non-null  object 
 11  county        22404 non-null  object 
dtypes: float64(10), object(2)
memory usage: 2.2+ MB


In [6]:
# Unpack the three objects returned by wrangle_zillow for the 'cluster' mode
# (train / validate / test splits, going by the variable names). In the
# recorded output this mode carries additional *_scaled columns alongside the
# original features.
# NOTE(review): 1.5 is an unexplained magic number, presumably an outlier
# (IQR) multiplier; confirm in wrangle.py.
train_c, validate_c, test_c = wrangle_zillow('cluster', 1.5)

In [7]:
# Print column names, dtypes, non-null counts and memory usage of train_c.
train_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22404 entries, 67375 to 55455
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bath               22404 non-null  float64
 1   bed                22404 non-null  float64
 2   area               22404 non-null  float64
 3   lat                22404 non-null  float64
 4   long               22404 non-null  float64
 5   lot_size           22404 non-null  float64
 6   year               22404 non-null  float64
 7   tax_value          22404 non-null  float64
 8   tax_amount         22404 non-null  float64
 9   logerror           22404 non-null  float64
 10  heating_type       22404 non-null  object 
 11  county             22404 non-null  object 
 12  bath_scaled        22404 non-null  float64
 13  bed_scaled         22404 non-null  float64
 14  area_scaled        22404 non-null  float64
 15  lot_size_scaled    22404 non-null  float64
 16  year_scaled       

In [9]:
# Preview the first five rows of train_c, raw features next to their
# *_scaled counterparts.
train_c.head()

Unnamed: 0,bath,bed,area,lat,long,lot_size,year,tax_value,tax_amount,logerror,heating_type,county,bath_scaled,bed_scaled,area_scaled,lot_size_scaled,year_scaled,tax_amount_scaled,tax_value_scaled
67375,2.0,3.0,1791.0,34713217.0,-118157507.0,5749.0,2002.0,213201.0,3402.19,0.046593,Central,Los Angeles,0.333333,0.333333,0.481068,0.396864,0.882353,0.271661,0.2047
19846,2.0,5.0,2263.0,34019402.0,-118099292.0,7299.0,1953.0,98298.0,1883.93,0.073582,Floor/Wall,Los Angeles,0.333333,1.0,0.619607,0.535753,0.401961,0.148651,0.08938
75163,1.0,2.0,1031.0,34188645.0,-118407498.0,5500.0,1942.0,58145.0,812.76,-0.002468,Floor/Wall,Los Angeles,0.0,0.0,0.257998,0.374552,0.294118,0.061865,0.049081
69280,2.0,4.0,1263.0,33761335.0,-118017620.0,7668.0,1958.0,51243.0,1090.98,-0.109975,Central,Orange,0.333333,0.666667,0.326093,0.568817,0.45098,0.084407,0.042154
65321,1.5,3.0,1099.0,33626482.0,-117667989.0,9366.0,1972.0,566927.0,5860.22,0.029126,Central,Orange,0.166667,0.333333,0.277957,0.720968,0.588235,0.47081,0.559711
