In [26]:
# Math 
from math import sqrt
from scipy import stats
import statistics
import os

# General
import numpy as np
import pandas as pd

# Sklearn Modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import MinMaxScaler
import sklearn.preprocessing

# Visuals
import matplotlib.pyplot as plt
import seaborn as sns
from graphviz import Graph
from tabulate import tabulate

# Custom modules

from wrangle import wrangle_zillow_modeling, wrangle_zillow_exploration
import explore
import acquire

#import evaluate
import warnings
warnings.filterwarnings("ignore")

In [27]:
# Load the working dataframe through the project-local `acquire` module.
# NOTE(review): the data source/query behind acquire_zillow_second() is not
# visible in this notebook — confirm against acquire.py.
df = acquire.acquire_zillow_second()

In [28]:
# Inspect dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28124 entries, 0 to 28123
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area       28077 non-null  float64
 1   bedrooms   28124 non-null  float64
 2   bathrooms  28124 non-null  float64
 3   tax_value  28123 non-null  float64
 4   has_pool   6130 non-null   float64
 5   year       28053 non-null  float64
dtypes: float64(6)
memory usage: 1.5 MB


In [29]:
# Replace missing has_pool values with 0.
# NOTE(review): presumably a missing value means "no pool" — confirm with the data source.
df['has_pool'] = df['has_pool'].fillna(0)

In [30]:
# Count rows per has_pool value after the fill.
df['has_pool'].value_counts()

0.0    21994
1.0     6130
Name: has_pool, dtype: int64

In [31]:
# Treat empty / whitespace-only strings as missing, then drop every row that
# still has a missing value in any column.
df = (
    df
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna()
)

In [32]:
# Confirm no column has missing values left (every count should be 0).
df.isna().sum()

area         0
bedrooms     0
bathrooms    0
tax_value    0
has_pool     0
year         0
dtype: int64

In [41]:
# A bare `len(df)` ahead of another statement is evaluated and discarded
# (only a cell's last expression is displayed), so it is dropped here;
# `df.info()` already reports the number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24319 entries, 0 to 28123
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area       24319 non-null  float64
 1   bedrooms   24319 non-null  float64
 2   bathrooms  24319 non-null  float64
 3   tax_value  24319 non-null  float64
 4   has_pool   24319 non-null  float64
 5   year       24319 non-null  float64
dtypes: float64(6)
memory usage: 1.3 MB


In [34]:
def remove_outliers(df, k, col_list):
    '''Drop rows that fall outside the IQR fences of the given columns.

    For each column in ``col_list`` the fences are ``q1 - k * iqr`` and
    ``q3 + k * iqr``, where ``q1``/``q3`` are the 25th/75th percentiles and
    ``iqr = q3 - q1``. Rows whose value lies strictly beyond a fence are
    removed; values exactly on a fence are kept. Keeping the fence values
    matters when a column has ``iqr == 0`` (e.g. a mostly-constant flag):
    both fences then equal ``q1`` and a strict comparison would discard
    every row.

    Columns are processed in the order given and each filter is applied to
    the rows that survived the previous ones, so the quartiles of later
    columns are computed on already-filtered data.

    Rows holding NaN in a listed column are dropped as well, because
    comparisons against NaN evaluate to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter. It is not modified in place.
    k : float
        Multiplier applied to the IQR (1.5 is the conventional choice).
    col_list : iterable of str
        Names of the columns to filter on.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are within the fences of every listed
        column, with the original index labels preserved.
    '''
    for col in col_list:
        q1, q3 = df[col].quantile([.25, .75])  # quartiles of the current rows
        iqr = q3 - q1

        lower_bound = q1 - k * iqr
        upper_bound = q3 + k * iqr

        # Inclusive bounds: a value sitting on a fence is not an outlier.
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

    return df

In [35]:
# Filter outliers column by column using the conventional 1.5 * IQR rule.
df = remove_outliers(
    df,
    k=1.5,
    col_list=['bedrooms', 'bathrooms', 'area', 'tax_value', 'year'],
)

In [36]:
# Number of rows remaining after outlier removal.
df.shape[0]

24319

In [37]:
def add_scaled_columns(train, validate, test, scaler, columns_to_scale):
    '''Append ``<column>_scaled`` versions of the given columns to each split.

    The scaler is fitted on ``train`` only and then used to transform all
    three splits, so no information from ``validate`` or ``test`` influences
    the scaling parameters.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits; each must contain every column in
        ``columns_to_scale``. They are not modified in place.
    scaler : object
        Anything exposing ``fit(X)`` and ``transform(X)``.
    columns_to_scale : list of str
        Names of the columns to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``, each with the original columns followed
        by the new ``*_scaled`` columns, keeping the original index.
    '''
    scaled_names = [name + '_scaled' for name in columns_to_scale]

    def with_scaled(frame):
        # Build the scaled columns on the same index so concat aligns row-for-row.
        scaled_part = pd.DataFrame(
            scaler.transform(frame[columns_to_scale]),
            columns=scaled_names,
            index=frame.index,
        )
        return pd.concat([frame, scaled_part], axis=1)

    # Learn the scaling parameters from the training split only.
    scaler.fit(train[columns_to_scale])

    return with_scaled(train), with_scaled(validate), with_scaled(test)

In [38]:
# Columns that receive a `_scaled` counterpart.
columns_to_scale = ['bedrooms', 'bathrooms', 'area', 'has_pool', 'year']

# Hold out 20% as test, then 30% of the remainder as validate
# (overall roughly 56% train / 24% validate / 20% test). Fixed seed for reproducibility.
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

# Scale with a scaler fitted on the training split only.
train, validate, test = add_scaled_columns(
    train,
    validate,
    test,
    scaler=MinMaxScaler(),
    columns_to_scale=columns_to_scale,
)

In [42]:
# Preview the first rows of the training split, including the new *_scaled columns.
train.head(5)

Unnamed: 0,area,bedrooms,bathrooms,tax_value,has_pool,year,bedrooms_scaled,bathrooms_scaled,area_scaled,has_pool_scaled,year_scaled
21748,2300.0,4.0,2.0,440420.0,0.0,1968.0,0.666667,0.333333,0.580729,0.0,0.54717
11733,1146.0,2.0,1.0,353000.0,0.0,1929.0,0.0,0.0,0.205078,0.0,0.179245
13684,1429.0,2.0,2.0,600515.0,0.0,1938.0,0.0,0.333333,0.297201,0.0,0.264151
2829,1216.0,2.0,2.0,228120.0,0.0,1951.0,0.0,0.333333,0.227865,0.0,0.386792
17299,2572.0,4.0,3.0,557044.0,0.0,2000.0,0.666667,0.666667,0.669271,0.0,0.849057
