## Modeling

In [39]:
import numpy as np
import pandas as pd
import wrangle
import explore

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Acquire the data through the imported wrangle module
#(what wrangle_zillow does to the raw data is defined there, not in this notebook)
zillow = wrangle.wrangle_zillow()

In [3]:
#Inspect the column names, dtypes and non-null counts before splitting
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67783 entries, 0 to 77380
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   bathroom_count             67783 non-null  float64
 1   quality_type               67783 non-null  float64
 2   home_square_feet           67783 non-null  float64
 3   latitude                   67783 non-null  float64
 4   longitude                  67783 non-null  float64
 5   room_count                 67783 non-null  float64
 6   logerror                   67783 non-null  float64
 7   county                     67783 non-null  object 
 8   age                        67783 non-null  float64
 9   acres                      67783 non-null  float64
 10  tax_rate                   67783 non-null  float64
 11  structure_dollar_per_sqft  67783 non-null  float64
 12  land_dollar_per_sqft       67783 non-null  float64
 13  bath_bed_ratio             67783 non-null  flo

In [4]:
#Split the data into train, validate and test with the wrangle helper,
#then display the (rows, columns) shape of each split
train, validate, test = wrangle.train_validate_test_split(zillow)
train.shape, validate.shape, test.shape

((37958, 14), (16268, 14), (13557, 14))

In [5]:
#Cluster specifications handed to explore.get_clusters
#Each entry gives the name of the cluster column, the k value and the features to cluster on
clusters = [
    dict(name='age_location_cluster', k=6, features=['age', 'latitude', 'longitude']),
    dict(name='size_cluster', k=5, features=['home_square_feet', 'bath_bed_ratio', 'acres']),
    dict(name='value_cluster', k=5, features=['structure_dollar_per_sqft', 'land_dollar_per_sqft']),
]

In [6]:
#Pass the splits and the cluster specs to explore.get_clusters;
#the returned splits carry the cluster columns that are encoded below
train, validate, test = explore.get_clusters(train, validate, test, clusters)

In [7]:
#Display the shapes again to see the columns added by clustering
train.shape, validate.shape, test.shape

((37958, 17), (16268, 17), (13557, 17))

In [8]:
#One-hot encode the categorical columns.
#The int cluster labels are cast to dtype 'object' so get_dummies treats them as categories.
cat_cols = ['county', 'age_location_cluster', 'size_cluster', 'value_cluster']

#Learn the dummy columns from train only (first level of each feature dropped)
train_dummies = pd.get_dummies(train[cat_cols].astype('object'), dummy_na=False, drop_first=True)

def _encode_like_train(split_df, dummy_columns):
    """Return split_df with cat_cols replaced by exactly the dummy columns learned from train."""
    #Encode every level, then align to train's columns: this drops the same
    #reference level train dropped and zero-fills any level missing from the split,
    #so all splits end up with identical columns in identical order
    dummies = pd.get_dummies(split_df[cat_cols].astype('object'), dummy_na=False)
    dummies = dummies.reindex(columns=dummy_columns, fill_value=0)
    return pd.concat([split_df.drop(columns=cat_cols), dummies], axis=1)

#Encode validate and test before train is overwritten, then attach train's own dummies
validate = _encode_like_train(validate, train_dummies.columns)
test = _encode_like_train(test, train_dummies.columns)
train = pd.concat([train.drop(columns=cat_cols), train_dummies], axis=1)

#Every split must expose the same features to the models
assert list(train.columns) == list(validate.columns) == list(test.columns)

In [9]:
#Display the shapes to check that every split has the same number of columns after encoding
train.shape, validate.shape, test.shape

((37958, 28), (16268, 28), (13557, 28))

In [10]:
#List validate's columns and dtypes after encoding
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16268 entries, 43229 to 1245
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   bathroom_count             16268 non-null  float64
 1   quality_type               16268 non-null  float64
 2   home_square_feet           16268 non-null  float64
 3   latitude                   16268 non-null  float64
 4   longitude                  16268 non-null  float64
 5   room_count                 16268 non-null  float64
 6   logerror                   16268 non-null  float64
 7   age                        16268 non-null  float64
 8   acres                      16268 non-null  float64
 9   tax_rate                   16268 non-null  float64
 10  structure_dollar_per_sqft  16268 non-null  float64
 11  land_dollar_per_sqft       16268 non-null  float64
 12  bath_bed_ratio             16268 non-null  float64
 13  county_Orange              16268 non-null  

In [13]:
#Separate the features (X) from the target (y = logerror) for each split
X_train = train.drop(columns = 'logerror')
y_train = train['logerror']

X_validate = validate.drop(columns = 'logerror')
y_validate = validate['logerror']

X_test = test.drop(columns = 'logerror')
y_test = test['logerror']

In [14]:
#Now scale the X groups using the MinMax Scaler
def scale_data(X_train, X_validate, X_test):
    """Scale the three feature sets with a MinMaxScaler fitted on X_train only.

    Returns the transformed train, validate and test features, in that order.
    """
    #Fit on train alone so validate/test are scaled with train's min and max
    fitted_scaler = MinMaxScaler().fit(X_train)

    #Apply the same fitted scaler to every split
    return tuple(fitted_scaler.transform(features) for features in (X_train, X_validate, X_test))

In [15]:
#Fit the scaler on X_train and apply it to all three feature sets
X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(X_train, X_validate, X_test)

In [16]:
#Create the metric dataframe
#It starts empty; make_metric_df returns it with one more row for each model evaluated
metric_df = pd.DataFrame()

In [20]:
def make_metric_df(y, y_pred, model_name, metric_df):
    """Return metric_df with one extra row holding the scores of a model.

    Parameters
    ----------
    y : actual target values
    y_pred : predicted target values
    model_name : label stored in the 'model' column
    metric_df : DataFrame of previously recorded rows; may be empty or None

    Returns
    -------
    A new DataFrame with columns 'model', 'RMSE_validate' and 'r^2_validate'.
    The frame passed in is not modified.

    Note: the 'r^2_validate' column is computed with explained_variance_score.
    """
    #Build the row once instead of repeating the dict in each branch
    new_row = pd.DataFrame(data=[
        {
            'model': model_name,
            'RMSE_validate': mean_squared_error(y, y_pred) ** .5,
            'r^2_validate': explained_variance_score(y, y_pred)
        }])

    #Nothing recorded yet: the new row is the whole table
    if metric_df is None or metric_df.size == 0:
        return new_row

    #DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
    return pd.concat([metric_df, new_row], ignore_index=True)

__Create the Baseline__

The baseline predicts the mean `logerror` of the training split for every observation.

In [17]:
#Get the baseline and print the RMSE
def get_baseline(y_train, y_validate, y_test, metric_df):
    """Add a mean-of-train baseline prediction to each target set and record its validate score.

    Each y argument is wrapped in a DataFrame (its target column is read as
    'logerror') and gains a constant 'baseline_mean_pred' column. The train and
    validate RMSE are printed, and the validate result is added to metric_df
    under the name 'validate_baseline_mean'.

    Returns (y_train, y_validate, y_test, metric_df), the y values being the new DataFrames.
    """
    #Wrap the targets in data frames so prediction columns can sit beside them
    y_train, y_validate, y_test = (
        pd.DataFrame(target) for target in (y_train, y_validate, y_test))

    #The baseline is the mean of the train target, used for every split
    train_mean = y_train.logerror.mean()
    for target_df in (y_train, y_validate, y_test):
        target_df['baseline_mean_pred'] = train_mean

    def _baseline_rmse(target_df):
        return mean_squared_error(target_df.logerror, target_df['baseline_mean_pred']) ** .5

    train_RMSE = _baseline_rmse(y_train)
    validate_RMSE = _baseline_rmse(y_validate)

    #Report both scores
    print("RMSE using Mean\nTrain/In-Sample: ", round(train_RMSE, 2),
        "\nValidate/Out-of-Sample: ", round(validate_RMSE, 2),
        "\n")

    metric_df = make_metric_df(
        y_validate.logerror,
        y_validate['baseline_mean_pred'],
        'validate_baseline_mean',
        metric_df)

    return y_train, y_validate, y_test, metric_df

In [21]:
#Add the baseline predictions; the y variables become data frames from here on,
#and the baseline is recorded in metric_df
y_train, y_validate, y_test, metric_df = get_baseline(y_train, y_validate, y_test, metric_df)

RMSE using Mean
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15 



__Create OLS Model__

In [25]:
def get_ols_model(X_train_scaled, X_validate_scaled, y_train, y_validate, metric_df):
    """Fit an ordinary least squares model and record its validate scores.

    Parameters
    ----------
    X_train_scaled, X_validate_scaled : scaled feature sets
    y_train, y_validate : data frames with a 'logerror' column; an 'lm_preds'
        column is added to both in place
    metric_df : metric table to extend

    Returns
    -------
    (lm, metric_df) : the fitted LinearRegression and the metric table with a
    'validate_ols' row added.
    """
    #Create the model with its defaults: the features arrive already scaled, and the
    #'normalize' keyword used previously was removed from LinearRegression in scikit-learn 1.2
    lm = LinearRegression()

    #Fit the model on scaled data
    lm.fit(X_train_scaled, y_train.logerror)

    #Make predictions (stored on the caller's y data frames)
    y_train['lm_preds'] = lm.predict(X_train_scaled)
    y_validate['lm_preds'] = lm.predict(X_validate_scaled)

    #Calculate the RMSE
    train_RMSE = mean_squared_error(y_train.logerror, y_train['lm_preds']) ** .5
    validate_RMSE = mean_squared_error(y_validate.logerror, y_validate['lm_preds']) ** .5

    print("RMSE using OLS\nTrain/In-Sample: ", round(train_RMSE, 4), 
        "\nValidate/Out-of-Sample: ", round(validate_RMSE, 4))

    metric_df = make_metric_df(y_validate.logerror, y_validate['lm_preds'], 'validate_ols', metric_df)

    return lm, metric_df

In [26]:
#Fit OLS; this also adds an 'lm_preds' column to y_train and y_validate
lm, metric_df = get_ols_model(X_train_scaled, X_validate_scaled, y_train, y_validate, metric_df)

RMSE using OLS
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15


__Create LassoLars Models__

In [28]:
def get_lars_models(X_train_scaled, X_validate_scaled, y_train, y_validate, metric_df, alphas=None):
    """Fit one LassoLars model per alpha value and record each model's validate scores.

    Parameters
    ----------
    X_train_scaled, X_validate_scaled : scaled feature sets
    y_train, y_validate : data frames with a 'logerror' column; a
        'lars_alpha_<alpha>' prediction column is added to both in place for
        every alpha tried
    metric_df : metric table to extend
    alphas : iterable of alpha values to try; defaults to 1, 2, ..., 20.
        Pass smaller values (for example fractions) to try weaker regularization.

    Returns
    -------
    (lars_models, metric_df) : a list of {'lars_alpha_<alpha>': fitted model}
    dicts in the order tried, and the metric table with one
    'validate_lars_alpha_<alpha>' row added per model.
    """
    #Default to the original grid of alpha = 1 through 20
    if alphas is None:
        alphas = range(1, 21)

    #Create a list to hold all the different models
    lars_models = []

    for i in alphas:
        #The same label names the prediction column and the model
        label = f'lars_alpha_{i}'

        #Create and fit the model
        lars = LassoLars(alpha = i)
        lars.fit(X_train_scaled, y_train.logerror)

        #Make predictions (stored on the caller's y data frames)
        y_train[label] = lars.predict(X_train_scaled)
        y_validate[label] = lars.predict(X_validate_scaled)

        #Calculate RMSE
        train_RMSE = mean_squared_error(y_train.logerror, y_train[label]) ** .5
        validate_RMSE = mean_squared_error(y_validate.logerror, y_validate[label]) ** .5

        #Add model to list of lars models
        lars_models.append({label: lars})

        print(f'\nRMSE using LassoLars, alpha = {i}')
        print("Train/In-Sample: ", round(train_RMSE, 4), 
        "\nValidate/Out-of-Sample: ", round(validate_RMSE, 4))

        metric_df = make_metric_df(y_validate.logerror, y_validate[label], f'validate_{label}', metric_df)

    return lars_models, metric_df

In [29]:
#Fit LassoLars once per alpha; predictions are added to y_train and y_validate
#as 'lars_alpha_<alpha>' columns
lars_models, metric_df = get_lars_models(X_train_scaled, X_validate_scaled, y_train, y_validate, metric_df)


RMSE using LassoLars, alpha = 1
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 2
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 3
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 4
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 5
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 6
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 7
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 8
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 9
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 10
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 11
Train/In-Sample:  0.17 
Validate/Out-of-Sample:  0.15

RMSE using LassoLars, alpha = 12
Train/I

__Create GLM, Tweedie Regressor Models__

In [34]:
def get_glm_model(X_train_scaled, X_validate_scaled, y_train, y_validate, metric_df, powers=(0,), alphas=None):
    """Fit a TweedieRegressor for every (power, alpha) pair and record the validate scores.

    Parameters
    ----------
    X_train_scaled, X_validate_scaled : scaled feature sets
    y_train, y_validate : data frames with a 'logerror' column; a
        'glm_power_<power>_alpha_<alpha>_preds' column is added to both in
        place for every pair tried
    metric_df : metric table to extend
    powers : iterable of Tweedie power values; defaults to (0,), the only power
        the previous hard-coded loop ever reached (it iterated 0-2 but skipped
        1 and 2). NOTE(review): other powers presumably restrict the sign of
        the target -- check the TweedieRegressor docs before widening this.
    alphas : iterable of alpha values; defaults to 1, 2, ..., 10

    Returns
    -------
    (glm_models, metric_df) : a list of {'glm_<power>_<alpha>': fitted model}
    dicts in the order tried, and the metric table with one
    'validate_glm_power_<power>_alpha_<alpha>' row added per model.
    """
    #Default to the original grid of alpha = 1 through 10
    if alphas is None:
        alphas = range(1, 11)

    #Create a list to hold all the models
    glm_models = []

    for i in powers:
        for j in alphas:
            #Name of the prediction column for this pair
            pred_col = f'glm_power_{i}_alpha_{j}_preds'

            #Create and fit the model
            glm = TweedieRegressor(power = i, alpha = j)
            glm.fit(X_train_scaled, y_train.logerror)

            #Make predictions (stored on the caller's y data frames)
            y_train[pred_col] = glm.predict(X_train_scaled)
            y_validate[pred_col] = glm.predict(X_validate_scaled)

            #Calculate RMSE
            train_RMSE = mean_squared_error(y_train.logerror, y_train[pred_col]) ** .5
            validate_RMSE = mean_squared_error(y_validate.logerror, y_validate[pred_col]) ** .5

            #Add model to the list
            glm_models.append({f'glm_{i}_{j}':glm})

            print(f'\nRMSE for Power = {i}, Alpha = {j}\n')
            print("Train/In-Sample: ", round(train_RMSE, 4), 
                "\nValidate/Out-of-Sample: ", round(validate_RMSE, 4))

            metric_df = make_metric_df(y_validate.logerror, y_validate[pred_col], f'validate_glm_power_{i}_alpha_{j}', metric_df)

    return glm_models, metric_df

In [35]:
#Fit the Tweedie models
#NOTE(review): despite its name, `glm` receives the list of {name: model} dicts
#returned by get_glm_model, not a single model
glm, metric_df = get_glm_model(X_train_scaled, X_validate_scaled, y_train, y_validate, metric_df)


RMSE for Power = 0, Alpha = 1

Train/In-Sample:  0.1684 
Validate/Out-of-Sample:  0.1537

RMSE for Power = 0, Alpha = 2

Train/In-Sample:  0.1684 
Validate/Out-of-Sample:  0.1537

RMSE for Power = 0, Alpha = 3

Train/In-Sample:  0.1684 
Validate/Out-of-Sample:  0.1538

RMSE for Power = 0, Alpha = 4

Train/In-Sample:  0.1684 
Validate/Out-of-Sample:  0.1538

RMSE for Power = 0, Alpha = 5

Train/In-Sample:  0.1684 
Validate/Out-of-Sample:  0.1538

RMSE for Power = 0, Alpha = 6

Train/In-Sample:  0.1685 
Validate/Out-of-Sample:  0.1538

RMSE for Power = 0, Alpha = 7

Train/In-Sample:  0.1685 
Validate/Out-of-Sample:  0.1538

RMSE for Power = 0, Alpha = 8

Train/In-Sample:  0.1685 
Validate/Out-of-Sample:  0.1538

RMSE for Power = 0, Alpha = 9

Train/In-Sample:  0.1685 
Validate/Out-of-Sample:  0.1538

RMSE for Power = 0, Alpha = 10

Train/In-Sample:  0.1685 
Validate/Out-of-Sample:  0.1538


In [38]:
#Display the validate metrics collected so far
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,validate_baseline_mean,0.15379,0.0
1,validate_ols,0.153585,0.002691
2,validate_lars_alpha_1,0.15379,0.0
3,validate_lars_alpha_2,0.15379,0.0
4,validate_lars_alpha_3,0.15379,0.0
5,validate_lars_alpha_4,0.15379,0.0
6,validate_lars_alpha_5,0.15379,0.0
7,validate_lars_alpha_6,0.15379,0.0
8,validate_lars_alpha_7,0.15379,0.0
9,validate_lars_alpha_8,0.15379,0.0


__Create RandomForestRegressor Model__

In [None]:
#Starting from 2 in order to avoid warnings
for num in range(2, 11):
    #Now create a new loop that runs through different min_samples_leaf values
    for val in range(1, 26):
        #Instantiate new model
        clf = RandomForestRegressor(random_state = 123, max_depth = num, min_samples_leaf = val)

        #Fit the model
        clf.fit(X_train_scaled, y_train.logerror)

        #Score the model on training data
        train_score = clf.score(X_train, y_train)

        #Make predictions on validate data to use in confusion matrix
        clf_preds = clf.predict(X_validate)