In [1]:
# conclusion
# did we answer the problem statement/purpose
# what can be improved in this process
# what are the flaws of my model

# Production Model and Conclusion

---

## Imports and Functions

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# to remove
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [3]:
def qual_to_num(data, feature):
    '''
    Takes in DataFrame and column name containing string ratings
    and replaces the ratings with numerical values.

    Ratings map as Ex=5, Gd=4, TA=3, Fa=2, Po=1, NA=0. Any missing value
    (np.nan, None, pd.NA, ...) is also scored 0.

    The column is overwritten on the DataFrame that was passed in, and
    that same DataFrame is returned.

    Raises KeyError if the column holds a non-missing rating that is
    not one of the known ratings.
    '''

    # ratings that appear in the dataset
    # and corresponding numerical value
    qual_conversion = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}

    def rating_to_score(rating):
        # NaN never equals itself, so a np.nan dictionary key is only found
        # when the value is the very same object; test for missing explicitly
        if pd.isna(rating):
            return 0
        return qual_conversion[rating]

    # convert to ratings using dictionary
    data[feature] = data[feature].apply(rating_to_score)

    return data



def vif_df(df):
    '''
    Computes the Variance Inflation Factor (VIF) of every column
    in a DataFrame of numeric columns.

    Returns a one-column ('vif') DataFrame indexed by the column
    names of df, sorted from the largest VIF to the smallest.
    '''

    feature_matrix = df.values

    # score one column at a time, each in turn acting as the target
    scores = []
    for position in range(df.shape[1]):
        scores.append(variance_inflation_factor(feature_matrix, position))

    # label scores with their column names
    vif_table = pd.DataFrame(scores, index = df.columns, columns = ['vif'])

    # greatest to least
    return vif_table.sort_values(by = 'vif', ascending = False)



def setup_test(path = '../datasets/test.csv'):
    '''
    Generates a cleaned and feature engineered test dataset
    that matches the general structure of the cleaned train data.

    path: location of the raw test CSV (defaults to '../datasets/test.csv').

    Returns the cleaned, feature engineered test data set.
    '''

    test_data = pd.read_csv(path)

    # initial features that were selected for the train dataset
    test_features = ['MS Zoning', 'Lot Area', 'Land Contour', 'Land Slope', 'Neighborhood', 'Condition 1',
                     'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
                     'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond',
                     'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Kitchen Qual', 'Bedroom AbvGr',
                     'Full Bath', 'Half Bath', 'Fireplaces', 'Fireplace Qu', 'BsmtFin SF 1', 'Bsmt Unf SF',
                     '1st Flr SF', '2nd Flr SF', 'Sale Type']

    # explicit copy: the columns below are assigned on this frame, and
    # assigning on an un-copied selection raises SettingWithCopyWarning
    test_data = test_data[test_features].copy()

    # clean column names
    test_data.columns = [col.lower().replace(' ', '_').replace('/', '_') for col in test_data.columns]

    # converts descriptive rating to numerical
    rating_columns = ['bsmt_qual', 'bsmt_cond', 'kitchen_qual',
                      'exter_qual', 'exter_cond', 'fireplace_qu']
    for rating_column in rating_columns:
        test_data = qual_to_num(test_data, rating_column)

    # combines basement quality and condition ratings via multiplication
    test_data['bsmt_qual_cond'] = test_data['bsmt_qual'] * test_data['bsmt_cond']
    test_data = test_data.drop(columns = ['bsmt_qual', 'bsmt_cond'])

    # combines overall quality and condition ratings via multiplication
    test_data['qual_cond'] = test_data['overall_qual'] * test_data['overall_cond']
    test_data = test_data.drop(columns = ['overall_qual', 'overall_cond'])

    # combines exterior quality and condition ratings via multiplication
    test_data['exter_qual_cond'] = test_data['exter_qual'] * test_data['exter_cond']
    test_data = test_data.drop(columns = ['exter_qual', 'exter_cond'])

    # combines fireplace count and quality rating via multiplication
    test_data['fireplaces_weighted'] = test_data['fireplaces'] * test_data['fireplace_qu']
    test_data = test_data.drop(columns = ['fireplaces', 'fireplace_qu'])

    # adds square feet (SF) measurements with weights:
    # 1*(Finished SF) + 0.5(Unfinished SF)
    test_data['bsmt_weighted_sf'] = test_data['bsmtfin_sf_1'] + 0.5*test_data['bsmt_unf_sf']
    test_data = test_data.drop(columns = ['bsmtfin_sf_1', 'bsmt_unf_sf'])

    # combines full bath and half bath into one column
    test_data['bath'] = test_data['full_bath'] + 0.5*test_data['half_bath']
    test_data = test_data.drop(columns = ['full_bath', 'half_bath'])

    # combines 1st floor and 2nd floor square feet areas
    test_data['sq_ft'] = test_data['1st_flr_sf'] + test_data['2nd_flr_sf']
    test_data = test_data.drop(columns = ['1st_flr_sf', '2nd_flr_sf'])

    # replace np.nan with most frequent value in the column 'None'
    test_data['mas_vnr_type'] = test_data['mas_vnr_type'].fillna('None')

    return test_data


def dummify_train_test(train, numeric_cols, categ_cols, test = None):
    '''
    Takes a DataFrame with the desired features to model with,
    sets up a DataFrame of the test dataset with the same features,
    and dummy-encodes the categorical columns of both so that the
    two results share identical columns in identical order.

    train:        DataFrame containing numeric_cols and categ_cols
    numeric_cols: list of column names kept as-is
    categ_cols:   list of column names to dummy-encode
    test:         optional DataFrame to encode alongside train; when
                  omitted, the test set is built with setup_test()

    The baseline (dropped) category of every categorical column is the
    one drop_first removes from the TRAIN data, and the same baseline is
    applied to the test data. Categories that only occur in the test
    data become all-zero columns in train, appended after the train
    columns in sorted order.

    Returns train and test DataFrames with desired features and
    categorical columns dummified and in identical order.
    '''

    # generate cleaned and feature engineered test dataframe
    if test is None:
        test = setup_test()
    features = numeric_cols + categ_cols

    # every category level seen in each dataset (nothing dropped yet)
    train_all_levels = pd.get_dummies(train[features], columns = categ_cols)
    test_all_levels = pd.get_dummies(test[features], columns = categ_cols)

    # the train encoding decides which level of each feature is the baseline
    train_with_dummies = pd.get_dummies(train[features], columns = categ_cols, drop_first = True)

    # levels that never occur in train; sorted so the column order is
    # the same on every run (set iteration order is not)
    train_level_names = set(train_all_levels.columns)
    test_only_columns = sorted(col for col in test_all_levels.columns
                               if col not in train_level_names)

    column_order = list(train_with_dummies.columns) + test_only_columns

    # reindex adds missing columns as 0s and, for test, removes the train
    # baseline columns instead of whichever level happened to sort first
    # in the test data
    train_with_dummies = train_with_dummies.reindex(columns = column_order, fill_value = 0)
    test_with_dummies = test_all_levels.reindex(columns = column_order, fill_value = 0)

    return train_with_dummies, test_with_dummies

## Production Model Choice

#### Rebuilding the Model
Let us quickly rebuild our production linear regression model (Model 4).

In [14]:
# load the cleaned training data (path is relative to this notebook)
house = pd.read_csv('../datasets/cleaned_train.csv')

In [15]:
# target variable: the sale price column of the training data
y = house['saleprice']

In [16]:
# features used by the production model:
# numeric columns are modeled as-is, categorical columns get dummy-encoded
numeric_features = [
    'sq_ft',
    'garage_area',
    'bsmt_weighted_sf',
    'year_built',
]
categ_features = [
    'neighborhood',
    'house_style',
    'kitchen_qual',
    'qual_cond',
    'exter_qual_cond',
]
features = [*numeric_features, *categ_features]

In [17]:
# get dummies on train and test datasets
# and make columns agree for modeling
# (the test dataset is built inside dummify_train_test via setup_test)
train_dummy, test_dummy = dummify_train_test(house[features], numeric_features, categ_features)

In [18]:
# hold out 30% of the training data as a validation set
# (fixed random_state keeps the split reproducible)
X_train, X_val, y_train, y_val = train_test_split(
    train_dummy,
    y,
    test_size = 0.3,
    random_state = 6,
)

In [19]:
# build the linear regression model and fit it on the train split
# (fit returns the fitted estimator itself)
lr = LinearRegression().fit(X_train, y_train)

# predicted sale prices for the train split (yt_pred)
# and for the validation split (yv_pred)
yt_pred, yv_pred = lr.predict(X_train), lr.predict(X_val)

In [20]:
# RMSE is taken as the square root of the MSE: the squared = False argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, yt_pred))
val_rmse = np.sqrt(metrics.mean_squared_error(y_val, yv_pred))

# print scores for our train split
print(f'Train Cross Validation: {cross_val_score(lr, X_train, y_train, cv = 5).mean()}\n')
print(f'Train R squared: {lr.score(X_train, y_train)}')
print(f'Train RMSE: {train_rmse}\n')

# print scores for our validation split
print(f'Validation R squared: {lr.score(X_val, y_val)}')
print(f'Validation RMSE: {val_rmse}')

Train Cross Validation: 0.8904960587650373

Train R squared: 0.914574607035238
Train RMSE: 23340.119254589645

Validation R squared: 0.887738294880876
Validation RMSE: 25406.482714142392


#### Justifying This Choice

We see a decent improvement in our R squared scores and our RMSE scores are lower compared to our first three linear models. As expected our model performed a little worse on our validation set than on the training set, but the performance gain is still worth being slightly overfit.

When we applied Ridge and Lasso models to the same features as the ones used here, our performance hardly improved. Because we did not have to scale our data to fit this model, we keep the interpretability of our coefficients. This is not the case for Ridge and Lasso. With the scores between the three models being so close, it makes the most sense to stick with our best linear regression model.

#### Model Inference

Since coefficient interpretability is a perk of this model, let us examine them.

To interpret the coefficients, we again need to identify which columns were dropped when we performed `get_dummies`.

In [21]:
# dummy-encode again WITHOUT drop_first; any column that exists here but
# not in the modeling frame is a baseline category get_dummies dropped
no_drop_dummies = pd.get_dummies(house[features], columns = categ_features)
set(no_drop_dummies.columns).difference(train_dummy.columns)

{'exter_qual_cond_2',
 'house_style_1.5Fin',
 'neighborhood_Blmngtn',
 'qual_cond_1'}

Our above results tell us that all of the coefficients are relative to a 1.5-story home in Bloomington Heights, with an overall quality/condition rating of 1 and an exterior quality/condition rating of 2. These represent the baseline for all of our coefficients.

Below, we see the coefficients which describe how much the price of a home with the baseline conditions changes if we increase the `year_built`, `sq_ft`, `bsmt_weighted_sf`, or the `garage_area` by one unit. We can see that a square foot of property is worth most in the main living space, least in the garage, and in between in the basement. We can also see the specific values.

In [28]:
# creating a pandas Series of model coefficients,
# labeled with the column names of the modeling frame
coef = pd.Series(lr.coef_, index = train_dummy.columns)

In [29]:
# sorted numeric feature coefficients
# (selected by name rather than by position so the cell keeps working
# if the feature lists change)
coef[numeric_features].sort_values(ascending = False)

year_built          476.090864
sq_ft                61.117261
bsmt_weighted_sf     39.152619
garage_area          31.278997
dtype: float64

Below is a list that allows us to compare home values relative to Bloomington Heights. We can clearly see the most expensive neighborhood according to our model is Green Hills, while the least expensive is Meadow Village.

In [24]:
# neighborhood coefficients, largest first
# (filter keeps the labels that contain the given text)
coef.filter(like = 'neighborhood').sort_values(ascending = False)

neighborhood_GrnHill    89919.264084
neighborhood_StoneBr    44291.644347
neighborhood_Veenker    32616.857581
neighborhood_Crawfor    29326.138970
neighborhood_NoRidge    25548.281974
neighborhood_NridgHt    20132.195830
neighborhood_ClearCr    16557.749918
neighborhood_Timber     10934.985386
neighborhood_Somerst    10464.234044
neighborhood_SWISU       6629.850523
neighborhood_Gilbert     6096.141013
neighborhood_BrkSide     5956.337891
neighborhood_CollgCr     4747.443224
neighborhood_Sawyer       222.745616
neighborhood_SawyerW      197.197578
neighborhood_NAmes        -20.298748
neighborhood_Mitchel     -643.214810
neighborhood_IDOTRR     -2340.897408
neighborhood_Edwards    -3007.094249
neighborhood_NWAmes     -4436.321199
neighborhood_OldTown    -5265.375505
neighborhood_Blueste    -7130.563586
neighborhood_Greens     -7411.014719
neighborhood_NPkVill    -7843.510997
neighborhood_BrDale     -8893.005381
neighborhood_Landmrk   -11314.947598
neighborhood_MeadowV   -18823.255041
d

Next we can see that, generally, more sizable homes had higher values. Weirdly, we see that homes with the second level unfinished seemed to be valued more than if it were finished. Maybe this is due to it being a blank slate and more readily designable to the new owner's preference.

In [25]:
# house style coefficients, largest first
# (filter keeps the labels that contain the given text)
coef.filter(like = 'house_style').sort_values(ascending = False)

house_style_2.5Unf    17338.024491
house_style_2.5Fin    12172.272016
house_style_SLvl      10465.559449
house_style_1.5Unf     9583.740654
house_style_SFoyer     6584.137918
house_style_1Story     6395.149133
house_style_2Story      565.607490
dtype: float64

As for quality and condition, generally the higher the quality and condition score, the more expensive the home was (with some exceptions). Based on the coefficients, these quality and condition values produced some of the largest differences in home prices.

In [26]:
# sorted overall quality/condition coefficients
# ('exter_qual_cond' labels also contain 'qual_cond', so exclude them)
has_qual_cond = coef.index.str.contains('qual_cond')
is_exterior = coef.index.str.contains('exter')
coef[has_qual_cond & ~is_exterior].sort_values(ascending = False)

qual_cond_90    222660.792737
qual_cond_50    103403.025428
qual_cond_45     72950.171958
qual_cond_72     46055.562486
qual_cond_32     39487.132388
qual_cond_63     39017.058976
qual_cond_56     35911.573454
qual_cond_49     35261.558927
qual_cond_40     33177.678208
qual_cond_64     30825.760368
qual_cond_48     27834.320843
qual_cond_42     18161.129597
qual_cond_35     13952.388170
qual_cond_28     12822.179763
qual_cond_36     10904.106833
qual_cond_30      4603.476192
qual_cond_54      3188.259805
qual_cond_24      2836.846742
qual_cond_10      2308.225374
qual_cond_60         0.000000
qual_cond_20     -2052.282151
qual_cond_25     -3019.688605
qual_cond_9      -3268.595315
qual_cond_12     -3707.833187
qual_cond_8      -4634.262912
qual_cond_16    -14161.504711
qual_cond_18    -14597.007975
qual_cond_6     -17716.266574
qual_cond_15    -23536.940461
qual_cond_21    -26918.287806
qual_cond_4     -29321.762522
qual_cond_5     -37841.113200
qual_cond_3     -41801.912771
qual_cond_

In [27]:
# exterior quality/condition coefficients, largest first
# (filter keeps the labels that contain the given text)
coef.filter(like = 'exter').sort_values(ascending = False)

exter_qual_cond_15    39897.166078
exter_qual_cond_16    23166.022635
exter_qual_cond_20    22100.697779
exter_qual_cond_12    18403.524164
exter_qual_cond_9     17645.405940
exter_qual_cond_6     12989.639271
exter_qual_cond_4      9567.301388
exter_qual_cond_3         0.000000
exter_qual_cond_8     -6706.066320
exter_qual_cond_25   -99222.577736
dtype: float64

---

## Conclusion