### Imports and utils

In [156]:
# data, maipulation and plotting 
import pandas as pd
import numpy as np

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# data transformation and modeling preperation
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# modeling and evaluation
from sklearn import linear_model, metrics
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV

import warnings
warnings.filterwarnings('ignore')

### Util functions

Let's build a few utility functions we can use later on in the notebook.
1. Correlation matrix and heatmap generator for evaluating correlation between numerical features in our dataset.
2. Model evaluation functions to help us more cleanly evaluate our models consistently.

In [157]:
def create_heatmap(data, target_variable):
  # Generate correlation matrix
  data_corr = data.corr()

  # Plot heatmap of correlation matrix
  fig = go.Figure()
  fig.add_trace(
      go.Heatmap(
          x=data_corr.columns,
          y=data_corr.columns,
          z=np.array(data_corr),
      )
  )

  fig.update_layout(width=1000, title=f'Correlation heat map of target feature {target_variable} and predictor features')
  fig.update_xaxes(tickangle=45)

  fig.show(text_auto=True)

In [158]:
# building re-usable evaluation functions for our linear, lasso and ridge models

def r_squared(y, y_pred):
  return(f'R2 is {r2_score(y, y_pred)}')

def mse(y, y_pred):
  return(f'Mean squared error is {mean_squared_error(y, y_pred)}')

def eval(linear_model, X_train, X_test, y_train, y_test, dataset_type):
  y_pred_train = linear_model.predict(X_train)
  y_pred_test = linear_model.predict(X_test)  

  train_metrics = []
  test_metrics =  []

  if dataset_type == 'train':

    print("----Training set evaluation----")
    r_2_train = r_squared(y_pred_train, y_train)
    train_metrics.append(r_2_train)
    print(r_2_train)

    mse_train = mse(y_pred_train, y_train)
    train_metrics.append(mse_train)
    print(mse_train)

  if dataset_type == 'test':
    print("\n----Test set evaluation----")
    r_2_test = r_squared(y_pred_test, y_test)
    test_metrics.append(r_2_test)
    print(r_2_test)

    mse_test = mse(y_pred_test, y_test)
    test_metrics.append(mse_test)
    print(mse_test)

### Data understanding and exploration

#### Data description

While we'll get into our dataset shortly, it's important to review the data dictionary provided. This will help us understand what data we can remove right away or data we might want to keep later on.

In [159]:
data_description = open("data_description.txt")
data_description = data_description.read()
print(data_description)

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

#### Ingesting and viewing data statistics

In [160]:
# reading in our housing data
data = pd.read_csv('train.csv')

In [161]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [162]:
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [163]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

It seems like we have quite a few objects in our data - including numerical data that _should_ be categorical in nature. Let's go ahead and isolate the numerical data for a quick correlation analysis.

In [164]:
numeric_data = data.select_dtypes(include=['float64', 'int64'])

In [165]:
numeric_data

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000
5,6,50,85.0,14115,5,5,1993,1995,0.0,732,...,40,30,0,320,0,0,700,10,2009,143000
6,7,20,75.0,10084,8,5,2004,2005,186.0,1369,...,255,57,0,0,0,0,0,8,2007,307000
7,8,60,,10382,7,6,1973,1973,240.0,859,...,235,204,228,0,0,0,350,11,2009,200000
8,9,50,51.0,6120,7,5,1931,1950,0.0,0,...,90,0,205,0,0,0,0,4,2008,129900
9,10,190,50.0,7420,5,6,1939,1950,0.0,851,...,0,4,0,0,0,0,0,1,2008,118000


In [166]:
create_heatmap(numeric_data, 'SalePrice')

Our correlation matrix heatmap looks consistent with what we'd expect in real world data. There is a lot of noise and there are not very many clear predictor features that jump out as us. Let's go ahead and start our feature transformation. As noted before, we have quite a few objects that need dummy variables and numerical features that need to be transformed into categorical variables. 

### Data transformation and preparation

Let's start by transforming the MSSubClass feature from numerical values into original feature name values. This will be helpful so that we can _then_ transform these values into dummy variables that each data point can have. It's likely this feature is important to our predictions later on.

#### MSSubClass

In [167]:
sub_class_mapping = {
      20:'1-STORY 1946 & NEWER ALL STYLES',
      30:'1-STORY 1945 & OLDER',
      40:'1-STORY W/FINISHED ATTIC ALL AGES',
      45:'1-1/2 STORY - UNFINISHED ALL AGES',
      50:'1-1/2 STORY FINISHED ALL AGES',
      60:'2-STORY 1946 & NEWER',
      70:'2-STORY 1945 & OLDER',
      75:'2-1/2 STORY ALL AGES',
      80:'SPLIT OR MULTI-LEVEL',
      85:'SPLIT FOYER',
      90:'DUPLEX - ALL STYLES AND AGES',
      120:'1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
      150:'1-1/2 STORY PUD - ALL AGES',
      160:'2-STORY PUD - 1946 & NEWER',
      180:'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
      190:'2 FAMILY CONVERSION - ALL STYLES AND AGES'
      }

In [168]:
# replacing existing values in the MSSubClass feature with our mapped values
data = data.replace({'MSSubClass': sub_class_mapping})

In [169]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,2-STORY 1946 & NEWER,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,1-STORY 1946 & NEWER ALL STYLES,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,2-STORY 1946 & NEWER,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,2-STORY 1945 & OLDER,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,2-STORY 1946 & NEWER,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Now that we've updated the only numerical feature that needed to become categorical, we can transform all our categorical or object based features into dummy variables. This is important because our regression models will need to take in numerical data.

In [170]:
# categorical data
data_categorical = data.select_dtypes(include=['object'])

In [171]:
data_categorical

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,2-STORY 1946 & NEWER,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,1-STORY 1946 & NEWER ALL STYLES,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,2-STORY 1946 & NEWER,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,2-STORY 1945 & OLDER,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,2-STORY 1946 & NEWER,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
5,1-1/2 STORY FINISHED ALL AGES,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,...,Attchd,Unf,TA,TA,Y,,MnPrv,Shed,WD,Normal
6,1-STORY 1946 & NEWER ALL STYLES,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
7,2-STORY 1946 & NEWER,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,...,Attchd,RFn,TA,TA,Y,,,Shed,WD,Normal
8,1-1/2 STORY FINISHED ALL AGES,RM,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,Detchd,Unf,Fa,TA,Y,,,,WD,Abnorml
9,2 FAMILY CONVERSION - ALL STYLES AND AGES,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,...,Attchd,RFn,Gd,TA,Y,,,,WD,Normal


Lets get our dummy variables from the categorical features

In [172]:
categorical_dummy_features = pd.get_dummies(data_categorical, drop_first=True)

In [173]:
categorical_dummy_features.head()

Unnamed: 0,MSSubClass_1-1/2 STORY FINISHED ALL AGES,MSSubClass_1-STORY 1945 & OLDER,MSSubClass_1-STORY 1946 & NEWER ALL STYLES,MSSubClass_1-STORY PUD (Planned Unit Development) - 1946 & NEWER,MSSubClass_1-STORY W/FINISHED ATTIC ALL AGES,MSSubClass_2 FAMILY CONVERSION - ALL STYLES AND AGES,MSSubClass_2-1/2 STORY ALL AGES,MSSubClass_2-STORY 1945 & OLDER,MSSubClass_2-STORY 1946 & NEWER,MSSubClass_2-STORY PUD - 1946 & NEWER,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0


Now that we have a data frame of dummy features for our categorical variables, we can drop the originals from our dataset and concat the new dummy features.

In [174]:
# drop categorical variables
data = data.drop(list(data_categorical.columns), axis=1)

# concat dummy variables with X
data = pd.concat([data, categorical_dummy_features], axis=1)

Here is a list of columns we now have in our dataset. Quite a few!

In [175]:
list(data.columns)

['Id',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice',
 'MSSubClass_1-1/2 STORY FINISHED ALL AGES',
 'MSSubClass_1-STORY 1945 & OLDER',
 'MSSubClass_1-STORY 1946 & NEWER ALL STYLES',
 'MSSubClass_1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
 'MSSubClass_1-STORY W/FINISHED ATTIC ALL AGES',
 'MSSubClass_2 FAMILY CONVERSION - ALL STYLES AND AGES',
 'MSSubClass_2-1/2 STORY ALL AGES',
 'MSSubClass_2-STORY 1945 & OLDER',
 'MSSubClass_2-STORY 1946 & NEWER',
 'MSSubClass_2-STORY PUD - 1946 & NEWER',
 'MSS

As we continue working with our data, let's create two seperate variables for the predictor features and our target variable. 

In [176]:
# seperating target feature and predictor features. I'm also going to be removing the 'ID' column because we already have an index. 
X = data.drop(['SalePrice','Id'] , 1)
y = data['SalePrice'] 

In [177]:
# replacing the N/A values in the dataset with the median values of that column. 
X.fillna(X.median(), inplace=True)

In [178]:
X.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,0,0,0,0,1,0,0,0,1,0
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,0,0,0,0,1,0,0,0,1,0
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,0,0,0,0,1,0,0,0,1,0
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,0,0,0,0,1,0,0,0,0,0
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,0,0,0,0,1,0,0,0,1,0


Splitting our data is essential. We want to ensure that we're only training on our train data and testing on our test data.

In [179]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.7,
                                                    test_size=0.3, random_state=100)

Scaling our data is necessary for our Ridge and Lasso analysis, so let's fir a scaler to ONLY our training data and apply the transformation to our test data for later evaluation>

In [180]:
# before we're able to model our data, we need to scale it appropriately. We can fit the scaler on the training data.
scaler = StandardScaler().fit(X_train)

In [181]:
# lets now transform that scalar to both our training and test data
cols = X.columns

X_train = pd.DataFrame(scaler.transform(X_train))
X_train.columns = cols
X_test = pd.DataFrame(scaler.transform(X_test))
X_test.columns = cols

### Linear model

In our case, we know that a linear model is likely to undereprform Lasso and Ridge regression. That said, it's important to incldue it here as a baseline we wish to improve upon.

In [182]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

LinearRegression()

In [183]:
eval(linear_model, X_train, X_test, y_train, y_test, dataset_type='train')
eval(linear_model, X_train, X_test, y_train, y_test, dataset_type='test')

----Training set evaluation----
R2 is 0.9480888552000144
Mean squared error is 308453980.295084

----Test set evaluation----
R2 is -7.591989223154627e-05
Mean squared error is 1.3072199261519717e+32


## Ridge regression

Ridge regression is an extension of linear regression where the cost function is modified to minimize model compexity. This is done by adding a penalty parameter. Let's go ahead set our model up. 

We're also going to use grid search to identify the best value of 'alpha' for our ridge model. Cross validation is a resampling technique that uses different portions of the data to test and train a model on each iteration. We'll choose to include 6 folds.

In [None]:
ridge = Ridge()

params = {'alpha': list(range(1,500))}

grid_search_model = GridSearchCV(estimator=ridge,
                                 param_grid=params,
                                 scoring='r2',
                                 cv=6)

grid_search_model.fit(X_train, y_train)

# getting the best alpha value
ridge_alpha = grid_search_model.best_params_['alpha']

print(f"Alpha choosen for ridge model is: {ridge_alpha}")

We can now use the best alpha determined from our grid search to fit and evaluation a new ridge model

In [None]:
ridge = Ridge(alpha=ridge_alpha)
ridge.fit(X_train, y_train)

In [None]:
eval(ridge, X_train, X_test, y_train, y_test, dataset_type='train')
eval(ridge, X_train, X_test, y_train, y_test, dataset_type='test')

## Laso

A Lasso model uses regression methods for a more accurate prediction. It is a model that uses shrinkage - which is where the data values are shrunk towards a center point.

In [None]:
lasso = Lasso()

params = {'alpha': list(range(1,500))}

grid_search_model = GridSearchCV(estimator=lasso,
                                 param_grid=params,
                                 scoring='r2',
                                 cv=6)

grid_search_model.fit(X_train, y_train)

# Saving the best hyperparameter alpha
lasso_alpha = grid_search_model.best_params_['alpha']
print(f"Alpha choosen for lasso model is: {lasso_alpha}")

We can now use the best alpha determined from our grid search to fit and evaluation a new ridge model

In [None]:
lasso = Lasso(alpha=lasso_alpha)
lasso.fit(X_train, y_train)

In [None]:
eval(lasso, X_train, X_test, y_train, y_test, dataset_type='train')
eval(lasso, X_train, X_test, y_train, y_test, dataset_type='test')

In [None]:
def compare_models():
  print("Linear model model evaluation")
  eval(linear_model, X_train, X_test, y_train, y_test, dataset_type='test')

  print("\nRidge model model evaluation")
  eval(ridge, X_train, X_test, y_train, y_test, dataset_type='test')

  print("\nLaso model model evaluation")
  eval(lasso, X_train, X_test, y_train, y_test, dataset_type='test')

We can now compare each of our models performance on the test set.

In [None]:
compare_models()

Now that we've looked at our models performances, let's look at the coefficients of each model. This will help us determine what are the most important features for each model.

In [None]:
coefficients = pd.DataFrame(index=X_test.columns)

In [None]:
coefficients.index = X.columns

In [None]:
coefficients['Linear model coefficients'] = linear_model.coef_
coefficients['Ridge model coefficients'] = ridge.coef_
coefficients['Lasso model coefficients'] = lasso.coef_

### Summary

We can review each models coefficients in a single table. We'll later use this to pull out feature importance from each model.

In [None]:
coefficients

#### Evaluating feature importance

Coefficients are great, but let's look at the largest coefficients for each model. Do we see that the features of importance are the same across Lasso and Ridge models?

In [None]:
def get_important_features(model_name):
  features_importance = coefficients[model_name].sort_values(ascending=False)
  return features_importance.head()

In [None]:
get_important_features('Ridge model coefficients')

In [None]:
get_important_features('Lasso model coefficients')

# Summary

We've identified for our modeling and feature selection techniques, our Ridge model is the most performant on training and test sets. This is contrary to intuition but it does look like our Lasso model did outperform Ridge on the training set. More work is needed to identify an adjustment, but we can see that our Ridge model does perform adequately against our test set. We also see that it performed better than our baseline linear regression model. Although test performance is not extremely high, it would aid any business in determining what price to re-sell a house for.

The most important features an analysis can use are below. I've included the coefficients so that an analysis can know to _what_ extend these features are important compared to one another.

1. GrLivArea: 10094.828816
2. OverallQual: 9369.226077
3. 1stFlrSF: 7093.794628
4. TotalBsmtSF: 6357.671763
5. Neighborhood_NoRidge: 6019.516204

## Section for code to aid in question/answers

### What happens if we double alpha values for ridge and lasso models?

In [None]:
# doubling alpha score for lasso
lasso = Lasso(alpha= 2 * lasso_alpha)
lasso.fit(X_train, y_train)
lasso_evaluation_double_alpha = eval(lasso, X_train, X_test, y_train, y_test, dataset_type='test')

In [None]:
# doubling alpha score for ridge
ridge = Ridge(alpha= 2 * ridge_alpha)
ridge.fit(X_train, y_train)
ridge_evaluation_double_alpha = eval(ridge, X_train, X_test, y_train, y_test, dataset_type='test')

### If we had to drop most important features, what other features would be most important?

In [None]:
important_features = list(get_important_features('Ridge model coefficients').index)
important_features

In [None]:
X_train_dropped_important_features = X_train.drop(important_features, axis=1)
X_test_dropped_important_features = X_test.drop(important_features, axis=1)

In [None]:
lasso = Lasso()

params = {'alpha': list(range(1,300))}

grid_search_model = GridSearchCV(estimator=lasso,
                                 param_grid=params,
                                 scoring='r2',
                                 cv=6)

grid_search_model.fit(X_train_dropped_important_features, y_train)

# Saving the best hyperparameter alpha
lasso_alpha = grid_search_model.best_params_['alpha']
print(f"Alpha choosen for lasso model is: {lasso_alpha}")

In [None]:
lasso = Lasso(alpha=lasso_alpha)
lasso.fit(X_train_dropped_important_features, y_train)

In [None]:
eval(lasso, X_train_dropped_important_features, X_test_dropped_important_features, y_train, y_test, dataset_type='train')
eval(lasso, X_train_dropped_important_features, X_test_dropped_important_features, y_train, y_test, dataset_type='test')

In [None]:
ridge = Ridge()

params = {'alpha': list(range(1,300))}

grid_search_model = GridSearchCV(estimator=ridge,
                                 param_grid=params,
                                 scoring='r2',
                                 cv=6)

grid_search_model.fit(X_train_dropped_important_features, y_train)

# Saving the best hyperparameter alpha
ridge_alpha = grid_search_model.best_params_['alpha']
print(f"Alpha choosen for ridge model is: {ridge_alpha}")

In [None]:
ridge = Ridge(alpha=ridge_alpha)
ridge.fit(X_train_dropped_important_features, y_train)

In [None]:
eval(ridge, X_train_dropped_important_features, X_test_dropped_important_features, y_train, y_test, dataset_type='train')
eval(lasso, X_train_dropped_important_features, X_test_dropped_important_features, y_train, y_test, dataset_type='test')

In [None]:
coefficients = pd.DataFrame(index=X_test_dropped_important_features.columns)

In [None]:
coefficients.index = X_test_dropped_important_features.columns

In [None]:
coefficients['Ridge model coefficients'] = ridge.coef_
coefficients['Lasso model coefficients'] = lasso.coef_

In [None]:
def get_important_features(model_name):
  features_importance = coefficients[model_name].sort_values(ascending=False)
  return features_importance.head()

In [None]:
get_important_features('Ridge model coefficients')

In [None]:
get_important_features('Lasso model coefficients')