In [1]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import datetime

## Pre-Processing
* Are categorical variables one-hot encoded? *GET DUMMY*
* Does the student investigate or manufacture features with linear relationships to the target? *FEATURE ENGINEERING*
* Have the data been scaled appropriately?    *STANDARDSCALER*
* Does the student properly split and/or sample the data for validation/training purposes?
* Does the student utilize feature selection to remove noisy or multi-collinear features?
* Does the student test and evaluate a variety of models to identify a production algorithm (AT MINIMUM: linear regression, lasso, and ridge)?
* Does the student defend their choice of production model relevant to the data at hand and the problem?
* Does the student explain how the model works and evaluate its performance successes/downfalls?

In [2]:
# read in cleaned data

train_clean = pd.read_csv('../datasets/train_cleaned.csv')
test_clean = pd.read_csv('../datasets/test_cleaned.csv')

## Feature Engineering

In [3]:
train_clean['total_sqft'] = train_clean['Total Bsmt SF'] + train_clean['1st Flr SF'] + train_clean['2nd Flr SF']


# find year, regardless of when this notebook is run
today = datetime.datetime.now()

# age = current year - year built
train_clean['house_age'] =  today.year - train_clean['Year Built']

train_clean.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice,total_sqft,house_age
0,109,533352170,60,RL,73.385027,13517.0,Pave,IR1,Lvl,AllPub,...,0.0,0.0,0.0,0.0,3.0,2010.0,WD,130500.0,2204.0,47.0
1,544,531379050,60,RL,43.0,11492.0,Pave,IR1,Lvl,AllPub,...,0.0,0.0,0.0,0.0,4.0,2009.0,WD,220000.0,3035.0,27.0
2,153,535304180,20,RL,68.0,7922.0,Pave,Reg,Lvl,AllPub,...,0.0,0.0,0.0,0.0,1.0,2010.0,WD,109000.0,2114.0,70.0
3,318,916386060,60,RL,73.0,9802.0,Pave,Reg,Lvl,AllPub,...,0.0,0.0,0.0,0.0,4.0,2010.0,WD,174000.0,1828.0,17.0
4,255,906425045,50,RL,82.0,14235.0,Pave,IR1,Lvl,AllPub,...,0.0,0.0,0.0,0.0,3.0,2010.0,WD,138500.0,2121.0,123.0


In [4]:
train_categorical = train_clean.select_dtypes(include='object').columns

In [5]:
## Encoding Categorical Variables

train_dummy = pd.get_dummies(data = train_clean, columns = train_categorical, drop_first=True)
pd.set_option('display.max_columns', 500)
train_dummy.head()

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice,total_sqft,house_age,MS Zoning_C (all),MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM,Street_Pave,Lot Shape_IR2,Lot Shape_IR3,Lot Shape_Reg,Land Contour_HLS,Land Contour_Low,Land Contour_Lvl,Utilities_NoSeWa,Utilities_NoSewr,Lot Config_CulDSac,Lot Config_FR2,Lot Config_FR3,Lot Config_Inside,Land Slope_Mod,Land Slope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_Greens,Neighborhood_GrnHill,Neighborhood_IDOTRR,Neighborhood_Landmrk,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition 1_Feedr,Condition 1_Norm,Condition 1_PosA,Condition 1_PosN,Condition 1_RRAe,Condition 1_RRAn,Condition 1_RRNe,Condition 1_RRNn,Condition 2_Feedr,Condition 2_Norm,Condition 2_PosA,Condition 2_PosN,Condition 2_RRAe,Condition 2_RRAn,Condition 2_RRNn,Bldg Type_2fmCon,Bldg Type_Duplex,Bldg Type_Twnhs,Bldg Type_TwnhsE,House Style_1.5Unf,House Style_1Story,House Style_2.5Fin,House Style_2.5Unf,House Style_2Story,House Style_SFoyer,House Style_SLvl,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed,Roof Matl_CompShg,Roof Matl_Membran,Roof Matl_Tar&Grv,Roof Matl_WdShake,Roof Matl_WdShngl,Exterior 1st_AsphShn,Exterior 1st_BrkComm,Exterior 1st_BrkFace,Exterior 1st_CBlock,Exterior 1st_CemntBd,Exterior 1st_HdBoard,Exterior 1st_ImStucc,Exterior 1st_MetalSd,Exterior 1st_Plywood,Exterior 1st_Stone,Exterior 1st_Stucco,Exterior 1st_VinylSd,Exterior 1st_Wd Sdng,Exterior 1st_WdShing,Exterior 2nd_AsphShn,Exterior 2nd_Brk Cmn,Exterior 2nd_BrkFace,Exterior 2nd_CBlock,Exterior 2nd_CmentBd,Exterior 2nd_HdBoard,Exterior 2nd_ImStucc,Exterior 2nd_MetalSd,Exterior 2nd_Plywood,Exterior 2nd_Stone,Exterior 2nd_Stucco,Exterior 2nd_VinylSd,Exterior 2nd_Wd Sdng,Exterior 2nd_Wd Shng,Mas Vnr Type_BrkFace,Mas Vnr Type_None,Mas Vnr Type_Stone,Exter Qual_Fa,Exter Qual_Gd,Exter Qual_TA,Exter Cond_Fa,Exter Cond_Gd,Exter Cond_Po,Exter Cond_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Bsmt Qual_Fa,Bsmt Qual_Gd,Bsmt Qual_None,Bsmt Qual_Po,Bsmt Qual_TA,Bsmt Cond_Fa,Bsmt Cond_Gd,Bsmt Cond_None,Bsmt Cond_Po,Bsmt Cond_TA,Bsmt Exposure_Gd,Bsmt Exposure_Mn,Bsmt Exposure_No,Bsmt Exposure_None,BsmtFin Type 1_BLQ,BsmtFin Type 1_GLQ,BsmtFin Type 1_LwQ,BsmtFin Type 1_None,BsmtFin Type 1_Rec,BsmtFin Type 1_Unf,BsmtFin Type 2_BLQ,BsmtFin Type 2_GLQ,BsmtFin Type 2_LwQ,BsmtFin Type 2_None,BsmtFin Type 2_Rec,BsmtFin Type 2_Unf,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Heating QC_Fa,Heating QC_Gd,Heating QC_Po,Heating QC_TA,Central Air_Y,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,Kitchen Qual_Fa,Kitchen Qual_Gd,Kitchen Qual_TA,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sal,Functional_Sev,Functional_Typ,Garage Type_Attchd,Garage Type_Basment,Garage Type_BuiltIn,Garage Type_CarPort,Garage Type_Detchd,Garage Type_None,Garage Finish_None,Garage Finish_RFn,Garage Finish_Unf,Garage Qual_Fa,Garage Qual_Gd,Garage Qual_None,Garage Qual_Po,Garage Qual_TA,Garage Cond_Fa,Garage Cond_Gd,Garage Cond_None,Garage Cond_Po,Garage Cond_TA,Paved Drive_P,Paved Drive_Y,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,533352170,60,73.385027,13517.0,6.0,8.0,1976.0,2005.0,289.0,533.0,0.0,192.0,725.0,725.0,754.0,0.0,1479.0,0.0,0.0,2.0,1.0,3.0,1.0,6.0,0.0,1976.0,2.0,475.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,3.0,2010.0,130500.0,2204.0,47.0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
1,544,531379050,60,43.0,11492.0,7.0,5.0,1996.0,1997.0,132.0,637.0,0.0,276.0,913.0,913.0,1209.0,0.0,2122.0,1.0,0.0,2.0,1.0,4.0,1.0,8.0,1.0,1997.0,2.0,559.0,0.0,74.0,0.0,0.0,0.0,0.0,0.0,4.0,2009.0,220000.0,3035.0,27.0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
2,153,535304180,20,68.0,7922.0,5.0,7.0,1953.0,2007.0,0.0,731.0,0.0,326.0,1057.0,1057.0,0.0,0.0,1057.0,1.0,0.0,1.0,0.0,3.0,1.0,5.0,0.0,1953.0,1.0,246.0,0.0,52.0,0.0,0.0,0.0,0.0,0.0,1.0,2010.0,109000.0,2114.0,70.0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
3,318,916386060,60,73.0,9802.0,5.0,5.0,2006.0,2007.0,0.0,0.0,0.0,384.0,384.0,744.0,700.0,0.0,1444.0,0.0,0.0,2.0,1.0,3.0,1.0,7.0,0.0,2007.0,2.0,400.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010.0,174000.0,1828.0,17.0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
4,255,906425045,50,82.0,14235.0,6.0,8.0,1900.0,1993.0,0.0,0.0,0.0,676.0,676.0,831.0,614.0,0.0,1445.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,0.0,1957.0,2.0,484.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,3.0,2010.0,138500.0,2121.0,123.0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


## Modeling 

In [6]:
# split data set into features and target 
X = train_dummy[['Overall Qual', 'Gr Liv Area', 'Garage Area', '1st Flr SF', 'Full Bath', 'house_age', 'total_sqft']]
y = train_dummy['SalePrice']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [8]:
# Scale data appropriately
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [9]:
lr = LinearRegression()

In [10]:
# create a baseline
baseline_preds = np.full_like(y_test, y_train.mean())
print(f'Baseline R2 Score: {r2_score(y_test, baseline_preds)}')

Baseline R2 Score: -0.00043273813883448753


In [11]:
lr.fit(X_train_sc, y_train)

In [12]:
# cross validation
print(f'Cross Val Mean: {cross_val_score(lr, X_train, y_train).mean()}')

print(f'Train R2 Score: {lr.score(X_train_sc, y_train)}')
print(f'Test R2 Score: {lr.score(X_test_sc, y_test)}')

Cross Val Mean: 0.753896043348899
Train R2 Score: 0.7709670347545527
Test R2 Score: 0.835772447333432


In [13]:
train_preds = lr.predict(X_train_sc)
test_preds= lr.predict(X_test_sc)

In [14]:

print(f'shape of train_preds: {train_preds.shape}')
print(f'shape of test_preds: {test_preds.shape}')

shape of train_preds: (1538,)
shape of test_preds: (513,)


In [15]:
# Calculating RMSE
print(f'Train RMSE: {mean_squared_error(y_train, train_preds, squared=False)}')
print(f'Test RMSE: {mean_squared_error(y_test, test_preds, squared=False)}')
print(f'Baseline RMSE: {mean_squared_error(y_test, baseline_preds, squared=False)}')

Train RMSE: 38059.47478993422
Test RMSE: 31754.704136098735
Baseline RMSE: 78375.26238032707


In [16]:
# Ridge Regression
ridge = RidgeCV(alphas = np.logspace(0, 5, 100), cv=5)
ridge.fit(X_train_sc, y_train)
ridge_train_pred = ridge.predict(X_train_sc)
ridge_test_pred = ridge.predict(X_test_sc)

print(f'Train Ridge R2 score:{ridge.score(X_train_sc, y_train)}')
print(f'Test Ridge R2 score:{ridge.score(X_test_sc, y_test)}')

print(f'Training Data Ridge RMSE: {mean_squared_error(y_train, ridge_train_pred, squared=False)}')
print(f'Testing Data Ridge RMSE: {mean_squared_error(y_test, ridge_test_pred, squared=False)}')

Train Ridge R2 score:0.7703110519914026
Test Ridge R2 score:0.8352961557983822
Training Data Ridge RMSE: 38113.9396748693
Testing Data Ridge RMSE: 31800.71817703508


In [17]:
# Lasso Regression
lasso = LassoCV(alphas = np.logspace(0, 2, 30), cv=5)
lasso.fit(X_train_sc, y_train)
lasso_train_pred = lasso.predict(X_train_sc)
lasso_test_pred = lasso.predict(X_test_sc)


print(f'Train Lasso R2 score:{lasso.score(X_train_sc, y_train)}')
print(f'Test Lasso R2 score:{lasso.score(X_test_sc, y_test)}')

print(f'Training Data Lasso RMSE: {mean_squared_error(y_train, lasso_train_pred, squared=False)}')
print(f'Testing Data Lasso RMSE: {mean_squared_error(y_test, lasso_test_pred, squared=False)}')


Train Lasso R2 score:0.7709532461838564
Test Lasso R2 score:0.8354644358792491
Training Data Lasso RMSE: 38060.62042820512
Testing Data Lasso RMSE: 31784.468419608154


## Conclusion

*Out of the 3 models, regular, ridge, and lasso, the Ridge method seemed to perform the best. The RMSE and R2 scores of the Lasso and initial model are about the same, but not as good as the Ridge model. In general, our model performed better than the baseline model, but is not anywhere near the perfect model that we are striving for. Our test_score is not very close to our train_score which suggests that our model is vastly overfit and does not generalize well to new data. Therefore, until we improve upon these aspects, we cannot expect a good prediction for any given input for this model.*

From our modeling process, we found that the best predictors for price are the total sq-ft and garage size / features. Surprisingly, features such as the age of the home and amenities like the fireplace and quality of finishes did not play a large role. This suggests that some of the most valuable things for a prospective buyer is the functionality of the house rather than the aesthetic factor or comfort level.