In [34]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import datetime

## Pre-Processing
* Are categorical variables one-hot encoded? *GET DUMMY*
* Does the student investigate or manufacture features with linear relationships to the target? *FEATURE ENGINEERING*
* Have the data been scaled appropriately?    *STANDARDSCALER*
* Does the student properly split and/or sample the data for validation/training purposes?
* Does the student utilize feature selection to remove noisy or multi-collinear features?
* Does the student test and evaluate a variety of models to identify a production algorithm (AT MINIMUM: linear regression, lasso, and ridge)?
* Does the student defend their choice of production model relevant to the data at hand and the problem?
* Does the student explain how the model works and evaluate its performance successes/downfalls?

In [2]:
# read in cleaned data

train_clean = pd.read_csv('../datasets/train_cleaned.csv')
test_clean = pd.read_csv('../datasets/test_cleaned.csv')

## Feature Engineering

In [3]:
train_clean['total_sqft'] = train_clean['Total Bsmt SF'] + train_clean['1st Flr SF'] + train_clean['2nd Flr SF']


# find year, regardless of when this notebook is run
today = datetime.datetime.now()

# age = current year - year built
train_clean['house_age'] =  today.year - train_clean['Year Built']

train_clean.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice,total_sqft,house_age
0,109,533352170.0,60,RL,0.0,13517.0,Pave,IR1,Lvl,AllPub,...,0.0,0.0,0.0,0.0,3.0,2010.0,WD,130500.0,2204.0,47.0
1,544,531379050.0,60,RL,43.0,11492.0,Pave,IR1,Lvl,AllPub,...,0.0,0.0,0.0,0.0,4.0,2009.0,WD,220000.0,3035.0,27.0
2,153,535304180.0,20,RL,68.0,7922.0,Pave,Reg,Lvl,AllPub,...,0.0,0.0,0.0,0.0,1.0,2010.0,WD,109000.0,2114.0,70.0
3,318,916386060.0,60,RL,73.0,9802.0,Pave,Reg,Lvl,AllPub,...,0.0,0.0,0.0,0.0,4.0,2010.0,WD,174000.0,1828.0,17.0
4,255,906425045.0,50,RL,82.0,14235.0,Pave,IR1,Lvl,AllPub,...,0.0,0.0,0.0,0.0,3.0,2010.0,WD,138500.0,2121.0,123.0


In [4]:
train_categorical = train_clean.select_dtypes(include='object').columns

In [5]:
## Encoding Categorical Variables

train_dummy = pd.get_dummies(data = train_clean, columns = train_categorical, drop_first=True)
train_dummy.head()

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Paved Drive_P,Paved Drive_Y,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,533352170.0,60,0.0,13517.0,6.0,8.0,1976.0,2005.0,289.0,...,0,1,0,0,0,0,0,0,0,1
1,544,531379050.0,60,43.0,11492.0,7.0,5.0,1996.0,1997.0,132.0,...,0,1,0,0,0,0,0,0,0,1
2,153,535304180.0,20,68.0,7922.0,5.0,7.0,1953.0,2007.0,0.0,...,0,1,0,0,0,0,0,0,0,1
3,318,916386060.0,60,73.0,9802.0,5.0,5.0,2006.0,2007.0,0.0,...,0,1,0,0,0,0,0,0,0,1
4,255,906425045.0,50,82.0,14235.0,6.0,8.0,1900.0,1993.0,0.0,...,0,0,0,0,0,0,0,0,0,1


## Modeling 

In [6]:
# split data set into features and target 
X = train_dummy.drop('SalePrice', axis=1)
y = train_dummy['SalePrice']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [25]:
# Scale data appropriately
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [26]:
lr = LinearRegression()

In [27]:
# cross validation
cross_val_score(lr, X_train, y_train).mean()

0.6894728058791808

In [28]:
# create a baseline
baseline_preds = np.full_like(y_test, y_train.mean())
print(f'Baseline R2 Score: {r2_score(y_test, baseline_preds)}')

Baseline R2 Score: -0.00043273813883448753


In [29]:
lr.fit(X_train_sc, y_train)

In [30]:
print(f'Train R2 Score: {lr.score(X_train_sc, y_train)}')
print(f'Test R2 Score: {lr.score(X_test_sc, y_test)}')

Train R2 Score: 0.9299398982557906
Test R2 Score: -3.883690071695184e+21


In [41]:
test_preds= lr.predict(X_test_sc)

In [39]:
# Calculating RMSE
print(f'Train RMSE: {mean_squared_error(y_train, test_preds, squared=False)}')
print(f'Test RMSE: {mean_squared_error(y_test, y_train_preds, squared=False)}')
print(f'Baseline RMSE: {mean_squared_error(y_test, baseline_preds, squared=False)}')

ValueError: Found input variables with inconsistent numbers of samples: [1538, 513]

In [37]:
# Ridge Regression
ridge = RidgeCV(alphas=5,cv=5)
ridge.fit(X_train_sc, y_train)
ridge_train_pred = ridge.predict(X_train_sc)
ridge_test_pred = ridge.predict(X_test_sc)


print(f'Train Ridge R2 score:{ridge.score(X_train_sc, y_train)}')
print(f'Test Ridge R2 score:{ridge.score(X_test_sc, y_test)}')

print(f'Training Data Ridge RMSE: {mean_squared_error(y_train, ridge_train_pred, squared=False)}')
print(f'Testing Data Ridge RMSE: {mean_squared_error(y_test, ridge_test_pred, squared=False)}')


TypeError: len() of unsized object

In [None]:
# Lasso Regression
lasso = LassoCV(alphas = 5)
lasso.fit(X_train_sc, y_train)
lasso_train_pred = lasso.predict(X_train_sc)
lasso_test_pred = lasso.predict(X_test_sc)


print(f'Train Lasso R2 score:{lasso_cv.score(X_train_sc, y_train)}')
print(f'Test Lasso R2 score:{lasso_cv.score(X_test_sc, y_test)}')

print(f'Training Data Lasso RMSE: {mean_squared_error(y_train, lasso_train_pred, squared=False)}')
print(f'Testing Data Lasso RMSE: {mean_squared_error(y_test, lasso_test_pred, squared=False)}')


## Conclusion

*Out of the 3 models, regular, ridge, and lasso, the Ridge method seemed to perform the best. The RMSE and R2 scores of the Lasso and initial model are about the same, but not as good as the Ridge model. In general, our model performed better than the baseline model, but is not anywhere near the perfect model that we are striving for. Our test_score is not very close to our train_score which suggests that our model is vastly overfit and does not generalize well to new data. Therefore, until we improve upon these aspects, we cannot expect a good prediction for any given input for this model.*

From our modeling process, we found that the best predictors for price are the total sq-ft and garage size / features. Surprisingly, features such as the age of the home and amenities like the fireplace and quality of finishes did not play a large role. This suggests that some of the most valuable things for a prospective buyer is the functionality of the house rather than the aesthetic factor or comfort level.