# Modeling

Several linear regression models were tested using different variables to determine which factors have the biggest role in determining the price of a home. Lasso and ridge regularization methods are used to simplify the models.

In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [147]:
# read in the clean test and train datasets
df_train = pd.read_csv('../data/train_clean.csv')
df_train.set_index('Id', inplace=True)

df_test = pd.read_csv('../data/test_clean.csv')
df_test.set_index('Id', inplace=True)

In [148]:
# check the shape of df_train
df_train.shape

(2049, 79)

In [149]:
# check the shape of df_test
df_test.shape

(878, 77)

In [150]:
# Instantiate our linear regression object: 
lr = LinearRegression()

## Model 1: Classic Linear Regression

In [185]:
# Create X and y for train and test datasets
# This model used singular features and polynomial features

features2 = ['overall qual gr liv area',
            'overall qual garage area',
            'gr liv area garage area',
            'gr liv area',
            'overall qual', 
             'garage cars', 
             'total bsmt sf', 
             '1st flr sf', 
             'bsmt qual', 
             'year built', 'year remod/add', 'full bath', 'totrms abvgrd', 'mas vnr area',
             'fireplaces', 'heating qc', 'bsmtfin sf 1', 'open porch sf', 'wood deck sf', 'lot area']

X_train2 = df_train[features2]
y_train2 = df_train['saleprice']

In [186]:
# Scale X_train2
ss = StandardScaler()
ss.fit(X_train2)
X_train2_sc = ss.fit_transform(X_train2)

In [187]:
X_train2_sc.shape

(2049, 20)

In [188]:
y_train2.shape

(2049,)

In [189]:
# Get cross val scores
cross_val_score(lr, X_train2_sc, y_train2, cv=5)

array([0.84410647, 0.8804128 , 0.75744251, 0.88234151, 0.77208004])

In [190]:
# Fit the model with training data
lr.fit(X_train2_sc, y_train2)

# Get the training score
lr.score(X_train2_sc, y_train2)

0.847106715851607

In [191]:
# Generate predictions for x_test_sc
pred_2 = lr.predict(X_train2_sc)
pred_2

array([176770.51944367, 239441.85852332, 119366.9795495 , ...,
       165227.0897411 , 119520.4779878 , 201326.92337135])

In [192]:
# Calculate RMSE 
np.sqrt(mean_squared_error(y_train2, pred_2))

30986.4361387324

**Explanation:** This model produces an RMSE score of 30,986 on the training data and 31,789 on the testing data. These scores are much improved from previous iterations of my model that took in fewer features.

RMSE represents the level of error in units of the predictor, so we want it to be low. In previous model iterations (see model #2) the training data set was performing better than the testing data set. This indicated the model wasn't generalizing well to new, unseen data and suffering from high variance. I attempted to address this with regularization, which would significantly reduce the variance of the model, without substantial increase in its bias. 

## Model 2: Classic Linear Regression

In [96]:
# Create X and y for train and test datasets
# this model only used singular features, no polynomial or other features

features = ['overall qual', 'exter qual', 'gr liv area', 'kitchen qual', 'garage area',
           'garage cars', 'total bsmt sf', '1st flr sf', 'bsmt qual', 'year built']

X_train = df_train[features]
y_train = df_train['saleprice']

In [97]:
# Scale X_train and then use it to transform X_train
ss.fit(X_train)
X_train_sc = ss.fit_transform(X_train)

In [98]:
X_train.shape

(2051, 10)

In [99]:
y_train.shape

(2051,)

In [101]:
# Get cross val scores
cross_val_score(lr, X_train_sc, y_train, cv=5)

array([0.82082659, 0.8484504 , 0.74483482, 0.84357336, 0.76325471])

In [102]:
# Fit the model with training data
lr.fit(X_train_sc, y_train)

# Get the training score
lr.score(X_train_sc, y_train)

0.8090747428098075

In [103]:
# Generate predictions for x_test_sc
pred_1 = lr.predict(X_train_sc)
pred_1

array([179730.90520274, 238789.38402226, 129556.05791921, ...,
       170327.10251186, 113724.23043415, 202156.19317004])

In [104]:
# Calculate RMSE 
np.sqrt(mean_squared_error(y_train, pred_1))

34623.62333804828

## Model 3: Ridge Regression

This model applies ridge regression to model #1.

In [193]:
# Instantiate.
ridge_model=Ridge(alpha=10)

In [194]:
# Set up a list of ridge alphas to check.
# Generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.
r_alphas = np.logspace(0, 5, 100)

# Cross-validate over our list of ridge alphas.
ridge_model = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)

# Fit model using best ridge alpha!
ridge_model = ridge_model.fit(X_train2_sc, y_train2)

In [195]:
# Here is the optimal value of alpha
ridge_model.alpha_

1.0

In [196]:
# Evaluate model using R2.
print(ridge_model.score(X_train2_sc, y_train2))

0.84708078832552


In [197]:
ridge_model.coef_

array([ 58085.70199595,  57094.92159054, -54664.13503405,   5850.41999794,
       -21031.75956748,   3186.71148543,   -530.81779733,   6042.59674559,
         5230.24395624,   4390.32206144,   4748.15963889,    262.3792611 ,
          587.02750398,   3739.16530564,   5156.41634916,   4416.82386488,
         7460.81460371,   -111.82514063,   2310.34594037,   5177.85011251])

## Model 4: Lasso

This model applies ridge regression to model #1.

In [198]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-3, 0, 100)

# Cross-validate over our list of Lasso alphas.
lasso_model = LassoCV(alphas=l_alphas, cv=5, max_iter=5000)

# Fit model using best Lasso alpha
lasso_model = lasso_model.fit(X_train2_sc, y_train2)

In [199]:
# Here is the optimal value of alpha
lasso_model.alpha_

1.0

In [200]:
print(lasso_model.score(X_train2_sc, y_train2))

0.8471066705137267


In [201]:
lasso_model.coef_

array([ 60045.95244198,  58511.87752413, -56460.54723518,   5534.10728502,
       -22440.30811877,   3259.202792  ,   -511.59092847,   5996.73063901,
         5191.23287338,   4421.3798131 ,   4734.97553988,    272.10278336,
          502.06981351,   3684.20906427,   5172.80204361,   4417.95021099,
         7476.03335328,   -116.60544614,   2317.03345153,   5193.66835051])

# Kaggle Submission #1: Model 2

In [105]:
# Create a dataframe of our features from our testing data
X_kaggle = df_test[features]

In [106]:
# Apply standard scalar to kaggle test data
X_test_Kaggle_sc = ss.transform(X_kaggle)

In [107]:
# Make predictions and save those predictions to a new column
X_kaggle['saleprice'] = lr.predict(X_test_Kaggle_sc)
X_kaggle.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,overall qual,exter qual,gr liv area,kitchen qual,garage area,garage cars,total bsmt sf,1st flr sf,bsmt qual,year built,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2658,6,3,1928,2,440,1,1020,908,2,1910,138876.358784
2718,5,3,1967,3,580,2,1967,1967,4,1977,207404.624336
2414,7,4,1496,4,426,2,654,664,4,2006,201680.618718
1989,5,4,968,3,480,2,968,968,3,1923,132298.606142
625,6,3,1394,3,514,2,1394,1394,4,1963,175224.888234


In [108]:
# Create a new dataframe of JUST our predictions
output = X_kaggle[['saleprice']]
output.head()

Unnamed: 0_level_0,saleprice
Id,Unnamed: 1_level_1
2658,138876.358784
2718,207404.624336
2414,201680.618718
1989,132298.606142
625,175224.888234


In [109]:
# Saving our predictions to our datasets folder
output.to_csv("../data/third_submission.csv")

# Kaggle Submission #2: Model 1

In [202]:
# Create a dataframe of our features from our testing data
X_kaggle2 = df_test[features2]

In [203]:
X_kaggle2.head()

Unnamed: 0_level_0,overall qual gr liv area,overall qual garage area,gr liv area garage area,gr liv area,overall qual,garage cars,total bsmt sf,1st flr sf,bsmt qual,year built,year remod/add,full bath,totrms abvgrd,mas vnr area,fireplaces,heating qc,bsmtfin sf 1,open porch sf,wood deck sf,lot area
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2658,11568.0,2640.0,848320.0,1928,6,1,1020,908,2,1910,1950,2,9,0.0,0,4,0,60,0,9142
2718,9835.0,2900.0,1140860.0,1967,5,2,1967,1967,4,1977,1977,2,10,0.0,0,3,0,0,170,9662
2414,10472.0,2982.0,637296.0,1496,7,2,654,664,4,2006,2006,2,7,0.0,1,5,554,24,100,17104
1989,4840.0,2400.0,464640.0,968,5,2,968,968,3,1923,2006,1,5,0.0,0,3,0,0,0,8520
625,8364.0,3084.0,716516.0,1394,6,2,1394,1394,4,1963,1963,1,6,247.0,2,4,609,76,0,9500


In [204]:
# Apply standard scalar to kaggle test data
X_kaggle2_sc = ss.transform(X_kaggle2)

In [205]:
# Make predictions and save those predictions to a new column
X_kaggle2['saleprice'] = lr.predict(X_kaggle2_sc)
X_kaggle2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,overall qual gr liv area,overall qual garage area,gr liv area garage area,gr liv area,overall qual,garage cars,total bsmt sf,1st flr sf,bsmt qual,year built,...,full bath,totrms abvgrd,mas vnr area,fireplaces,heating qc,bsmtfin sf 1,open porch sf,wood deck sf,lot area,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,11568.0,2640.0,848320.0,1928,6,1,1020,908,2,1910,...,2,9,0.0,0,4,0,60,0,9142,141639.412659
2718,9835.0,2900.0,1140860.0,1967,5,2,1967,1967,4,1977,...,2,10,0.0,0,3,0,0,170,9662,161028.17883
2414,10472.0,2982.0,637296.0,1496,7,2,654,664,4,2006,...,2,7,0.0,1,5,554,24,100,17104,208122.76547
1989,4840.0,2400.0,464640.0,968,5,2,968,968,3,1923,...,1,5,0.0,0,3,0,0,0,8520,115531.096444
625,8364.0,3084.0,716516.0,1394,6,2,1394,1394,4,1963,...,1,6,247.0,2,4,609,76,0,9500,187536.623641


In [206]:
# Create a new dataframe of JUST our predictions
output2 = X_kaggle2[['saleprice']]
output2.head()

Unnamed: 0_level_0,saleprice
Id,Unnamed: 1_level_1
2658,141639.412659
2718,161028.17883
2414,208122.76547
1989,115531.096444
625,187536.623641


In [207]:
# Saving our predictions to our datasets folder
output2.to_csv("../data/second_submission.csv")