# Preprocessing and Modeling

**Import Packages**

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

**Read in Clean Data**

In [4]:
# Load the cleaned train/test CSVs and key each frame by its 'Id' column
df_train = pd.read_csv('../data/train_clean.csv').set_index('Id')
df_test = pd.read_csv('../data/test_clean.csv').set_index('Id')

In [5]:
# check the shape of df_train
df_train.shape

(2051, 74)

In [6]:
# check the shape of df_test
df_test.shape

(878, 73)

## Model 1: Linear Regression

In [7]:
df_train.head()

Unnamed: 0_level_0,pid,ms subclass,ms zoning,lot frontage,lot area,street,lot shape,land contour,utilities,lot config,...,paved drive,wood deck sf,open porch sf,enclosed porch,3ssn porch,screen porch,mo sold,yr sold,sale type,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,60,RL,0.0,13517,Pave,IR1,Lvl,AllPub,CulDSac,...,Y,0,44,0,0,0,3,2010,WD,130500
544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,CulDSac,...,Y,0,74,0,0,0,4,2009,WD,220000
153,535304180,20,RL,68.0,7922,Pave,Reg,Lvl,AllPub,Inside,...,Y,0,52,0,0,0,1,2010,WD,109000
318,916386060,60,RL,73.0,9802,Pave,Reg,Lvl,AllPub,Inside,...,Y,100,0,0,0,0,4,2010,WD,174000
255,906425045,50,RL,82.0,14235,Pave,IR1,Lvl,AllPub,Inside,...,N,0,59,0,0,0,3,2010,WD,138500


In [8]:
# Build the Model 1 training inputs (X) and target (y) from the training frame
features = [
    'overall qual',
    'gr liv area',
    'garage area',
]

X_train, y_train = df_train[features], df_train['saleprice']

In [9]:
# Standardize the Model 1 features.
# fit_transform both fits the scaler on X_train and returns the scaled values,
# so a separate ss.fit(X_train) beforehand is redundant.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)

In [10]:
X_train.shape

(2051, 3)

In [13]:
y_train.shape

(2051,)

In [14]:
# Instantiate our linear regression object: 
lr = LinearRegression()

In [15]:
# Get cross val scores
cross_val_score(lr, X_train_sc, y_train, cv=5)

array([0.76248544, 0.78530582, 0.72026629, 0.79280991, 0.72505863])

In [16]:
# Fit on the scaled training features and report the in-sample score
# (fit returns the estimator itself, so the two steps can be chained)
lr.fit(X_train_sc, y_train).score(X_train_sc, y_train)

0.7573468436733898

In [17]:
# Generate in-sample predictions on the scaled TRAINING features (X_train_sc);
# no held-out test matrix is scored in this cell.
pred_1 = lr.predict(X_train_sc)
pred_1

array([177457.62902739, 243255.71030163, 110883.60030228, ...,
       187913.05998768,  93455.37234528, 218666.00397533])

In [19]:
# Calculate RMSE 
np.sqrt(mean_squared_error(y_train, pred_1))

39033.16745832718

**Explanation:** This model produces an RMSE of 39,033.17 on the training data and 42,733.83 on the testing data. RMSE expresses the typical prediction error in the units of the target (here, sale price in dollars), so we want it to be low. In this case, the model performs better on the training data than on the test data. This indicates the model doesn't generalize well to new, unseen data. How to address this? Regularization!

In [20]:
#Is there a pattern and if so, what is it? The model looks like it predicts quite well for a 
#certain range of sale prices and the errors look homogenous. However, we can also see the error is 
#getting larger as the model tries to make larger predictions (over $250,000).

## Model 2: Linear Regression with Feature Engineering

In [22]:
poly_train2 = pd.read_csv('../data/train_poly.csv')
poly_train2.set_index('Id', inplace=True)

In [23]:
poly_train2.head()

Unnamed: 0_level_0,overall qual,overall cond,gr liv area,garage area,kitchen abvgr,overall qual^2,overall qual overall cond,overall qual gr liv area,overall qual garage area,overall qual kitchen abvgr,...,overall cond gr liv area,overall cond garage area,overall cond kitchen abvgr,gr liv area^2,gr liv area garage area,gr liv area kitchen abvgr,garage area^2,garage area kitchen abvgr,kitchen abvgr^2,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,6.0,8.0,1479.0,475.0,1.0,36.0,48.0,8874.0,2850.0,6.0,...,11832.0,3800.0,8.0,2187441.0,702525.0,1479.0,225625.0,475.0,1.0,130500
544,7.0,5.0,2122.0,559.0,1.0,49.0,35.0,14854.0,3913.0,7.0,...,10610.0,2795.0,5.0,4502884.0,1186198.0,2122.0,312481.0,559.0,1.0,220000
153,5.0,7.0,1057.0,246.0,1.0,25.0,35.0,5285.0,1230.0,5.0,...,7399.0,1722.0,7.0,1117249.0,260022.0,1057.0,60516.0,246.0,1.0,109000
318,5.0,5.0,1444.0,400.0,1.0,25.0,25.0,7220.0,2000.0,5.0,...,7220.0,2000.0,5.0,2085136.0,577600.0,1444.0,160000.0,400.0,1.0,174000
255,6.0,8.0,1445.0,484.0,1.0,36.0,48.0,8670.0,2904.0,6.0,...,11560.0,3872.0,8.0,2088025.0,699380.0,1445.0,234256.0,484.0,1.0,138500


In [24]:
# Build the Model 2 training inputs (X) and target (y) from the engineered
# polynomial/interaction columns
features2 = [
    'overall qual gr liv area',
    'overall qual garage area',
    'gr liv area garage area',
    'gr liv area',
    'overall cond kitchen abvgr',
]

X_train2, y_train2 = poly_train2[features2], poly_train2['saleprice']

In [25]:
# Scale X_train2.
# fit_transform already fits the scaler, so the separate ss.fit call was redundant.
# NOTE: this re-fits the shared scaler `ss`; from here on it holds the statistics
# of the five Model 2 columns, not the three Model 1 features.
X_train2_sc = ss.fit_transform(X_train2)

In [28]:
X_train2_sc.shape

(2051, 5)

In [29]:
y_train2.shape

(2051,)

In [30]:
# Get cross val scores
cross_val_score(lr, X_train2_sc, y_train2, cv=5)

array([0.79378316, 0.82614526, 0.73844348, 0.82316551, 0.75520997])

In [31]:
# Re-fit the shared `lr` on the scaled Model 2 features and report the
# in-sample score (fit returns the estimator itself, so the steps chain)
lr.fit(X_train2_sc, y_train2).score(X_train2_sc, y_train2)

0.7942875220037198

In [32]:
# Generate in-sample predictions on the scaled TRAINING features (X_train2_sc);
# no held-out test matrix is scored in this cell.
pred_2 = lr.predict(X_train2_sc)
pred_2

array([171001.11701607, 240759.6977169 , 110557.24484791, ...,
       185860.53263189, 103830.43068663, 215082.42027841])

In [33]:
# Calculate RMSE 
np.sqrt(mean_squared_error(y_train2, pred_2))

35939.42630250821

**Evaluation**:

## Model 3: add Ridge Regression

In [85]:
# Will be using the fit AND transformed Standard Scaler data from model #2
# rename for clarity
overfit_train2_sc = X_train2_sc

In [86]:
# Instantiate.
ridge_model=Ridge(alpha=10)

In [87]:
# Fit.
ridge_model.fit(overfit_train2_sc, y_train2)

# Evaluate model using R2.
print(ridge_model.score(overfit_train2_sc, y_train2))

0.794029327671238


In [89]:
# Candidate ridge penalties: 100 exponents evenly spaced from 0 to 5,
# i.e. alphas running from 10^0 up to 10^5.
r_alphas = np.logspace(start=0, stop=5, num=100)

# Cross-validate (5 folds, scored by R2) over the candidate alphas and
# fit the model using the best ridge alpha.
ridge_model = RidgeCV(alphas=r_alphas, scoring='r2', cv=5).fit(overfit_train2_sc, y_train2)

In [90]:
# Here is the optimal value of alpha
ridge_model.alpha_

4.0370172585965545

In [91]:
# Evaluate model using R2.
print(ridge_model.score(overfit_train2_sc, y_train2))

0.7942401989673435


In [96]:
ridge_model.coef_

array([ 42021.21092814,  57024.77300953, -37748.8929573 ,  13675.43127624,
        -1112.59267767])

**Evaluation**:

## Model 4: add Lasso

In [93]:
# Candidate Lasso penalties: 100 log-spaced alphas from 10^-3 to 10^0.
# NOTE(review): the alpha_ selected from this grid (0.001) is its smallest
# value, so the search does not bracket the optimum — consider revisiting the range.
l_alphas = np.logspace(start=-3, stop=0, num=100)

# Cross-validate (5 folds) over the candidate alphas and fit the model
# using the best Lasso alpha.
lasso_model = LassoCV(alphas=l_alphas, cv=5, max_iter=5000).fit(overfit_train2_sc, y_train2)

In [94]:
# Here is the optimal value of alpha
lasso_model.alpha_

0.001

In [95]:
print(lasso_model.score(overfit_train2_sc, y_train2))

0.7942875220037101


In [97]:
lasso_model.coef_

array([ 41630.18476838,  58475.36731829, -39614.16802851,  14720.32982288,
        -1083.05244728])

# Kaggle Submission #1

In [92]:
features = ['overall qual', 'gr liv area', 'garage area']

In [73]:
# Create a dataframe of our features from our testing data.
# Take an explicit copy: a later cell adds a 'saleprice' column to this frame,
# and assigning into a bare slice of df_test raises SettingWithCopyWarning.
X_kaggle = df_test[features].copy()

Unnamed: 0_level_0,overall qual,gr liv area,garage area
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2658,6,1928,440
2718,5,1967,580
2414,7,1496,426
1989,5,968,480
625,6,1394,514


In [94]:
# Scale the Kaggle test features with a scaler fitted on Model 1's training
# features. The shared `ss` cannot be reused here: it was re-fitted on the five
# engineered Model 2 columns, so on a top-to-bottom run it no longer matches
# these three columns.
kaggle1_scaler = StandardScaler().fit(X_train)

# Select `features` explicitly so this cell can be re-run after a
# 'saleprice' column has been added to X_kaggle.
X_test_Kaggle_sc = kaggle1_scaler.transform(X_kaggle[features])

In [99]:
# Predict with a model fitted on Model 1's three scaled training features.
# The shared `lr` was last fitted on the five Model 2 columns, so on a
# top-to-bottom run it cannot score this 3-column matrix.
kaggle1_model = LinearRegression().fit(X_train_sc, y_train)

# Save the predictions to a new column
X_kaggle['saleprice'] = kaggle1_model.predict(X_test_Kaggle_sc)
X_kaggle.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,overall qual,gr liv area,garage area,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2658,6,1928,440,196373.41417
2718,5,1967,580,181186.324302
2414,7,1496,426,202532.061135
1989,5,968,480,125058.453434
625,6,1394,514,176432.30695


In [100]:
# Create a new dataframe of JUST our predictions
output = X_kaggle[['saleprice']]
output.head()

Unnamed: 0_level_0,saleprice
Id,Unnamed: 1_level_1
2658,196373.41417
2718,181186.324302
2414,202532.061135
1989,125058.453434
625,176432.30695


In [101]:
# Saving our predictions to our datasets folder
output.to_csv("../data/first_submission.csv")

# Kaggle Submission #2

In [65]:
# Raw columns that are expanded with PolynomialFeatures for the second submission.
# NOTE(review): this rebinds `features2`, which earlier in the notebook held the
# engineered Model 2 column names — consider a distinct name to avoid confusion on re-runs.
features2 = ['overall qual', 'overall cond', 'gr liv area', 'garage area', 'kitchen abvgr']

In [66]:
# Create a dataframe of our features from our testing data
X_kaggle2 = df_test[features2]

In [67]:
X_kaggle2.head()

Unnamed: 0_level_0,overall qual,overall cond,gr liv area,garage area,kitchen abvgr
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2658,6,8,1928,440,2
2718,5,4,1967,580,2
2414,7,5,1496,426,1
1989,5,6,968,480,1
625,6,5,1394,514,1


In [68]:
# Instantiate PolynomialFeatures object to create all two-way terms.
polynomial_features = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [69]:
# Fit and transform our X data.
poly_kaggle2 = polynomial_features.fit_transform(X_kaggle2)

In [70]:
poly_kaggle2.shape

(878, 20)

In [71]:
# Transform into a dataframe.
# Newer scikit-learn releases replaced get_feature_names with
# get_feature_names_out; use whichever this installation provides so the cell
# runs on both old and new versions.
poly_name_getter = (
    getattr(polynomial_features, 'get_feature_names_out', None)
    or polynomial_features.get_feature_names
)
poly_kaggle_df = pd.DataFrame(
    poly_kaggle2,
    columns=poly_name_getter(X_kaggle2.columns),
    index=X_kaggle2.index,
)
poly_kaggle_df.head()

Unnamed: 0_level_0,overall qual,overall cond,gr liv area,garage area,kitchen abvgr,overall qual^2,overall qual overall cond,overall qual gr liv area,overall qual garage area,overall qual kitchen abvgr,overall cond^2,overall cond gr liv area,overall cond garage area,overall cond kitchen abvgr,gr liv area^2,gr liv area garage area,gr liv area kitchen abvgr,garage area^2,garage area kitchen abvgr,kitchen abvgr^2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2658,6.0,8.0,1928.0,440.0,2.0,36.0,48.0,11568.0,2640.0,12.0,64.0,15424.0,3520.0,16.0,3717184.0,848320.0,3856.0,193600.0,880.0,4.0
2718,5.0,4.0,1967.0,580.0,2.0,25.0,20.0,9835.0,2900.0,10.0,16.0,7868.0,2320.0,8.0,3869089.0,1140860.0,3934.0,336400.0,1160.0,4.0
2414,7.0,5.0,1496.0,426.0,1.0,49.0,35.0,10472.0,2982.0,7.0,25.0,7480.0,2130.0,5.0,2238016.0,637296.0,1496.0,181476.0,426.0,1.0
1989,5.0,6.0,968.0,480.0,1.0,25.0,30.0,4840.0,2400.0,5.0,36.0,5808.0,2880.0,6.0,937024.0,464640.0,968.0,230400.0,480.0,1.0
625,6.0,5.0,1394.0,514.0,1.0,36.0,30.0,8364.0,3084.0,6.0,25.0,6970.0,2570.0,5.0,1943236.0,716516.0,1394.0,264196.0,514.0,1.0


In [74]:
# Keep only the engineered columns used by Model 2 (drops every other column)
kaggle2_model_columns = [
    'overall qual gr liv area',
    'overall qual garage area',
    'gr liv area garage area',
    'gr liv area',
    'overall cond kitchen abvgr',
]
poly_kaggle_df = poly_kaggle_df[kaggle2_model_columns]

In [75]:
poly_kaggle_df.head()

Unnamed: 0_level_0,overall qual gr liv area,overall qual garage area,gr liv area garage area,gr liv area,overall cond kitchen abvgr
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2658,11568.0,2640.0,848320.0,1928.0,16.0
2718,9835.0,2900.0,1140860.0,1967.0,8.0
2414,10472.0,2982.0,637296.0,1496.0,5.0
1989,4840.0,2400.0,464640.0,968.0,6.0
625,8364.0,3084.0,716516.0,1394.0,5.0


In [76]:
# Apply the standard scaler to the Kaggle test data.
# NOTE(review): relies on the shared `ss` having last been fitted on these same
# five engineered columns, in this order — confirm when re-running cells out of order.
X_poly_kaggle_sc = ss.transform(poly_kaggle_df)

In [77]:
# Make predictions and save those predictions to a new column
poly_kaggle_df['saleprice'] = lr.predict(X_poly_kaggle_sc)
poly_kaggle_df.head()

Unnamed: 0_level_0,overall qual gr liv area,overall qual garage area,gr liv area garage area,gr liv area,overall cond kitchen abvgr,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2658,11568.0,2640.0,848320.0,1928.0,16.0,184211.350585
2718,9835.0,2900.0,1140860.0,1967.0,8.0,163704.356951
2414,10472.0,2982.0,637296.0,1496.0,5.0,195843.107004
1989,4840.0,2400.0,464640.0,968.0,6.0,126523.130105
625,8364.0,3084.0,716516.0,1394.0,5.0,172612.627406


In [78]:
# Create a new dataframe of JUST our predictions
output2 = poly_kaggle_df[['saleprice']]
output2.head()

Unnamed: 0_level_0,saleprice
Id,Unnamed: 1_level_1
2658,184211.350585
2718,163704.356951
2414,195843.107004
1989,126523.130105
625,172612.627406


In [79]:
# Saving our predictions to our datasets folder
output2.to_csv("../data/second_submission.csv")