# Preprocessing and Modeling

**Import Packages**

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error

**Read in Clean Data**

In [74]:
df_train = pd.read_csv('../data/train_clean.csv')
df_train.set_index('Id', inplace=True)
df_test = pd.read_csv('../data/test_clean.csv')
df_test.set_index('Id', inplace=True)

In [75]:
# check the shape of df_train
df_train.shape

(2051, 74)

In [76]:
# check the shape of df_test
df_test.shape

(878, 73)

## Model 1: Linear Regression

In [77]:
df_train.head()

Unnamed: 0_level_0,pid,ms subclass,ms zoning,lot frontage,lot area,street,lot shape,land contour,utilities,lot config,...,paved drive,wood deck sf,open porch sf,enclosed porch,3ssn porch,screen porch,mo sold,yr sold,sale type,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,60,RL,0.0,13517,Pave,IR1,Lvl,AllPub,CulDSac,...,Y,0,44,0,0,0,3,2010,WD,130500
544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,CulDSac,...,Y,0,74,0,0,0,4,2009,WD,220000
153,535304180,20,RL,68.0,7922,Pave,Reg,Lvl,AllPub,Inside,...,Y,0,52,0,0,0,1,2010,WD,109000
318,916386060,60,RL,73.0,9802,Pave,Reg,Lvl,AllPub,Inside,...,Y,100,0,0,0,0,4,2010,WD,174000
255,906425045,50,RL,82.0,14235,Pave,IR1,Lvl,AllPub,Inside,...,N,0,59,0,0,0,3,2010,WD,138500


In [78]:
# Create X and y for train and test datasets

features = ['overall qual', 'gr liv area', 'garage area']

X_train = df_train[features]
y_train = df_train['saleprice']

In [79]:
# Scale X_train and then use it to transform X_train
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.fit_transform(X_train)

In [80]:
X_train.shape

(2051, 3)

In [81]:
y.shape

(2051,)

In [82]:
# Instantiate our linear regression object: 
lr = LinearRegression()

In [83]:
# Get cross val scores
cross_val_score(lr, X_train_sc, y, cv=5)

array([0.76248544, 0.78530582, 0.72026629, 0.79280991, 0.72505863])

In [88]:
# Fit the model with training data
lr.fit(X_train_sc, y_train)

# Get the training score
lr.score(X_train_sc, y_train)

0.7573468436733898

In [90]:
# Generate predictions for x_test_sc
pred_1 = lr.predict(X_train_sc)

In [91]:
pred_1

array([177457.62902739, 243255.71030163, 110883.60030228, ...,
       187913.05998768,  93455.37234528, 218666.00397533])

In [105]:
# Calculate RMSE 
np.sqrt(mean_squared_error(y, pred_1))

39033.16745832718

**Explanation:** This model produces an RMSE score of 39,033.17 on the training data and 42,733.83 on the testing data. RMSE represents the level of error in units of the predictor, so we want it to be low. In this case, the training dataset is performing better than the test. This indicates the model doesn't generalize well to new, unseen data. How to address this? Regularization!

In [None]:
#Is there a pattern and if so, what is it? The model looks like it predicts quite well for a certain range of sale prices and the errors look homogenous. However, we can also see the error is getting larger as the model tries to make larger predictions (over $250,000).

## Model 2: Linear Regression with Lasso

## Model 3: Linear Regression with Ridge

# Kaggle Submission #1

In [92]:
features = ['overall qual', 'gr liv area', 'garage area']

In [97]:
# Create a dataframe of our features from our testing data
X_kaggle = df_test[features]

In [94]:
# Apply standard scalar to kaggle test data
X_test_Kaggle_sc = ss.transform(X_kaggle)

In [99]:
# Make predictions and save those predictions to a new column
X_kaggle['saleprice'] = lr.predict(X_test_Kaggle_sc)
X_kaggle.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,overall qual,gr liv area,garage area,saleprice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2658,6,1928,440,196373.41417
2718,5,1967,580,181186.324302
2414,7,1496,426,202532.061135
1989,5,968,480,125058.453434
625,6,1394,514,176432.30695


In [100]:
# Create a new dataframe of JUST our predictions
output = X_kaggle[['saleprice']]
output.head()

Unnamed: 0_level_0,saleprice
Id,Unnamed: 1_level_1
2658,196373.41417
2718,181186.324302
2414,202532.061135
1989,125058.453434
625,176432.30695


In [101]:
# Saving our predictions to our datasets folder
output.to_csv("../data/first_submission.csv")