In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.linear_model import LinearRegression


from sklearn.preprocessing import scale 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

In [2]:
# Load the table saved at the end of the EDA step.
EDA_CSV_PATH = 'house_data_EDA.csv'
house_data = pd.read_csv(EDA_CSV_PATH)

In [3]:
# Peek at the first five rows to confirm the file loaded as expected.
house_data.head(n=5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,0,1,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,1,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,0,1,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,0,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
# We already created the dummy variables in the EDA section, so we will not repeat those steps here.

In [5]:
# Target variable: the 'SalePrice' column of the loaded table.
y = house_data.loc[:, 'SalePrice']

print(y)

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64


In [6]:
# Feature matrix: every column of the table except the target.
X = house_data.drop(columns=['SalePrice'])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Standardise the features to see how scaling affects the model score.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Linear regression fitted on the standardised training features.
lm = LinearRegression().fit(
    X=X_train_scaled,
    y=y_train,
)

In [10]:
# Predictions of the fitted model for the scaled training and test features.
y_train_pred, y_test_pred = (
    lm.predict(features) for features in (X_train_scaled, X_test_scaled)
)

In [12]:
# R^2 on the training split is quite high (the model possibly overfits the data).
# This is the same quantity that lm.score(X_train_scaled, y_train) reports.
r2_score(y_train, lm.predict(X_train_scaled))

0.9404825708609836

In [13]:
# The held-out split tells a different story: R^2 is hugely negative,
# i.e. the test predictions are wildly off.
# NOTE(review): a blow-up of this size may stem from collinear or
# (near-)constant dummy columns rather than from scaling itself - TODO confirm.
r2_score(y_test, lm.predict(X_test_scaled))

-1.529856193088125e+23

In [14]:
# We see that the model trained on StandardScaler-transformed features does not produce sensible results on the test set.
# We check to see whether the score improves without using a scaler in a cross-validation procedure.

In [15]:
# 5-fold cross-validation of the linear model on the unscaled features.
cv_scores_5 = cross_val_score(estimator=lm, X=X, y=y, cv=5)
mean_cv_score = np.mean(cv_scores_5)

print(cv_scores_5)

print("Average 5-Fold CV Score: {}".format(mean_cv_score))

[0.22265829 0.82160148 0.78891663 0.88467709 0.66450577]
Average 5-Fold CV Score: 0.6764718527575286


In [16]:
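# Indeed, the score values make much more sense without the scaler.
# Note, however, that these are 5-fold CV scores on the full data set, whereas the scaled model
# was scored on a single train/test split, so the two evaluations are not directly comparable.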

In [17]:
# Cross-check: chain the scaler and the regression in a single pipeline
# and see whether it gives the same results as the manual steps.
pipe = make_pipeline(StandardScaler(), LinearRegression())

In [18]:
# Fit the scaler and the regression in one go, on the raw training features.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [19]:
# Pipeline predictions for both splits (scaling happens inside the pipeline).
y_tr_pred, y_te_pred = pipe.predict(X_train), pipe.predict(X_test)

In [20]:
# (train R^2, test R^2) for the pipeline predictions.
tuple(
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_tr_pred), (y_test, y_te_pred))
)

(0.9404825708609836, -1.529856193088125e+23)

In [None]:
# Indeed, this is the case: the pipeline reproduces the scores obtained with the manual scaling above.
# In the final section we will resume our work without scaling our features.