In [54]:
# Importing necessary packages
import pandas as pd # python's data handling package
import numpy as np # python's scientific computing package
import matplotlib.pyplot as plt # python's plotting package
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [55]:
# Both features and target have already been scaled: mean = 0; SD = 1
data = pd.read_csv('Houseprice_data_scaled.csv') 
raw = pd.read_csv('original data.csv')

### Base Case: No Additional Features

In [56]:
# First 1800 data items are training set; the next 600 are the validation set
train = data.iloc[:1800] 
val = data.iloc[1800:2400]

In [57]:
# Creating the "X" and "y" variables. We drop sale price from "X"
X_train, X_val = train.drop('Sale Price', axis=1), val.drop('Sale Price', axis=1)
y_train, y_val = train[['Sale Price']], val[['Sale Price']] 

##### Ridge Regression

In [58]:
# The alpha used by Python's ridge should be the lambda in Hull's book times the number of observations
alphas=[0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800,0.1*1800,0.2*1800, 0.4*1800]
mses=[]
for alpha in alphas:
    ridge=Ridge(alpha=alpha)
    ridge.fit(X_train,y_train)
    pred=ridge.predict(X_val)
    mses.append(mse(y_val,pred))
    print(mse(y_val,pred))

0.11703284346091346
0.11710797319752994
0.11723952924901127
0.11741457158889525
0.11762384068711469
0.11825709631198024
0.11900057469147929
0.12254649996292954
0.13073599680747133


##### Lasso Regression

In [59]:
# We now consider different lambda values. The alphas are half the lambdas
alphas=[0.01/2, 0.02/2, 0.03/2, 0.04/2, 0.05/2, 0.075/2, 0.1/2]
mses=[]
for alpha in alphas:
    lasso=Lasso(alpha=alpha)
    lasso.fit(X_train,y_train)
    pred=lasso.predict(X_val)
    mses.append(mse(y_val,pred))
    print(mse(y_val, pred))

0.11654751909608793
0.11682687945311092
0.11803348353132027
0.12012836764958999
0.12301536903084047
0.13178576395045638
0.14017194584483775


In [60]:
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)
coeffs

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
intercept,-1.25303e-11
LotArea,0.0443042
OverallQual,0.298079
OverallCond,0.0
YearBuilt,0.0520907
YearRemodAdd,0.0644712
BsmtFinSF1,0.115875
BsmtUnfSF,-0.0
TotalBsmtSF,0.10312
1stFlrSF,0.0322946


### Step1. Add LotFrontage 

The missing values of LotFrontage are imputed. Three imputation methods are considered: median, KNN, and MICE

#### 1. Median Imputation  
If there are many missing values, we may end up building a biased model that lacks precision. This can lead to incorrect results if the missing values are not handled properly. As the summary statistics shown below, the standard deviation of LotFrontage is relatively large and there exists some outliers (max: 313 vs. min: 21). Imputation using median values can fill the missing values while avoiding any potential outlier effects.

In [34]:
raw['LotFrontage'].describe()

count    2422.000000
mean       69.083815
std        22.691072
min        21.000000
25%        59.000000
50%        68.000000
75%        80.000000
max       313.000000
Name: LotFrontage, dtype: float64

In [35]:
# Replace NA with median 
raw_m = raw['LotFrontage'].fillna(raw['LotFrontage'].median()) 

# Z-Scale
raw_m = (raw_m - raw_m.mean()) / raw_m.std()

In [36]:
# First 1800 data items are training set; the next 600 are the validation set
data['LotFrontage (median)'] = raw_m
train_m = data.iloc[:1800] 
val_m = data.iloc[1800:2400]

In [37]:
# Creating the "X" and "y" variables. We drop sale price from "X"
X_tm, X_vm = train_m.drop('Sale Price', axis=1), val_m.drop('Sale Price', axis=1)
y_tm, y_vm = train_m[['Sale Price']], val_m[['Sale Price']] 

##### Ridge Regression

In [38]:
# The alpha used by Python's ridge should be the lambda in Hull's book times the number of observations
alphas=[0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800,0.1*1800,0.2*1800, 0.4*1800]
mses=[]
for alpha in alphas:
    ridge=Ridge(alpha=alpha)
    ridge.fit(X_tm,y_tm)
    pred_m =ridge.predict(X_vm)
    mses.append(mse(y_vm,pred_m))
    print(mse(y_vm,pred_m))

0.1132204590229198
0.11330538984245145
0.11344725488406335
0.1136323267083885
0.11385103489053112
0.1145047726400233
0.11526451677314284
0.11884784889045473
0.12705432530764124


###### Lasso with different levels of alpha and its mse

In [39]:
# We now consider different lambda values. The alphas are half the lambdas
alphas=[0.01/2, 0.02/2, 0.03/2, 0.04/2, 0.05/2, 0.075/2, 0.1/2]
mses=[]
for alpha in alphas:
    lasso=Lasso(alpha=alpha)
    lasso.fit(X_tm,y_tm)
    pred_m=lasso.predict(X_vm)
    mses.append(mse(y_vm,pred_m))
    print(mse(y_vm, pred_m))

0.11277532082060478
0.11275024517969896
0.11401957020270513
0.11640507064421318
0.11964967018590243
0.1297212950846984
0.1387105562903177


#### 2. KNNImputer 
Imputation of Missing Value Using sci-kit learn Library (Multivaraite Approach). The missing values might be correlated with some other features such as Neighborhood and YearBuilt. We can impute the missing values by finding non-missing ones that are similar in terms of those features using the k-Nearest Neighbors approach where a Euclidean distance is used to find the nearest neighbors.

In [3]:
test = raw.copy()
NA = test[test['LotFrontage'].isnull()]
V = test[test['LotFrontage'].notnull()]
NA = NA.astype('category').apply(lambda x: x.cat.codes) 
V = V.astype('category').apply(lambda x: x.cat.codes) 
Diff = abs(V.mean()-NA.mean()).to_frame()
Diff = Diff[Diff[0]>2]
Large = Diff[2:-1]

In [4]:
frames = [V, NA]
result = pd.concat(frames)
Large['std']= result[Diff[2:].index].std().to_frame()
Large

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,std
LotArea,511.724762,448.983557
Neighborhood,2.394717,6.015492
YearBuilt,24.453706,30.291424
YearRemodAdd,2.056191,20.875788
Exterior1st,3.86512,3.448812
Exterior2nd,3.771576,3.757949
MasVnrArea,26.363593,95.220497
BsmtFinSF1,139.753346,256.935192
BsmtFinSF2,7.077873,37.196082
BsmtUnfSF,269.052716,289.524532


In [5]:
Diff[2:-1].index # excluding ID, LotFrontage, and SalesPrice

Index(['LotArea', 'Neighborhood', 'YearBuilt', 'YearRemodAdd', 'Exterior1st',
       'Exterior2nd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageYrBlt',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
       'ScreenPorch'],
      dtype='object')

In [15]:
x_knn = raw[Diff[2:-1].index]
x_knn = x_knn.astype('category').apply(lambda x: x.cat.codes)
x_knn['LotFrontage']=raw['LotFrontage']
x_knn.head()
b=x_knn.columns

In [16]:
scaler = StandardScaler()
scaler.fit(x_knn)

x_knn = pd.DataFrame(scaler.transform(x_knn),columns=b)


In [45]:
from sklearn.impute import KNNImputer
impute_knn = KNNImputer(n_neighbors=2, weights = 'uniform')
z = impute_knn.fit_transform(x_knn)
raw_knn = pd.DataFrame(z[:,20:],columns=['LotFrontage'])


In [46]:
# First 1800 data items are training set; the next 600 are the validation set
data['LotFrontage (knn)'] = raw_knn
data_KNN=data.copy().drop(columns=['LotFrontage (median)'])
train = data_KNN.iloc[:1800] 
val = data_KNN.iloc[1800:2400]

In [50]:
# Creating the "X" and "y" variables. We drop sale price from "X"
X_train, X_val = train.drop(['Sale Price'], axis=1), val.drop(['Sale Price'], axis=1)
y_train, y_val = train[['Sale Price']], val[['Sale Price']] 

###### Ridge Regression

In [51]:
# The alpha used by Python's ridge should be the lambda in Hull's book times the number of observations
alphas=[0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800,0.1*1800,0.2*1800, 0.4*1800]
mses=[]
for alpha in alphas:
    ridge=Ridge(alpha=alpha)
    ridge.fit(X_train,y_train)
    pred=ridge.predict(X_val)
    mses.append(mse(y_val,pred))
    print(mse(y_val,pred))

0.11409311915479867
0.1141674265595917
0.1143014941217504
0.11448074617414193
0.11469509322084899
0.11534223670388641
0.11609921220047241
0.11968606330460735
0.12791147040521872


In [53]:
# DataFrame with corresponding feature and its respective coefficients
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)
coeffs

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
intercept,-0.000557524
LotArea,0.0412715
OverallQual,0.298858
OverallCond,0.0
YearBuilt,0.0521873
YearRemodAdd,0.0656908
BsmtFinSF1,0.116139
BsmtUnfSF,-0.0
TotalBsmtSF,0.10346
1stFlrSF,0.0284239


###### Lasso with different levels of alpha and its mse

In [52]:
# We now consider different lambda values. The alphas are half the lambdas
alphas=[0.01/2, 0.02/2, 0.03/2, 0.04/2, 0.05/2, 0.075/2, 0.1/2]
mses=[]
for alpha in alphas:
    lasso=Lasso(alpha=alpha)
    lasso.fit(X_train,y_train)
    pred=lasso.predict(X_val)
    mses.append(mse(y_val,pred))
    print(mse(y_val, pred))

0.11377211331984985
0.11375829649921634
0.11499889040455556
0.11730909522593123
0.12045246966091486
0.13031370245908774
0.13900000256105585


### Multivariate Imputation via Chained Equations

In [176]:
##use the scale dataset
from sklearn import impute
from sklearn import experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import preprocessing

data_1 = pd.read_csv('Houseprice_data_scaled.csv') 
data_1['LotFrontage']=raw['LotFrontage'].copy()
data_new=data_1.drop(columns=['Sale Price'])
lr=LinearRegression()
imp = IterativeImputer(estimator=lr,max_iter=10, verbose=2)
a=imp.fit_transform(data_new)

b= pd.DataFrame(preprocessing.scale(a[:,47:]),columns=['LotFrontage_MICE'])
data_1['LotFrontage_MICE']=b
data_1=data_1.drop(columns=['LotFrontage'])


[IterativeImputer] Completing matrix with shape (2908, 48)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.19
[IterativeImputer] Change: 142.21984863281241, scaled tolerance: 0.313 
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.40
[IterativeImputer] Change: 0.0, scaled tolerance: 0.313 
[IterativeImputer] Early stopping criterion reached.


In [167]:
##use the orignal whole dataset (skip this DO NOT RUN THIS CODE)
from sklearn import impute
from sklearn import experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import preprocessing

raw_dummy=pd.get_dummies(raw).drop(columns=['SalePrice'])
lr=LinearRegression()
imp = IterativeImputer(estimator=lr,max_iter=10, verbose=2)
a=imp.fit_transform(raw_dummy)
b=raw_dummy.columns

LotFrontage=pd.DataFrame(a, columns=b)['LotFrontage']
data_1 = pd.read_csv('Houseprice_data_scaled.csv') 
data_1['LotFrontage_mice']=preprocessing.scale(LotFrontage)


[IterativeImputer] Completing matrix with shape (2908, 288)
[IterativeImputer] Ending imputation round 1/10, elapsed time 9.30
[IterativeImputer] Change: 353.9343001530891, scaled tolerance: 215.245 
[IterativeImputer] Ending imputation round 2/10, elapsed time 18.82
[IterativeImputer] Change: 12.318124223313362, scaled tolerance: 215.245 
[IterativeImputer] Early stopping criterion reached.


In [178]:
train_1 = data_1.iloc[:1800] 
val_1 = data_1.iloc[1800:2400]
X_train_1, X_val_1 = train_1.drop('Sale Price', axis=1), val_1.drop('Sale Price', axis=1)
y_train_1, y_val_1 = train_1[['Sale Price']], val_1[['Sale Price']] 

###### Ridge Regression

In [179]:
# The alpha used by Python's ridge should be the lambda in Hull's book times the number of observations
alphas=[0.01*1800, 0.02*1800, 0.03*1800, 0.04*1800, 0.05*1800, 0.075*1800,0.1*1800,0.2*1800, 0.4*1800]
mses=[]
for alpha in alphas:
    ridge=Ridge(alpha=alpha)
    ridge.fit(X_train_1,y_train_1)
    
    pred_1=ridge.predict(X_val_1)
    mses.append(mse(y_val_1,pred_1))
  
    print(mse(y_val_1,pred_1))

0.11487572795890695
0.11494458830014738
0.11507143731774153
0.11524237103854401
0.11544776060849138
0.11607149015517529
0.11680533347506952
0.12031391055760328
0.12844479230111244


###### Lasso with different levels of alpha and its mse

In [180]:
# We now consider different lambda values. The alphas are half the lambdas
alphas=[0.01/2, 0.02/2, 0.03/2, 0.04/2, 0.05/2, 0.075/2, 0.1/2]
mses=[]
for alpha in alphas:
    lasso=Lasso(alpha=alpha)
    lasso.fit(X_train_1,y_train_1)
    pred_1=lasso.predict(X_val_1)
    mses.append(mse(y_val_1,pred_1))
    print(mse(y_val_1, pred_1))

0.11453446505050062
0.11458589205103749
0.11580374376276623
0.11808905753166438
0.12125453902349155
0.13095791340827423
0.13959602445739805


##### lasso with alphas=0.02/2 is chosen in further prediction, and KNN is used

# Improvement after adding LotFrontage

#### Base model (using testing dataset to compare)

In [183]:
data = pd.read_csv('Houseprice_data_scaled.csv') 
raw = pd.read_csv('original data.csv')
# First 1800 data items are training set; the next 600 are the validation set
train = data.iloc[:1800] 
val = data.iloc[1800:2400]
test=data.iloc[2400:]
# Creating the "X" and "y" variables. We drop sale price from "X"
X_train, X_val, X_test = train.drop('Sale Price', axis=1), val.drop('Sale Price', axis=1), test.drop('Sale Price', axis=1)
y_train, y_val, y_test = train[['Sale Price']], val[['Sale Price']] ,test[['Sale Price']] 


alphas= 0.01
lasso=Lasso(alpha=alpha)
lasso.fit(X_train,y_train)
pred=lasso.predict(X_test)
mse(y_test,pred)

0.14720538902033128

#### Add LotFrontage using KNN (may need to change in future)

In [62]:
train_1 = data_KNN.iloc[:1800] 
val_1 = data_KNN.iloc[1800:2400]
test_1=data_KNN.iloc[2400:]
# Creating the "X" and "y" variables. We drop sale price from "X"
X_train_1, X_val_1, X_test_1 = train_1.drop('Sale Price', axis=1), val_1.drop('Sale Price', axis=1), test_1.drop('Sale Price', axis=1)
y_train_1, y_val_1, y_test_1 = train_1[['Sale Price']], val_1[['Sale Price']] ,test_1[['Sale Price']] 

alphas= 0.01
lasso=Lasso(alpha=alpha)
lasso.fit(X_train_1,y_train_1)
pred=lasso.predict(X_test_1)
print("the MSE for the model is",mse(y_test_1,pred),",the R^2 for train dataset is ", lasso.score(X_train_1,y_train_1), ", and the R^2 for testing dataset is ",lasso.score(X_test_1,y_test_1))


the MSE for the model is 0.14684530505332963 ,the R^2 for train dataset is  0.855695751959162 , and the R^2 for testing dataset is  0.8490172253573256


# Adding Lotshape 

In [71]:
from sklearn import preprocessing
lotshape=pd.get_dummies(raw.LotShape,drop_first=True)
standard_lotshape = pd.DataFrame(preprocessing.scale(lotshape),columns=lotshape.columns)

data_KNN[['LotShape IR2','LotShape IR3','LotShape Reg']]=standard_lotshape


In [114]:
train_2 = data_KNN.iloc[:1800] 
val_2 = data_KNN.iloc[1800:2400]
test_2=data_KNN.iloc[2400:]
# Creating the "X" and "y" variables. We drop sale price from "X"
X_train_2, X_val_2, X_test_2 = train_2.drop('Sale Price', axis=1), val_2.drop('Sale Price', axis=1), test_2.drop('Sale Price', axis=1)
y_train_2, y_val_2, y_test_2 = train_2[['Sale Price']], val_2[['Sale Price']] ,test_2[['Sale Price']] 


alphas= 0.01
lasso=Lasso(alpha=alpha)
lasso.fit(X_train_2,y_train_2)
pred_2=lasso.predict(X_test_2)
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train_2.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)


print( "the MSE for the model is",mse(y_test_2,pred_2),",the R^2 for train dataset is ", lasso.score(X_train_2,y_train_2), ", and the R^2 for testing dataset is ",lasso.score(X_test_2,y_test_2),coeffs)

the MSE for the model is 0.14684530505332963 ,the R^2 for train dataset is  0.855695751959162 , and the R^2 for testing dataset is  0.8490172253573256


#### Lot shape is penalized and does not improve the model

# ADD third and forth variable

In [81]:
raw.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [126]:
from sklearn import preprocessing
for column in raw[['YrSold', 'CentralAir','SaleCondition','Functional', 'BldgType','HouseStyle','RoofStyle','ExterQual']]:
    dt=data_KNN.copy()
    a=raw[[column]].copy()
    a_dummy=pd.get_dummies(a,drop_first=True)
    a_dummy_df = pd.DataFrame(preprocessing.scale(a_dummy),columns=a_dummy.columns)
    dt[list(a_dummy_df.columns)] = a_dummy_df
    train_2 = dt.iloc[:1800] 
    val_2 = dt.iloc[1800:2400]
    test_2=dt.iloc[2400:]
# Creating the "X" and "y" variables. We drop sale price from "X"
    X_train_2, X_val_2, X_test_2 = train_2.drop('Sale Price', axis=1), val_2.drop('Sale Price', axis=1), test_2.drop('Sale Price', axis=1)
    y_train_2, y_val_2, y_test_2 = train_2[['Sale Price']], val_2[['Sale Price']] ,test_2[['Sale Price']] 


    alphas= 0.01
    lasso=Lasso(alpha=alpha)
    lasso.fit(X_train_2,y_train_2)
    pred_2=lasso.predict(X_val_2)
    coeffs = pd.DataFrame(
        [
            ['intercept'] + list(X_train_2.columns),
            list(lasso.intercept_) + list(lasso.coef_)
        ]
    ).transpose().set_index(0)

   # print( column, "the MSE for the model is",mse(y_test_2,pred_2),",the R^2 for train dataset is ", lasso.score(X_train_2,y_train_2), ", and the R^2 for testing dataset is ",lasso.score(X_test_2,y_test_2),coeffs)
    print( column, "the MSE for the model is",mse(y_val_2,pred_2),",the R^2 for train dataset is ", lasso.score(X_train_2,y_train_2), ", and the R^2 for testing dataset is ",lasso.score(X_val_2,y_val_2))    

    
    

YrSold the MSE for the model is 0.13900000256105585 ,the R^2 for train dataset is  0.855695751959162 , and the R^2 for testing dataset is  0.8760251916795697
CentralAir the MSE for the model is 0.13900000256105585 ,the R^2 for train dataset is  0.855695751959162 , and the R^2 for testing dataset is  0.8760251916795697
SaleCondition the MSE for the model is 0.13650995463634133 ,the R^2 for train dataset is  0.8608976308663525 , and the R^2 for testing dataset is  0.8782460780715651
Functional the MSE for the model is 0.13900000256105585 ,the R^2 for train dataset is  0.855695751959162 , and the R^2 for testing dataset is  0.8760251916795697
BldgType the MSE for the model is 0.13900045001731443 ,the R^2 for train dataset is  0.8556958230596916 , and the R^2 for testing dataset is  0.8760247925910598
HouseStyle the MSE for the model is 0.13900000256105585 ,the R^2 for train dataset is  0.855695751959162 , and the R^2 for testing dataset is  0.8760251916795697
RoofStyle the MSE for the mod

#### choose SaleCondition as the third variable 

In [127]:
from sklearn import preprocessing
dt=data_KNN.copy()
a=raw[['SaleCondition']].copy()
a_dummy=pd.get_dummies(a,drop_first=True)
a_dummy_df = pd.DataFrame(preprocessing.scale(a_dummy),columns=a_dummy.columns)
dt[list(a_dummy_df.columns)] = a_dummy_df
data_3=dt.copy()

for column in raw[['YrSold', 'CentralAir','SaleCondition','Functional', 'HouseStyle','RoofStyle','ExterQual']]:
    dt=data_3.copy()
    a=raw[[column]].copy()
    a_dummy=pd.get_dummies(a,drop_first=True)
    a_dummy_df = pd.DataFrame(preprocessing.scale(a_dummy),columns=a_dummy.columns)
    dt[list(a_dummy_df.columns)] = a_dummy_df
    train_2 = dt.iloc[:1800] 
    val_2 = dt.iloc[1800:2400]
    test_2=dt.iloc[2400:]
# Creating the "X" and "y" variables. We drop sale price from "X"
    X_train_2, X_val_2, X_test_2 = train_2.drop('Sale Price', axis=1), val_2.drop('Sale Price', axis=1), test_2.drop('Sale Price', axis=1)
    y_train_2, y_val_2, y_test_2 = train_2[['Sale Price']], val_2[['Sale Price']] ,test_2[['Sale Price']] 


    alphas= 0.01
    lasso=Lasso(alpha=alpha)
    lasso.fit(X_train_2,y_train_2)
    pred_2=lasso.predict(X_val_2)
    coeffs = pd.DataFrame(
        [
            ['intercept'] + list(X_train_2.columns),
            list(lasso.intercept_) + list(lasso.coef_)
        ]
    ).transpose().set_index(0)

   # print( column, "the MSE for the model is",mse(y_test_2,pred_2),",the R^2 for train dataset is ", lasso.score(X_train_2,y_train_2), ", and the R^2 for testing dataset is ",lasso.score(X_test_2,y_test_2),coeffs)
    print( column, "the MSE for the model is",mse(y_val_2,pred_2),",the R^2 for train dataset is ", lasso.score(X_train_2,y_train_2), ", and the R^2 for testing dataset is ",lasso.score(X_val_2,y_val_2))    

   


YrSold the MSE for the model is 0.13650995463634133 ,the R^2 for train dataset is  0.8608976308663525 , and the R^2 for testing dataset is  0.8782460780715651
CentralAir the MSE for the model is 0.13650995463634133 ,the R^2 for train dataset is  0.8608976308663525 , and the R^2 for testing dataset is  0.8782460780715651
SaleCondition the MSE for the model is 0.13650995463634133 ,the R^2 for train dataset is  0.8608976308663525 , and the R^2 for testing dataset is  0.8782460780715651
Functional the MSE for the model is 0.13650995463634133 ,the R^2 for train dataset is  0.8608976308663525 , and the R^2 for testing dataset is  0.8782460780715651
HouseStyle the MSE for the model is 0.13650995463634133 ,the R^2 for train dataset is  0.8608976308663525 , and the R^2 for testing dataset is  0.8782460780715651
RoofStyle the MSE for the model is 0.13650995463634133 ,the R^2 for train dataset is  0.8608976308663525 , and the R^2 for testing dataset is  0.8782460780715651
ExterQual the MSE for th

#### choose ExterQual as the 4th variable since it improved the MSE

In [132]:
from sklearn import preprocessing
dt=data_3.copy()
a=raw[['ExterQual']].copy()
a_dummy=pd.get_dummies(a,drop_first=True)
a_dummy_df = pd.DataFrame(preprocessing.scale(a_dummy),columns=a_dummy.columns)
dt[list(a_dummy_df.columns)] = a_dummy_df
data_4=dt.copy()

train_4 = data_4.iloc[:1800] 
val_4 = data_4.iloc[1800:2400]
test_4=data_4.iloc[2400:]
# Creating the "X" and "y" variables. We drop sale price from "X"
X_train_4, X_val_4, X_test_4 = train_4.drop('Sale Price', axis=1), val_4.drop('Sale Price', axis=1), test_4.drop('Sale Price', axis=1)
y_train_4, y_val_4, y_test_4 = train_4[['Sale Price']], val_4[['Sale Price']] ,test_4[['Sale Price']] 


alphas= 0.01
lasso=Lasso(alpha=alpha)
lasso.fit(X_train_4,y_train_4)
pred_4=lasso.predict(X_test_4)
coeffs = pd.DataFrame(
    [
        ['intercept'] + list(X_train_4.columns),
        list(lasso.intercept_) + list(lasso.coef_)
    ]
).transpose().set_index(0)


print( column, "the MSE for the model is",mse(y_test_4,pred_4),",the R^2 for train dataset is ", lasso.score(X_train_4,y_train_4), ", and the R^2 for testing dataset is ",lasso.score(X_test_4,y_test_4),coeffs)    

   

ExterQual the MSE for the model is 0.13779686979443198 ,the R^2 for train dataset is  0.8616447396225886 , and the R^2 for testing dataset is  0.8583206066337431                                 1
0                                
intercept              0.00059662
LotArea                  0.041912
OverallQual              0.287479
OverallCond                     0
YearBuilt               0.0383137
YearRemodAdd            0.0521956
BsmtFinSF1               0.125897
BsmtUnfSF                      -0
TotalBsmtSF             0.0980082
1stFlrSF                0.0269694
2ndFlrSF                        0
GrLivArea                0.294788
FullBath                        0
HalfBath                        0
BedroomAbvGr                   -0
TotRmsAbvGrd                    0
Fireplaces              0.0224275
GarageCars              0.0314963
GarageArea               0.054598
WoodDeckSF             0.00620259
OpenPorchSF                     0
EnclosedPorch                  -0
Blmngtn               

#### Adding additional features ExterQual and SaleCondition improved the MSE and R^2