In [86]:
import warnings
warnings.filterwarnings('ignore')

#Import Libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Read the file

In [87]:
house = pd.read_csv("train.csv")
house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


EDA

In [88]:
house.shape
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

We see some missing values in the dataset.

In [89]:
(round(100*(house.isnull().sum()/len(house.index)),2).sort_values(ascending=False)).head(20)

PoolQC          99.52
MiscFeature     96.30
Alley           93.77
Fence           80.75
MasVnrType      59.73
FireplaceQu     47.26
LotFrontage     17.74
GarageYrBlt      5.55
GarageCond       5.55
GarageType       5.55
GarageFinish     5.55
GarageQual       5.55
BsmtFinType2     2.60
BsmtExposure     2.60
BsmtQual         2.53
BsmtCond         2.53
BsmtFinType1     2.53
MasVnrArea       0.55
Electrical       0.07
Id               0.00
dtype: float64

PoolQC, MiscFeature, Alley, Fence, MasVnrType, FireplaceQu have a lot of null data. We can drop them safely

In [90]:
house_data = house.drop(['FireplaceQu','MasVnrType','Fence','Alley','MiscFeature','PoolQC', 'Id'], axis=1)

In [91]:
house_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


Price values are very high compared to other fields. Let's log transform it.

In [92]:
house_data['SalePrice'] = np.log(house_data['SalePrice'])

In [93]:
(round(100*(house_data.isnull().sum()/len(house_data.index)),2).sort_values(ascending=False)).head(20)

LotFrontage     17.74
GarageYrBlt      5.55
GarageCond       5.55
GarageType       5.55
GarageFinish     5.55
GarageQual       5.55
BsmtFinType2     2.60
BsmtExposure     2.60
BsmtQual         2.53
BsmtFinType1     2.53
BsmtCond         2.53
MasVnrArea       0.55
Electrical       0.07
TotRmsAbvGrd     0.00
KitchenQual      0.00
KitchenAbvGr     0.00
BedroomAbvGr     0.00
HalfBath         0.00
FullBath         0.00
BsmtHalfBath     0.00
dtype: float64

From the data dictionary, we observe two variables are used to represent age of the building. So, we can derive that info into one.

In [94]:
house_data['Age'] = house_data['YrSold'] - house_data['YearBuilt']
house_data.drop(['YrSold', 'YearBuilt'], axis=1, inplace=True)
house_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,Age
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,WD,Normal,12.247694,5
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,WD,Normal,12.109011,31
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,WD,Normal,12.317167,7
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,WD,Abnorml,11.849398,91
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,WD,Normal,12.429216,8


Lets look into categorical variables

In [95]:
categorical = house_data.select_dtypes(include=['object'])
categorical.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [96]:
cat_col_names = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition']
for i in cat_col_names:
    house_data[i].fillna("none", inplace=True)

Now, lets add dummy variables for categorical, add it to the data set and remove original columns.

In [97]:
dummy_cat = pd.get_dummies(categorical,drop_first=True)
house_data = pd.concat([house_data,dummy_cat],axis=1)
house_data.drop(cat_col_names,axis=1,inplace=True)
house_data.dropna(inplace=True)

# Split the data into test, train set.

In [98]:
df_train,df_test = train_test_split(house_data, train_size = 0.7, random_state = 100)
y_train=df_train.pop('SalePrice')
X_train=df_train
y_test=df_test.pop('SalePrice')
X_test=df_test

Now, using MinMaxScaler to scale the data.

In [99]:
scaler=MinMaxScaler()
numeric = X_train.dtypes[X_train.dtypes != "object"].index
X_train[numeric]=scaler.fit_transform(X_train[numeric])
X_test[numeric]=scaler.transform(X_test[numeric])


# Let's perform ridge regression now.

In [100]:
from sklearn.model_selection import GridSearchCV

In [101]:
params = {'alpha': [0.0001, 0.001, 0.01, 0.05,
                    0.1, 0.2, 0.3, 0.4, 0.5,
                    1.0, 2.0, 3.0, 4.0, 5.0,
                    6.0, 7.0, 8.0, 9.0, 10.0,
                    20, 50, 100, 500, 1000 ]}

ridge = Ridge()

# cross validation
folds = 5
model = GridSearchCV(estimator = ridge,
                        param_grid = params,
                        scoring= 'neg_mean_absolute_error',
                        cv = folds,
                        return_train_score=True,
                        verbose = 1)
model.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [102]:
ridge_cv_results = pd.DataFrame(model.cv_results_)
ridge_cv_results = ridge_cv_results[ridge_cv_results['param_alpha']<=500]
ridge_cv_results[['param_alpha', 'mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by = ['rank_test_score'])


Unnamed: 0,param_alpha,mean_train_score,mean_test_score,rank_test_score
11,3.0,-0.071772,-0.093858,1
12,4.0,-0.073459,-0.093972,2
10,2.0,-0.069723,-0.094049,3
13,5.0,-0.07492,-0.094307,4
1,0.001,-0.054807,-0.094511,5
2,0.01,-0.055482,-0.094524,6
0,0.0001,-0.05476,-0.094553,7
14,6.0,-0.076268,-0.094755,8
9,1.0,-0.066824,-0.095024,9
15,7.0,-0.077527,-0.095288,10


Best alpha seems to be 3. Fit using that alpha.

In [103]:
alpha = 6
ridge = Ridge(alpha=alpha)

ridge.fit(X_train, y_train)
ridge.score(X_train,y_train)

y_train_pred = ridge.predict(X_train)
y_pred = ridge.predict(X_test)

In [104]:
from sklearn import metrics


print("R-Square of train data =", '%.2f' % r2_score(y_train, y_train_pred))
print("R-Square of test data =", '%.2f' % r2_score(y_test, y_pred))
print("MSE of train data =", '%.2f' % mean_squared_error(y_train, y_train_pred))
print("MSE of test data =", '%.2f' % mean_squared_error(y_test, y_pred))

R-Square of train data = 0.92
R-Square of test data = 0.88
MSE of train data = 0.01
MSE of test data = 0.02


# Lasso Regression

In [105]:
alphas = [0.0001, 0.001, 0.01, 0.05,
                    0.1, 0.2, 0.3, 0.4, 0.5,
                    1.0, 2.0, 3.0, 4.0, 5.0,
                    6.0, 7.0, 8.0, 9.0, 10.0,
                    20, 50, 100, 500, 1000 ]
lasso = LassoCV(alphas=alphas, cv=5)  # 5-fold cross-validation
lasso.fit(X_train, y_train)

In [106]:
lasso.alpha_

0.001

Best alpha for Lasso is 0.001

In [107]:
y_train_pred = lasso.predict(X_train)
y_pred = lasso.predict(X_test)

print("R-Square of train data =", '%.2f' % r2_score(y_train, y_train_pred))
print("R-Square of test data =", '%.2f' % r2_score(y_test, y_pred))
print("MSE of train data =", '%.2f' % mean_squared_error(y_train, y_train_pred))
print("MSE of test data =", '%.2f' % mean_squared_error(y_test, y_pred))

R-Square of train data = 0.90
R-Square of test data = 0.88
MSE of train data = 0.01
MSE of test data = 0.02


It is clear from the metrics that Lasso and Ridge performs very similar on the test data.

In [108]:
features = pd.DataFrame(index=X_train.columns)
features.rows = X_train.columns
features['Ridge'] = ridge.coef_
features['Lasso'] = lasso.coef_

features

Unnamed: 0,Ridge,Lasso
MSSubClass,-0.062300,-0.071587
LotFrontage,-0.013390,-0.000000
LotArea,0.059930,0.000000
OverallQual,0.267711,0.563113
OverallCond,0.148093,0.166312
...,...,...
SaleCondition_AdjLand,0.029002,0.000000
SaleCondition_Alloca,-0.016948,-0.000000
SaleCondition_Family,0.015559,-0.000000
SaleCondition_Normal,0.057989,0.046448


In [109]:
ridge_coefs = features['Ridge'].sort_values(ascending=False)
print("=====Ridge====")
print(ridge_coefs.head(10))
lasso_coefs = features['Lasso'].sort_values(ascending=False)
print("=====Lasso====")
print(lasso_coefs.head(10))

=====Ridge====
OverallQual             0.267711
TotRmsAbvGrd            0.165807
GarageCars              0.164700
GrLivArea               0.156031
OverallCond             0.148093
1stFlrSF                0.143651
FullBath                0.135031
Neighborhood_Crawfor    0.131167
Fireplaces              0.115313
2ndFlrSF                0.113643
Name: Ridge, dtype: float64
=====Lasso====
GrLivArea               0.569924
OverallQual             0.563113
GarageCars              0.217409
TotRmsAbvGrd            0.170288
OverallCond             0.166312
FullBath                0.157586
Neighborhood_Crawfor    0.134434
Neighborhood_NridgHt    0.110435
Neighborhood_StoneBr    0.098168
Fireplaces              0.095445
Name: Lasso, dtype: float64


In [110]:
# prompt: Now regenerate lasso and ridge using alphas 0.002 and 6 respectively

# Lasso Regression
lasso = Lasso(alpha=0.002)  # 5-fold cross-validation
lasso.fit(X_train, y_train)

# Ridge Regression
ridge = Ridge(alpha=6)
ridge.fit(X_train, y_train)
ridge.score(X_train,y_train)

features['RidgeNew'] = ridge.coef_
features['LassoNew'] = lasso.coef_
ridge_coefs = features['RidgeNew'].sort_values(ascending=False)
print("=====RidgeNew====")
print(ridge_coefs.head(10))
lasso_coefs = features['LassoNew'].sort_values(ascending=False)
print("=====LassoNew====")
print(lasso_coefs.head(10))


=====RidgeNew====
OverallQual             0.267711
TotRmsAbvGrd            0.165807
GarageCars              0.164700
GrLivArea               0.156031
OverallCond             0.148093
1stFlrSF                0.143651
FullBath                0.135031
Neighborhood_Crawfor    0.131167
Fireplaces              0.115313
2ndFlrSF                0.113643
Name: RidgeNew, dtype: float64
=====LassoNew====
OverallQual             0.644553
GrLivArea               0.440108
GarageCars              0.219438
TotRmsAbvGrd            0.195550
Fireplaces              0.119018
OverallCond             0.118670
FullBath                0.116617
Neighborhood_Crawfor    0.102532
Neighborhood_NridgHt    0.088842
YearRemodAdd            0.079009
Name: LassoNew, dtype: float64


# Now dropping top 5 columns of Lasso.

In [111]:
drop_cols = ['OverallQual','GrLivArea','GarageCars','TotRmsAbvGrd','Fireplaces']
X_train.drop(labels = drop_cols, axis = 1, inplace=True)
X_test.drop(labels = drop_cols, axis = 1, inplace=True)
lasso_dropped = Lasso(alpha=0.0001)
lasso_dropped.fit(X_train,y_train)
y_train_pred_dropped = lasso_dropped.predict(X_train)
y_test_pred_dropped = lasso_dropped.predict(X_test)
lasso_coef_dropped_df = pd.DataFrame(lasso_dropped.coef_ , columns = ['Coefficient'], index =  X_train.columns)
print("Top features post dropping 5 features are:\n")
print(lasso_coef_dropped_df.sort_values(by = 'Coefficient', ascending = False).head(5))

Top features post dropping 5 features are:

                  Coefficient
RoofMatl_Membran     2.083879
RoofMatl_WdShngl     1.867802
RoofMatl_CompShg     1.819287
RoofMatl_Tar&Grv     1.663115
RoofMatl_WdShake     1.437711
