# Linear Regression demo project

Dataset: Ames Housing dataset 

Goal: Using Linear Regression model to estimate house price based on collected data

Method: Elastic net regularization


## Imports packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Reading data

In [6]:
df = pd.read_csv("Data/AMES_Final.csv")

We test object

In [7]:
df.head()

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,...,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,0,0,0,0,1,0,0,0,1,0
1,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,1,0,0,0,1,0
2,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,1,0,0,0,1,0
3,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,1,0,0,0,1,0
4,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,1,0,0,0,1,0


Our data has 274 columns.

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2925 entries, 0 to 2924
Columns: 274 entries, Lot Frontage to Sale Condition_Partial
dtypes: float64(11), int64(263)
memory usage: 6.1 MB


## Create X feature and y label

In [9]:
X = df.drop(["SalePrice"], axis = 1)


In [10]:
X

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,...,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,141.000000,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,0,0,0,0,1,0,0,0,1,0
1,80.000000,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,1,0,0,0,1,0
2,81.000000,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,1,0,0,0,1,0
3,93.000000,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,1,0,0,0,1,0
4,74.000000,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2920,37.000000,7937,6,6,1984,1984,0.0,819.0,0.0,184.0,...,0,0,0,0,1,0,0,0,1,0
2921,75.144444,8885,5,5,1983,1983,0.0,301.0,324.0,239.0,...,0,0,0,0,1,0,0,0,1,0
2922,62.000000,10441,5,5,1992,1992,0.0,337.0,0.0,575.0,...,0,0,0,0,1,0,0,0,1,0
2923,77.000000,10010,5,5,1974,1975,0.0,1071.0,123.0,195.0,...,0,0,0,0,1,0,0,0,1,0


y label is sale price that we want to extimate

In [11]:
y = df['SalePrice']

In [12]:
y

0       215000
1       105000
2       172000
3       244000
4       189900
         ...  
2920    142500
2921    131000
2922    132000
2923    170000
2924    188000
Name: SalePrice, Length: 2925, dtype: int64

## Train/Test sets split

We split split X and y into a training set and test set. We choose 10% of data for test set.

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state= 97)

## Scale the X features

In [14]:
# scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# train data
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Choose a model

We use Elastic net regularization. 

In [15]:
# import and create a model 
from sklearn.linear_model import ElasticNet
elastic_model = ElasticNet()

Create a dictionary parameter grid. The Elastic Net model has two main parameters, alpha and the L1 ratio. 

In [16]:
# alpha: Constant that multiplies the penalty terms
alpha_list = [0.1, 1, 5, 10, 50, 100]
# The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. 
# For l1_ratio = 0 the penalty is an L2 penalty. 
# For l1_ratio = 1 it is an L1 penalty. 
# For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
l1_ratio_list = [0.1, 0.25, 0.5, 0.75, 0.87, 0.95, 0.99, 1]

parameter_grid = {'alpha': alpha_list, 'l1_ratio': l1_ratio_list}

Import GridSearch

In [17]:
from sklearn.model_selection import GridSearchCV
#help(GridSearchCV)

Create a GridSearchCV object and run a grid search for the best parameters

In [18]:
grid_model = GridSearchCV(estimator=elastic_model,
                          param_grid=parameter_grid,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          verbose=1)

In [19]:
grid_model.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.1, 1, 5, 10, 50, 100],
                         'l1_ratio': [0.1, 0.25, 0.5, 0.75, 0.87, 0.95, 0.99,
                                      1]},
             scoring='neg_mean_squared_error', verbose=1)

Display the best combination of parameters

In [20]:
grid_model.best_estimator_

ElasticNet(alpha=100, l1_ratio=1)

## Evaluate model's performance

In [21]:
y_predict = grid_model.predict(X_test)

In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [23]:
mean_absolute_error(y_test,y_predict)

14709.790917305434

In [24]:
np.sqrt(mean_squared_error(y_test,y_predict))

22056.56997953435