# Linear Regression (Project)


The dataset we will be working with later on in the next section:

* Ames Iowa Data Set: http://jse.amstat.org/v19n3/decock.pdf

---------

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [2]:
df = pd.read_csv('Ames_Housing_Data_dummy.csv')

In [3]:
df.head()

Unnamed: 0,PID,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,...,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,526301100,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,...,0,0,0,0,1,0,0,0,1,0
1,526350040,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,0,1,0,0,0,1,0
2,526351010,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,0,1,0,0,0,1,0
3,526353030,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,...,0,0,0,0,1,0,0,0,1,0
4,527105010,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2923 entries, 0 to 2922
Columns: 274 entries, PID to Sale Condition_Partial
dtypes: float64(11), int64(263)
memory usage: 6.1 MB


In [5]:
X = df.drop(columns=['PID','SalePrice'],axis=1)

In [6]:
X

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,...,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,141.000000,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,0,0,0,0,1,0,0,0,1,0
1,80.000000,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,1,0,0,0,1,0
2,81.000000,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,1,0,0,0,1,0
3,93.000000,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,1,0,0,0,1,0
4,74.000000,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2918,37.000000,7937,6,6,1984,1984,0.0,819.0,0.0,184.0,...,0,0,0,0,1,0,0,0,1,0
2919,75.144444,8885,5,5,1983,1983,0.0,301.0,324.0,239.0,...,0,0,0,0,1,0,0,0,1,0
2920,62.000000,10441,5,5,1992,1992,0.0,337.0,0.0,575.0,...,0,0,0,0,1,0,0,0,1,0
2921,77.000000,10010,5,5,1974,1975,0.0,1071.0,123.0,195.0,...,0,0,0,0,1,0,0,0,1,0


In [7]:
y = df['SalePrice']

#### Use scikit-learn to split up X and y into a training set and test set

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.10,random_state=101)

In [9]:
len(X_train),len(X_test)

(2630, 293)

#### The dataset features has a variety of scales and units. 
#### For optimal regression performance, scale the X features.

In [10]:
scaler = StandardScaler()

In [11]:
scaler.fit(X_train)

StandardScaler()

In [12]:
X_train = scaler.transform(X_train)

In [13]:
X_test = scaler.transform(X_test)

#### We will use an Elastic Net model. Create an instance of default ElasticNet model with scikit-learn**

In [14]:
model = ElasticNet(max_iter=10000000)

#### The Elastic Net model has two main parameters, alpha and the L1 ratio.
#### Create a dictionary parameter grid of values for the ElasticNet. Feel free to play around with these values, keep in mind, you may not match up exactly with the solution choices**

In [15]:
param_grid = {'alpha':[100,150,200,300],
       "l1_ratio":[0.7,0.9,1]}

#### Using scikit-learn create a GridSearchCV object and run a grid search for the best parameters for your model based on your scaled training data.

In [16]:
grid_model = GridSearchCV(model,param_grid=param_grid,
                          cv=10,scoring='neg_mean_squared_error')

In [17]:
grid_model.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=ElasticNet(max_iter=10000000),
             param_grid={'alpha': [100, 150, 200, 300],
                         'l1_ratio': [0.7, 0.9, 1]},
             scoring='neg_mean_squared_error')

#### Display the best combination of parameters for your model

In [18]:
grid_model.best_estimator_

ElasticNet(alpha=100, l1_ratio=1, max_iter=10000000)

#### Evaluate your model's performance on the unseen 10% scaled test set. 
#### In the solutions notebook we achieved an MAE of $\$$14149 and a RMSE of $\$$20532

In [19]:
y_predict = grid_model.predict(X_test)

In [20]:
np.sqrt(mean_squared_error(y_predict,y_test))

24183.974799029427

In [21]:
mean_absolute_error(y_predict,y_test)

15362.303579886904

#### Mean Price

In [22]:
df['SalePrice'].mean()

180814.82962709546

#### Percentage Error

In [23]:
24183/180814*100

13.374517459931202

In [24]:
15362/180814*100

8.496023538000376

### Aproximately 14% off from data 

In [25]:
from sklearn.linear_model import LassoCV

In [26]:
model = LassoCV(eps=0.001,n_alphas=100,max_iter=1000,cv=20)

In [27]:
model.fit(X_train,y_train)

LassoCV(cv=20)

In [28]:
model.alpha_

105.29249079550004

In [29]:
test_prediction = model.predict(X_test)

In [30]:
np.sqrt(mean_squared_error(y_test,test_prediction))

24172.711264337126

### Same as above 14%

--------------

## Cross validate error

In [31]:
cross_val_model = ElasticNet(alpha=105.29,l1_ratio=1)

In [32]:
scores = cross_validate(cross_val_model ,X_train,y_train,
                        scoring=['neg_mean_absolute_error',
                                 'neg_mean_squared_error','max_error'],cv=200)

In [33]:
abs(pd.DataFrame(scores).mean())

fit_time                        3.117235e-01
score_time                      1.104645e-03
test_neg_mean_absolute_error    1.471211e+04
test_neg_mean_squared_error     4.795307e+08
test_max_error                  5.126520e+04
dtype: float64

In [34]:
np.sqrt( 4.795307e+08)

21898.189422872387

In [35]:
(21898/180814.82)*100

12.110732958725396

### Hence we can see that Value is much more below the 14% rang hence it is aprox 10 to 14% 

## Great work!

----