# A comprehensive beginners guide for Linear, Ridge and Lasso Regression

URL: https://www.analyticsvidhya.com/blog/2017/06/a-comprehensive-guide-for-linear-ridge-and-lasso-regression/?utm_source=feedburner&utm_medium=email&utm_campaign=Feed%3A+AnalyticsVidhya+%28Analytics+Vidhya%29

 SHUBHAM JAIN , JUNE 22, 2017

## Model 3 - Entering Linear Regression

In [58]:
import pandas as pd
import numpy as np

In [99]:
# Load the training and test CSV files into the `train` and `test` DataFrames
# (training file first, then the test file).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/Train_UWu5bXk.csv", "./data/Test_u94Q5KV.csv")
)

In [101]:
# train

# test

In [21]:
# Import scikit-learn's linear regression estimator.
from sklearn.linear_model import LinearRegression

# Create the estimator with default arguments; it is fitted in the cells below.
lreg = LinearRegression()

In [59]:
from sklearn.model_selection import train_test_split

# Predictors for the first model: establishment year and item MRP.
X = train.loc[:, ['Outlet_Establishment_Year', 'Item_MRP']]

# Split the rows into a training part and a held-out cross-validation part.
# `random_state` is pinned so that the split -- and therefore every metric
# computed from it in the following cells -- is identical on each
# Restart & Run All; without it the rows are shuffled differently every run.
x_train, x_cv, y_train, y_cv = train_test_split(
    X, train.Item_Outlet_Sales, random_state=42
)

In [51]:
# Fit the linear model on the training split.  The call is the cell's last
# expression, so the notebook displays the fitted estimator below.
lreg.fit(x_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [55]:
# Predict the target for the held-out cross-validation rows.
pred = lreg.predict(x_cv)

# Uncomment the next line to display the predicted values.
# pred

array([ 2219.09581475,  1323.78582469,  1487.23520509, ...,  3001.99582591,
        2395.94504095,  2989.31712818])

In [60]:
# Mean squared error between the predictions and the held-out targets.
mse = ((pred - y_cv) ** 2).mean()

# Uncomment the next line to display the value.
# mse

4170970.4994674935

In [66]:
# Tabulate each predictor name (column 0) next to its fitted coefficient.
coeff = pd.DataFrame(x_train.columns).assign(
    **{'Coefficient Estimate': pd.Series(lreg.coef_)}
)

# Uncomment the next line to display the table.
# coeff

## Evaluating Model: R-Square

$R_{square} = 1 - \frac{\sum (Y_{actual} - Y_{predict})^2}{\sum (Y_{actual} - Y_{mean})^2}$

In [67]:
# R-square of the fitted model on the cross-validation split
lreg.score(x_cv, y_cv)


0.31821387987421401

## Model 4 - Linear Regression with more variables

In [70]:
# Demonstration of a failure: `Item_Weight` still contains missing values at
# this point, and LinearRegression refuses to fit on NaN inputs.
X = train.loc[:, ['Outlet_Establishment_Year', 'Item_MRP', 'Item_Weight']]

# splitting into training and cv for cross validation
# (seeded so the split is the same on every run)
x_train, x_cv, y_train, y_cv = train_test_split(
    X, train.Item_Outlet_Sales, random_state=42
)

# train the model -- the expected error is caught and printed so that
# Restart & Run All can continue past this cell instead of stopping here
try:
    lreg.fit(x_train, y_train)
except ValueError as err:
    print("LinearRegression could not be fitted: {}".format(err))

In [76]:
# Impute missing Item_Weight values with the mean of the non-null entries.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selected column, which is a chained assignment and is not
# guaranteed to write through to `train`.
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].mean())

X = train.loc[:, ['Outlet_Establishment_Year', 'Item_MRP', 'Item_Weight']]

# splitting into training and cv for cross validation
# (seeded so the split and the metrics below are reproducible)
x_train, x_cv, y_train, y_cv = train_test_split(
    X, train.Item_Outlet_Sales, random_state=42
)

# train the model
lreg.fit(x_train, y_train)

# predicting on cv -- the result must be stored: previously it was discarded,
# so the MSE below was computed from the stale `pred` of the earlier
# two-feature model against a different cross-validation split
pred = lreg.predict(x_cv)

# calculate MSE
mse = np.mean((pred - y_cv)**2)

# calculating coefficients
coeff = pd.DataFrame(x_train.columns)
coeff['Coefficient Estimate'] = pd.Series(lreg.coef_)

# calculate r-square
lreg.score(x_cv, y_cv)

0.29911102739932671

## Adjusted R-square

$R^2_{adjusted} = 1 - \frac{(1 - R^2)(N - 1)}{N - p - 1}$

where 
    $R^2$ = sample $R^2$,
    $p$ = number of predictors,
    $N$ = total sample size

# Using all the features for prediction
## Data pre-processing steps for regression model

In [138]:
path = './data/'
trainfile = 'Train_UWu5bXk.csv'
testfile =  'Test_u94Q5KV.csv'

# import train and test datasets
train = pd.read_csv(path + trainfile)
test = pd.read_csv(path + testfile)

# imputing missing values
# A visibility of exactly 0.0 is replaced by the column mean.
train['Item_Visibility'] = train['Item_Visibility'].replace(0.0, np.mean(train['Item_Visibility']))
# Convert the establishment year into a number of years counted back from 2013
# (presumably the reference year of the dataset -- TODO confirm).
train['Outlet_Establishment_Year'] = 2013 - train['Outlet_Establishment_Year']
# Filled columns are assigned back rather than using fillna(inplace=True) on a
# selected column, which is not guaranteed to modify `train`.
train['Outlet_Size'] = train['Outlet_Size'].fillna('Small')
# `train` was just reloaded from disk, so Item_Weight contains NaNs again and
# must be imputed here as well, otherwise a regression on X cannot be fitted.
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].mean())

# creating dummy variables to convert categorical into numerical values
mylist = list(train.select_dtypes(include=['object']).columns)
dummies = pd.get_dummies(train[mylist], prefix=mylist)

# Replace the raw categorical columns by their dummy encoding.  Previously X
# kept the original string columns next to the dummies, which a linear model
# cannot consume.  The columns are dropped on a copy, so `train` keeps them.
X = pd.concat([train.drop(columns=mylist), dummies], axis=1)


## Building the model

In [142]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline

path = './data/'
trainfile = 'Train_UWu5bXk.csv'
testfile =  'Test_u94Q5KV.csv'

# import train and test datasets
train = pd.read_csv(path + trainfile)
test = pd.read_csv(path + testfile)

# import linear regression from sklearn
from sklearn.linear_model import LinearRegression

X = train.drop('Item_Outlet_Sales', 1)
x_train, x_cv, y_train, y_cv = train_test_split(X, train.Item_Outlet_Sales)


# training a linear regression model on train
# lreg.fit(x_train, y_train)

# # predicting on cv
# pred_cv = lreg.predict(x_cv)

# # calculating MSE
# mse = np.mean((pred_cv - y_cv)**2)

# print("MSE= {}".format(mse))

# # evaluating with r-square
# print("R-square= {}",format(lreg.score(x_cv, y_cv)))


## Interpretation of Regression Plots

In [140]:
# Residual plot: (prediction - actual) against the predicted value for the
# cross-validation rows.
fig, ax = plt.subplots()
x_plot = ax.scatter(pred_cv, (pred_cv - y_cv), c='b')

# Horizontal reference line at zero error.
ax.hlines(y=0, xmin=-1000, xmax=5000)
ax.set_title('Residual Plot')


NameError: name 'pred_cv' is not defined

## Regularization

In [None]:
# Names of the predictors; assumes `lreg` was last fitted on this `x_train`,
# so that its coefficients line up with these columns.
# (The variable was misspelled `predictaors`, which made the next statement
# fail with a NameError.)
predictors = x_train.columns

# checking the magnitude of coefficients, sorted from smallest to largest
coef = pd.Series(lreg.coef_, predictors).sort_values()

coef.plot(kind='bar', title='Model Coefficients')

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# training the model
# The Ridge constructor takes hyperparameters only; the data is passed to
# fit().  Previously x_train/y_train were handed to the constructor and the
# model was never fitted.  alpha is the regularization strength (1.0 is the
# scikit-learn default) and is the value to tune.
ridgeReg = Ridge(alpha=1.0)
ridgeReg.fit(x_train, y_train)

# predicting on cv
pred = ridgeReg.predict(x_cv)

# calculating MSE from this model's own predictions (the original used
# `pred_cv`, i.e. whatever an earlier cell had left behind)
mse = np.mean((pred - y_cv)**2)
print("MSE= {}".format(mse))

# calculating r-square (score)
ridgeReg.score(x_cv, y_cv)

# Elastic Net Regression

In [1]:
from sklearn.linear_model import ElasticNet

# alpha scales the overall penalty, l1_ratio mixes the L1 and L2 parts.
# `normalize=False` is no longer passed: False was the default, and the
# keyword has been removed from recent scikit-learn releases, where passing
# it raises a TypeError.
ENReg = ElasticNet(alpha=1, l1_ratio=0.5)

ENReg.fit(x_train, y_train)

# predicting on cv (the method name was misspelled `prodict`)
pred_cv = ENReg.predict(x_cv)

# calculating MSE
mse = np.mean((pred_cv - y_cv)**2)

# calculating score
ENReg.score(x_cv, y_cv)
