# Lab - Regularization

## Week 4 Monday 11th January

In [1]:
## TASK: Regularized regression
## FUNCTIONS: Ridge, RidgeCV, Lasso, LassoCV
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Crime (n=319 non-null, p=122, type=regression)
## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

## This data set contains data on violent crimes within a community.

########## Prepare data ##########
# read in data, remove categorical features, remove rows with missing values
import pandas as pd
crime = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?'])
crime = crime.iloc[:, 5:]
crime.dropna(inplace=True)
crime.head()

# define X and y
X = crime.iloc[:, :-1]
y = crime.iloc[:, -1]

# split into train/test
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [2]:
# How many columns are in X?
crime.describe()

Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,118,119,120,121,122,123,124,125,126,127
count,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,...,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0
mean,0.229342,0.424483,0.337085,0.590094,0.209467,0.219185,0.420533,0.532821,0.366113,0.414734,...,0.161254,0.35326,0.30116,0.163103,0.076708,0.698589,0.440439,0.5879,0.195078,0.441191
std,0.243098,0.146542,0.311374,0.258285,0.245242,0.261124,0.122616,0.117848,0.131137,0.152828,...,0.208825,0.253366,0.295543,0.214778,0.140207,0.213944,0.405808,0.265967,0.164718,0.276351
min,0.0,0.04,0.0,0.0,0.01,0.01,0.06,0.04,0.03,0.06,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02
25%,0.08,0.34,0.07,0.42,0.06,0.04,0.35,0.47,0.29,0.31,...,0.05,0.18,0.09,0.04,0.02,0.62,0.0,0.4,0.11,0.21
50%,0.14,0.41,0.22,0.63,0.11,0.1,0.41,0.52,0.33,0.41,...,0.09,0.26,0.18,0.08,0.03,0.75,0.5,0.56,0.15,0.39
75%,0.255,0.48,0.56,0.81,0.24,0.295,0.47,0.575,0.4,0.49,...,0.175,0.47,0.385,0.195,0.06,0.84,1.0,0.79,0.22,0.65
max,1.0,1.0,1.0,0.98,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
########## Linear Regression Model Without Regularization ##########
# linear regression
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
lm.coef_
# What are these numbers? coefficients for each feature

array([ -3.66188167e+00,   6.98124465e-01,  -2.61955467e-01,
        -2.85270027e-01,  -1.64740837e-01,   2.46972333e-01,
        -1.09290051e+00,  -5.96857796e-01,   1.11200239e+00,
        -7.21968931e-01,   4.27346598e+00,  -2.28040268e-01,
         8.04875769e-01,  -2.57934732e-01,  -2.63458023e-01,
        -1.04616958e+00,   6.07784197e-01,   7.73552561e-01,
         5.96468029e-02,   6.90215922e-01,   2.16759430e-02,
        -4.87802949e-01,  -5.18858404e-01,   1.39478815e-01,
        -1.24417942e-01,   3.15003821e-01,  -1.52633736e-01,
        -9.65003927e-01,   1.17142163e+00,  -3.08546690e-02,
        -9.29085548e-01,   1.24654586e-01,   1.98104506e-01,
         7.30804821e-01,  -1.77337294e-01,   8.32927588e-02,
         3.46045601e-01,   5.01837338e-01,   1.57062958e+00,
        -4.13478807e-01,   1.39350802e+00,  -3.49428114e+00,
         7.09577818e-01,  -8.32141352e-01,  -1.39984927e+00,
         1.02482840e+00,   2.13855006e-01,  -6.18937325e-01,
         5.28954490e-01,

In [4]:
# make predictions and evaluate
import numpy as np
from sklearn import metrics
preds = lm.predict(X_test)
print 'RMSE (no regularization) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
#Root Mean Square Error

RMSE (no regularization) = 0.233813676495


In [5]:
########## Ridge Regression Model ##########
# ridge regression (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Ridge
rreg = Ridge(alpha=0.1, normalize=True)
rreg.fit(X_train, y_train)
rreg.coef_
preds = rreg.predict(X_test)
print 'RMSE (Ridge reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# Is this model better? Why? Yes, less error, better prediction of data

RMSE (Ridge reg.) = 0.164279068049


In [6]:
# use RidgeCV to select best alpha
from sklearn.linear_model import RidgeCV
alpha_range = 10.**np.arange(-2, 3)
rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range)
rregcv.fit(X_train, y_train)
rregcv.alpha_
preds = rregcv.predict(X_test)
print 'RMSE (Ridge CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# What is the range of alpha values we are searching over? 10^-2, 3

RMSE (Ridge CV reg.) = 0.163129782343


In [7]:
########## Lasso Regression Model ##########
# lasso (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Lasso
las = Lasso(alpha=0.01, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (Lasso reg.) = 0.198165225429


In [34]:
# try a smaller alpha
las = Lasso(alpha=0.002, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))


RMSE (Lasso reg.) = 0.159936096803


In [9]:
# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
lascv = LassoCV(normalize=True, alphas=alpha_range)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (Lasso CV reg.) = 0.198165225429


### Lookup [Elastic Net](http://scikit-learn.org/stable/modules/linear_model.html#elastic-net) and complete the following.



1. What is elastic net?
2. How does it work?
3. Run elastic net on the above dataset

In [10]:
#Elastic Net is a linar regression model trained with L1 and L2 as regulariser. 

#This combination allows for learning a sparse model where few of the weights are non-zero like Lasso, 
#while still maintaining the regularization properties of Ridge. 

#Elastic-net is useful when there are multiple features which are correlated with one another. 
#Lasso is likely to pick one of these at random, while elastic-net is likely to pick both.

#A practical advantage of trading-off between Lasso and Ridge is it allows Elastic-Net to inherit 
#some of Ridge’s stability under rotation.

In [11]:
# Minimising ... something. Weights rows between lasso and ridge features?

In [43]:
#USe elasticnet on above data
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=0.01, l1_ratio=0.7)

enet.fit(X_train, y_train)
enet.coef_

preds = enet.predict(X_test)

print 'RMSE (ElasticNet reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (ElasticNet reg.) = 0.160945198724


In [22]:
#use ElasticNetCV to select best alpha
from sklearn.linear_model import ElasticNetCV
alpha_range = 10.**np.arange(-3, 4)
elascv = ElasticNetCV(normalize=True, alphas=alpha_range)
elascv.fit(X_train, y_train)
elascv.alpha_
elascv.coef_
preds = elascv.predict(X_test)
print 'RMSE (ElasticNet CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (ElasticNet CV reg.) = 0.163708913832
