## MODEL 6: LASSO REGRESSION

In [6]:
# LASSO (Least Absolute Shrinkage and Selection Operator) is a type of linear regression
# that adds an L1 regularization penalty to the cost function.
# The penalty shrinks some coefficients exactly to zero, which performs feature selection.
# Use it when:
#   - you suspect multicollinearity among the features
#   - you want a simpler model that uses fewer features

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# Load the insurance dataset from a CSV hosted on GitHub (requires network access).
url = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
df = pd.read_csv(url)

# Only the last expression of a cell is rendered, so this cell reports the
# frame's dimensions as (rows, columns).
df.shape

(1338, 7)

In [12]:
# Count the missing values in each column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [14]:
# Frequency of each smoker category.
df["smoker"].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [16]:
# Frequency of each region.
df["region"].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [18]:
# Display the frame; the rendered table elides the middle rows.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [20]:
# Frequency of each age value, most common first.
df["age"].value_counts()

age
18    69
19    68
50    29
51    29
47    29
46    29
45    29
20    29
48    29
52    29
22    28
49    28
54    28
53    28
21    28
26    28
24    28
25    28
28    28
27    28
23    28
43    27
29    27
30    27
41    27
42    27
44    27
31    27
40    27
32    26
33    26
56    26
34    26
55    26
57    26
37    25
59    25
58    25
36    25
38    25
35    25
39    25
61    23
60    23
63    23
62    23
64    22
Name: count, dtype: int64

## ONE HOT ENCODING

In [23]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each category, so k levels become k-1 indicator columns.
df = pd.get_dummies(data=df, drop_first=True)
df

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,False,True,False,False,True
1,18,33.770,1,1725.55230,True,False,False,True,False
2,28,33.000,3,4449.46200,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.880,0,3866.85520,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False,True,False,False
1334,18,31.920,0,2205.98080,False,False,False,False,False
1335,18,36.850,0,1629.83350,False,False,False,True,False
1336,21,25.800,0,2007.94500,False,False,False,False,True


In [25]:
# Features: every column except the target.
X = df.drop(columns="charges")
# Target: the insurance charges.
y = df.loc[:, "charges"]

In [27]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# A higher alpha strengthens the L1 regularization, so more coefficients shrink to zero.
# NOTE(review): X is not standardized before fitting and the L1 penalty is
# scale-sensitive -- confirm whether the features should be scaled first.
model = Lasso(alpha=1000)
model.fit(X_train, y_train)

In [29]:
# Predict charges for the held-out test rows.
y_pred = model.predict(X_test)

In [33]:
# Mean squared error on the test set (in squared units of the target).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

42198156.44689197

In [35]:
# Coefficient of determination (R^2) on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.7281900081152803

In [37]:
# Fitted coefficients of the Lasso model; entries equal to zero belong to
# features the L1 penalty removed from the model.
coef = model.coef_
coef

array([  246.4504856 ,   301.19485661,     0.        ,     0.        ,
       17528.13654862,    -0.        ,     0.        ,    -0.        ])

In [39]:
# Label each fitted coefficient with the name of its feature column.
coef_series = pd.Series(data=model.coef_, index=X.columns)

In [41]:
# Features whose coefficient was shrunk to zero (-0.0 also compares equal to 0).
is_zero = coef_series.eq(0)
zero_coef_features = coef_series.loc[is_zero]
zero_coef_features

children            0.0
sex_male            0.0
region_northwest   -0.0
region_southeast    0.0
region_southwest   -0.0
dtype: float64