# 1.) Import the data from CCLE into a new Google Colab file

In [1]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np

In [2]:
# Attach Google Drive to the Colab filesystem; force_remount refreshes an
# existing mount so re-running this cell does not fail.
drive.mount(mountpoint='/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [3]:
# Load the insurance data set from the mounted Drive. The path is absolute
# (it starts at the mount point used by drive.mount) so the read does not
# depend on the kernel's current working directory.
df = pd.read_csv('/content/gdrive/MyDrive/ML/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Encode the two binary categoricals as integers: female/yes -> 1, male/no -> 0.
# Mapping the whole column produces a real integer dtype rather than an object
# column that merely holds Python ints, and astype(int) raises if a category
# outside the mapping is present instead of letting it slip through.
binary_codes = {
    'sex': {'female': 1, 'male': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for col, codes in binary_codes.items():
    # Skip columns that are already integer-encoded so the cell can be re-run.
    if not pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].map(codes).astype(int)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,southwest,16884.924
1,18,0,33.77,1,0,southeast,1725.5523
2,28,0,33.0,3,0,southeast,4449.462
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.88,0,0,northwest,3866.8552


In [5]:
# Turn the region labels into integer codes stored in a new column.
# NOTE(review): integer codes put an arbitrary order on a nominal variable;
# consider one-hot encoding if that ordering should not enter a linear model.
le = preprocessing.LabelEncoder()
df['e_region'] = le.fit_transform(df['region'])

In [6]:
# Count rows per (region label, integer code) pair to verify the encoding
# assigns exactly one code to each region.
df.groupby(['region','e_region']).size()

region     e_region
northeast  0           324
northwest  1           325
southeast  2           364
southwest  3           325
dtype: int64

In [7]:
# Feature matrix: the numeric and encoded predictors, in the order the model
# coefficients will later be reported.
feature_cols = ['age', 'sex', 'bmi', 'children', 'smoker', 'e_region']
data = df[feature_cols]
# Target is kept as a single-column DataFrame.
target = df.loc[:, ['charges']]

In [8]:
# Preview the first rows of the feature matrix.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,e_region
0,19,1,27.9,0,1,3
1,18,0,33.77,1,0,2
2,28,0,33.0,3,0,2
3,33,0,22.705,0,0,1
4,32,0,28.88,0,0,1


In [9]:
# Preview the first rows of the target column.
target.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


# 2.) Split the data into 80/20, in/out sample

In [10]:
# Row position where the first 80% of the data ends (fraction truncated
# to a whole number of rows).
cut = int(len(df) * 0.8)

In [11]:
# Positional split: rows before `cut` form the in-sample set, the rest the
# out-of-sample set. The rows are taken in file order (no shuffling).
in_data, out_data = data.iloc[:cut], data.iloc[cut:]
in_target, out_target = target.iloc[:cut], target.iloc[cut:]

# 3.) Normalize the Data

In [12]:
# Standardize the features. The scaler is fitted on the in-sample data only
# and then reused for the out-of-sample data, so no out-of-sample statistics
# enter the fit.
scaler = preprocessing.StandardScaler()
in_data_scale = scaler.fit_transform(in_data)
out_data_scale = scaler.transform(out_data)

In [13]:
# Display the unscaled in-sample features for comparison with the scaled array.
in_data

Unnamed: 0,age,sex,bmi,children,smoker,e_region
0,19,1,27.900,0,1,3
1,18,0,33.770,1,0,2
2,28,0,33.000,3,0,2
3,33,0,22.705,0,0,1
4,32,0,28.880,0,0,1
...,...,...,...,...,...,...
1065,42,1,25.300,1,0,3
1066,48,0,37.290,2,0,2
1067,39,0,42.655,0,0,0
1068,63,0,21.660,1,0,1


In [14]:
# Display the standardized in-sample features.
in_data_scale

array([[-1.45390122,  1.02077653, -0.47403991, -0.90286618,  1.99417757,
         1.34066165],
       [-1.52464584, -0.97964634,  0.4911621 , -0.06353793, -0.50145986,
         0.4373191 ],
       [-0.81719959, -0.97964634,  0.36455128,  1.61511857, -0.50145986,
         0.4373191 ],
       ...,
       [-0.03900872, -0.97964634,  1.95211949, -0.90286618, -0.50145986,
        -1.36936599],
       [ 1.65886228, -0.97964634, -1.50008089, -0.06353793, -0.50145986,
        -0.46602344],
       [ 1.02216066,  1.02077653,  0.18367867, -0.06353793, -0.50145986,
         0.4373191 ]])

# 4.) Get lambda from Lasso cross validation

In [15]:
from sklearn.linear_model import LassoCV

In [16]:
# Cross-validated Lasso on the standardized in-sample data. The target is
# flattened to 1-D because passing a single-column DataFrame makes
# scikit-learn emit a DataConversionWarning (column_or_1d).
modCV = LassoCV().fit(in_data_scale, np.ravel(in_target))

  y = column_or_1d(y, warn=True)


In [17]:
# Penalty strength (the "lambda" of this section) selected by LassoCV.
a = modCV.alpha_

In [18]:
# Show the selected penalty strength.
a

133.34880015958146

# 5.) Run a Lasso regression with that lambda

In [19]:
from sklearn.linear_model import Lasso

In [20]:
# Refit a plain Lasso on the in-sample data using the cross-validated penalty.
model = Lasso(alpha=a)
model.fit(in_data_scale, in_target)

# 6.) Visualize the coefficients 

In [21]:
# Bar chart of the fitted Lasso coefficients, one bar per feature in the
# column order of `data`. The features were standardized before fitting, so
# the bar heights are directly comparable across features.
fig, ax = plt.subplots()
ax.bar(data.columns, model.coef_)
ax.axhline(0, color='black', linewidth=0.8)  # zero line makes the sign of each bar obvious
ax.set(
    title='Lasso coefficients (standardized features)',
    xlabel='feature',
    ylabel='coefficient',
)
plt.show()

# Keep the raw values as the cell output.
model.coef_

array([3564.26137833,    0.        , 1890.8527485 ,  398.24050447,
       9324.84227138, -326.52198757])

# 7.) Interpret the coefficients

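The coefficients are reported in the order age, sex, bmi, children, smoker, region. Because the features were standardized, each coefficient is the change in $charges$ associated with a one-standard-deviation increase in that feature, which is why the magnitudes are generally high. However, the coefficient for $sex$ is exactly 0: Lasso dropped that variable from the model. The variables $age$, $bmi$, $children$ and $smoker$ all have a positive relationship with $charges$, whereas the encoded region variable has a negative association with the target. The most important variable for explaining $charges$ is the dummy for smokers (about 9,325), followed by $age$ (about 3,564).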

# 8.) Compare in and out of sample MSE’s

In [22]:
# Template for the next two cells:
#   model.predict(<in-sample data>)
#   model.predict(<out-of-sample data>)

In [23]:
# Fitted values on the in-sample data (already a NumPy array after scaling).
in_predict = model.predict(in_data_scale)
in_predict

array([24933.78011265,  3864.47297708,  6815.10290666, ...,
       12177.79520243, 11741.14195836, 12360.55112834])

In [24]:
# Predictions on the held-out data (already a NumPy array after scaling).
out_predict = model.predict(out_data_scale)
out_predict

array([32952.80313085, 14744.12497151,  3799.69238402, 12345.77306264,
        9911.6084389 ,  6094.99247976, 10331.61480604,  2470.13710918,
       28673.36061393, 15845.63670241,   470.88966821,  5844.30269417,
        4905.67365256,  6393.87366329, 14570.1538115 , 28663.34178477,
       11958.95304947, 12965.78642268, 16781.09641562,  9188.70008979,
       34866.46919685, 12224.78301872,  5166.10635585, 27057.44013644,
       12619.38614283,  4704.75126934, 36751.08789356,  4538.8278103 ,
       11793.2293104 ,  5697.94001377, 27279.72310631, 11455.93619318,
        8245.564964  , 14334.51914456,  6791.83001513, 11821.09454125,
       10446.44446989, 10215.17594893,  4414.59128802,  7168.65270048,
       13074.83843409, 34283.86863971, 32906.5524376 ,  5031.61575451,
        2501.85516656, 12852.10589148, 10306.8495343 , 29098.41782478,
       31533.78962903,  3556.96398613, 27030.58230801, 12627.00673087,
       37885.41912957,  6296.26206066, 31778.57440314, 12470.20732982,
      

In [25]:
from sklearn.metrics import mean_squared_error

In [26]:
# In-sample mean squared error.
mean_squared_error(y_true=in_target, y_pred=in_predict)

36490415.101693384

In [27]:
# Out-of-sample mean squared error.
mean_squared_error(y_true=out_target, y_pred=out_predict)

37252730.724018715

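The model has a lower MSE in sample (about 36.49 million) than out of sample (about 37.25 million), as expected. The gap is only about 2%, so the model generalizes to the held-out data with little sign of overfitting.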