In [3]:
import pandas  as pd #Data manipulation
import numpy as np #Data manipulation
import matplotlib.pyplot as plt # Visualization
import seaborn as sns #Visualization
# Global matplotlib defaults applied to every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = [8,5]
plt.rcParams['font.size'] =14
plt.rcParams['font.weight']= 'bold'
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so pick whichever spelling this install provides.
whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
plt.style.use(whitegrid_style)

# Import scikit learn linear regression
from sklearn.linear_model import LinearRegression

In [4]:
# Read the insurance data from the CSV that sits one directory above the notebook
path = '../insurance.csv'
df = pd.read_csv(path)

# Report the frame's dimensions, framed by a blank line above and below
print('\nNumber of rows and columns in the data set: ', df.shape, end='\n\n')

# Preview the first rows to check the columns were parsed as expected
df.head()


Number of rows and columns in the data set:  (1338, 7)



Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Now we are concerned with using multiple linear regression to identify a potential relationship.

Suppose we say that we suspect insurance charges are a function of age, sex, bmi, # of children, whether the person is a smoker and what region they live in. This would amount to a multiple linear regression problem:

y_i = a_0 + a_1*x_i1 + a_2*x_i2 + ... + a_k*x_ik

where i indexes the ith observation and is in {1, 2, ..., n}, with n the number of observations; a_m is the mth coefficient; and x_ij is the value in the jth column of the ith row. Both m and j are in {1, 2, ..., k}, where k is the number of feature columns, and a_0 is the intercept.

The goal of this problem is to use our data to estimate the intercept a_0 and the coefficients a_m, where m is in {1, 2, ..., k}.

Specifically, we have 6 independent variables and the response variable is the charges

----------------------------------------
First, we have to do some data cleaning:

In [5]:
# List the distinct categories present in the 'sex' column before encoding it
pd.unique(df['sex'])

array(['female', 'male'], dtype=object)

In [6]:
# List the distinct categories present in the 'smoker' column before encoding it
pd.unique(df['smoker'])

array(['yes', 'no'], dtype=object)

In [7]:
# List the distinct categories present in the 'region' column before encoding it
pd.unique(df['region'])

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [22]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# Convert the string-valued columns into integer codes, since linear regression
# needs a numeric representation of every feature.
# https://contactsunny.medium.com/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621
# A separate encoder is fitted per column and kept in `encoders`, so each column's
# category <-> code mapping (classes_ / inverse_transform) stays available; re-fitting
# one shared instance would keep only the mapping of the last column.
# NOTE(review): integer codes give the four `region` values an arbitrary order in a
# linear model; one-hot encoding may be more appropriate - confirm before
# interpreting the region coefficient.
encoders = {}
for column in ('sex', 'smoker', 'region'):
    # `labelencoder` is still the name used, so after the loop it refers to the
    # encoder fitted on 'region', matching the state the original cell left behind.
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])
    encoders[column] = labelencoder

In [10]:
# Display the frame after encoding to confirm sex, smoker and region are now numeric
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


In [11]:
# Split the data into training and test partitions.
# Features: every column of df other than 'charges'. Target: the 'charges' column.
features = df.drop(columns='charges')
target = df['charges']

# Hold out 25% of the rows for testing (75% for training); random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=1,
)

In [12]:
# Inspect the training features produced by the split
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
1043,28,0,25.800,0,0,3
968,21,1,25.745,2,0,0
594,41,1,40.260,0,0,2
1079,63,1,33.660,3,0,2
1051,64,1,26.410,0,0,0
...,...,...,...,...,...,...
715,60,1,28.900,0,0,3
905,26,0,29.355,2,0,0
1096,51,0,34.960,2,1,0
235,40,0,22.220,2,1,2


In [13]:
# Inspect the held-out test features produced by the split
X_test

Unnamed: 0,age,sex,bmi,children,smoker,region
559,19,1,35.530,0,0,1
1087,57,1,31.540,0,0,1
1020,51,1,37.000,0,0,3
460,49,0,36.630,3,0,2
802,21,1,22.300,1,0,3
...,...,...,...,...,...,...
1192,58,0,32.395,1,0,0
628,58,1,38.000,0,0,3
1098,52,0,30.875,0,0,0
1038,22,1,28.880,0,0,0


In [14]:
# Inspect the training targets (charges) produced by the split
y_train

1043     3161.45400
968      3279.86855
594      5709.16440
1079    15161.53440
1051    14394.55790
           ...     
715     12146.97100
905      4564.19145
1096    44641.19740
235     19444.26580
1061    11554.22360
Name: charges, Length: 1003, dtype: float64

In [15]:
# Inspect the held-out test targets (charges) produced by the split
y_test

559      1646.42970
1087    11353.22760
1020     8798.59300
460     10381.47870
802      2103.08000
           ...     
1192    13019.16105
628     11365.95200
1098    23045.56616
1038     2250.83520
936     32108.66282
Name: charges, Length: 335, dtype: float64

In [16]:
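# Now run the min max scaler to scale all of the features (currently disabled: the call below is commented out).
# NOTE: the commented-out attempt passes the data to the MinMaxScaler constructor, which does not
# scale anything, and assigns to `x_test` instead of `X_test`. If scaling is wanted, fit the scaler
# on the training split only and apply it to both splits, e.g.
#   scaler = MinMaxScaler().fit(X_train)
#   X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
# X_train, x_test = MinMaxScaler(X_train), MinMaxScaler(X_test)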

In [17]:
# Build an ordinary least squares model, then fit it on the training split
reg = LinearRegression()
reg.fit(X_train, y_train)

In [19]:
# Use the fitted model to predict charges for the held-out test features;
# y_pred is compared against y_test when the metrics are computed
y_pred = reg.predict(X_test)

In [21]:
# Report the fitted parameters of the regression.
# The intercept (a_0); the label previously read 'Interccept'
print('Intercept: \n', reg.intercept_)
# The coefficients, one per feature, in the column order of X_train
print('Coefficients: \n', reg.coef_)

Interccept: 
 -11074.74865855589
Coefficients: 
 [  252.07830816  -270.1850953    316.55473012   364.49255776
 24069.87438208  -259.23126409]


In [23]:
# Score the test-set predictions: mean squared error and coefficient of determination
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print both metrics rounded to two decimals
print('MSE: %.2f' % mse)
print('R^2: %.2f' % r2)

MSE: 35807072.81
R^2: 0.73
