In [2]:
# -*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#importing the data set
dataset = pd.read_csv('D:\datascience-practice\insurance.csv')

x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,6].values

#Encoding the categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#for encoding places
labelencoder = LabelEncoder()
x[:, 5] = labelencoder.fit_transform(x[:, 5])

#for encoding smoking
labelencoder_place = LabelEncoder()
x[:, 4] = labelencoder_place.fit_transform(x[:, 4])

#for encoding gender
labelencoder_gender = LabelEncoder()
x[:, 1] = labelencoder_place.fit_transform(x[:, 1])

#converting the region attributes
onehotencoder = OneHotEncoder(categorical_features = [5])
x = onehotencoder.fit_transform(x).toarray()
#Avoding the dummy variable trap
x = x[:, 1:]


#converting the smoking attributes
onehotencoder_smoke = OneHotEncoder(categorical_features = [7])
x = onehotencoder_smoke.fit_transform(x).toarray()
x = x[:, 1:]

#converting the gender attributes
onehotencoder_gender = OneHotEncoder(categorical_features = [5])
x = onehotencoder_gender.fit_transform(x).toarray()
x = x[:, 1:]



from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()

x = sc_x.fit_transform(x)

y = sc_y.fit_transform(y.reshape(-1,1))

x = np.delete(x,1,axis=1)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)


In [3]:
# Our task is a regression task where 
# we are predicting the insurance amount claimed by people depending 
# on their age, their location and other features. 
# Trained a model using Linear regression.

from sklearn.linear_model import LinearRegression
regresser = LinearRegression()
regresser.fit(x_train, y_train)

#predicting the test set results
y_pred = regresser.predict(x_test)



In [4]:
#  After building a machine learning model, we need to validate the model using some metrics.
#  For regression tasks, these are the metrics we generally used.

#  1. R squared error: It measures how well our data fit into the model and how 
#  accurately the model is predicting the unseen data.

#     R^2 = 1 - (SSE/TSS) 
#               where SSE is defined as the sum of squared errors (actual target - predicted target)
#               where TSS is defined as the total sum of squares (Actual target - mean of target)
# The possible values of this be 0 to 1. The more closer to zero, the better the model is.

# Here is the python code snippet to calculate R^2 error 
regresser.score(x_test,y_test)

0.16434435894991628

In [5]:

# 2. Max error metric: defines as the max difference between the true target and predicted target.
# lower the value, better the model is

# Here is the python code snippet to calculate max error metric
arr = np.setdiff1d(y_test,y_pred)
arr = np.absolute(arr)
np.max(arr)


3.248148025811295

In [6]:

# 3. Root Mean Squared Error(RMSE): defined as the square root of mean squared error. 
#  where mean suared error is defined the mean of sum of the square difference 
#  between actual target and predicted target.

# RMSE is a measure of the average deviation of the estimates from the observed values. 
# lower the value, better the model is.

# Here is the python code snippet to calculate RMSE
from sklearn.metrics import mean_squared_error
import math
math.sqrt(mean_squared_error(y_test,y_pred))


0.9525931660665768