In [1]:
#Importing the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
companies = pd.read_csv("insuranceData.csv") #Reading the dataset in the csv and showing it using pandas
X = companies.iloc[:, :-1].values #Import all 'X' data except for the last row
y = companies.iloc[:, 4].values #Sets Y equal to the last row

companies.head()

Unnamed: 0,age,bmi,children,smoker,charges
0,19,27.9,0,yes,16884.924
1,18,33.77,1,no,1725.5523
2,28,33.0,3,no,4449.462
3,33,22.705,0,no,21984.47061
4,32,28.88,0,no,3866.8552


In [3]:
#Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3]) #Chooses the "Smoker" section and allows us to passover the label to the onehotencoder
onehotencoder = ColumnTransformer([("Smoker", OneHotEncoder(), [3])], remainder="passthrough") #This sets the categorical data in column 3 to numerical data
X = onehotencoder.fit_transform(X)

In [4]:
#Avoiding the Dummy Variable Trap
X = X[:, 1:]

In [5]:
#Splitting the dataset into Training and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)

#test_size sets aside 15% of all items for testing - random_state at value 0 randomizes the 20% of values it sets aside

In [6]:
#Fitting Multiple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [7]:
#Predicting the Test set Results
y_pred = regressor.predict(X_test)
y_pred

array([ 1.14589037e+04,  9.92563631e+03,  3.78118827e+04,  1.58808943e+04,
        6.97360055e+03,  3.68353076e+03,  1.15342243e+03,  1.39982132e+04,
        8.75503190e+03,  7.23945973e+03,  4.22918693e+03,  1.01339013e+04,
        9.07255566e+03,  4.34700013e+03,  2.78291848e+04,  1.11703666e+04,
        1.12276150e+04,  5.55254705e+03,  8.02828014e+03,  2.67296924e+04,
        3.36425305e+04,  1.40398303e+04,  1.12405201e+04,  3.27140150e+04,
        4.50454593e+03,  8.59659654e+03,  8.77241144e+02,  1.01373043e+04,
        4.28492066e+03,  1.01976920e+04,  8.78895560e+03,  4.03641538e+04,
        1.58290309e+04,  1.40146947e+04,  2.45685410e+04,  4.71532278e+03,
        1.28534755e+04,  3.09849097e+04,  3.33370702e+04,  3.84806679e+03,
        3.71745446e+03,  4.46372948e+03,  3.02320057e+04,  3.92297404e+04,
        2.81489195e+04,  4.76793034e+03,  1.10671287e+04,  7.69629560e+03,
        3.27251571e+03,  1.07866355e+04,  5.10182957e+03,  3.20203963e+03,
        3.34514718e+04,  

In [8]:
#Calculating the Coefficients
print(regressor.coef_)

[23699.70723512   254.72171877   312.84193584   429.55191964]


In [9]:
#Calculating the Intercept
print(regressor.intercept_)

-11664.004052172022


In [10]:
#Calculating the R Scored Value
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.7904465130093584