In [1]:
# Introduction

# In this project, we use the 'Medical Cost Personal Datasets' dataset
# from Kaggle and compare several regression models to
# find out which of them achieves the best r2 score on a held-out test set

# Importing basic libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

df = pd.read_csv('../input/insurance/insurance.csv')

# Checking for any missing values.
# As a bare expression in the middle of a cell the result of this check
# was computed and then thrown away, so the check is now enforced: the
# notebook stops here if any value in the dataset is missing

assert not df.isna().any().any(), 'dataset contains missing values'

# Displaying first five rows of the dataset
# (rendered by Jupyter only when this is the last expression of its cell)

df.head()

# Putting the three categorical columns first, then the numeric
# ones, with the target ('charges') as the very last column, so the
# categorical features can be encoded by position

column_order = ['sex', 'smoker', 'region', 'age', 'bmi', 'children', 'charges']
df = df[column_order]

# Independent variables: every column except the last one.
# Dependent variable: the last column

X = df[column_order[:-1]].values
y = df[column_order[-1]].values

# Listing the distinct values of the 'region' category

print(df['region'].unique())

# Encoding first three categorical features of dataset

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns 0, 1 and 2 are one-hot encoded; the remaining columns are
# passed through unchanged and end up after the encoded ones.
# sparse_threshold=0 makes the transformer always return a dense array:
# with the default threshold the result can be a SciPy sparse matrix when
# the encoded data is mostly zeros, and np.array() does not turn such a
# matrix into a regular 2-D array
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 1, 2])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

print(X)

# Holding out 25% of the rows as the test set; the fixed
# random_state keeps the split identical on every run

from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split

print(y_train)

# Multiple Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Creating the regressor and training it on the training set

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the dependent variable for the test set

y_pred = regressor.predict(X_test)

# Calculating the r2 score of the predictions against the true values

score_lr = r2_score(y_test, y_pred)
print('Multiple Linear Regression - ', score_lr)

# Decision Tree Regression

from sklearn.tree import DecisionTreeRegressor

# fit() returns the estimator itself, so creating and training
# the regressor can be written as a single expression
regressor_dt = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# Predicting on the test set and reporting the r2 score
y_pred_dt = regressor_dt.predict(X_test)
score_dt = r2_score(y_test, y_pred_dt)
print('Decision Tree - ', score_dt)

# Random Forest Regression

from sklearn.ensemble import RandomForestRegressor

# A forest of 10 trees, seeded so the result is reproducible
regressor_rf = RandomForestRegressor(
    n_estimators=10,
    random_state=0,
)
regressor_rf.fit(X_train, y_train)

# Predicting on the test set and reporting the r2 score
y_pred_rf = regressor_rf.predict(X_test)
score_rf = r2_score(y_test, y_pred_rf)
print('Random Forest - ', score_rf)

# Polynomial Regression

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expanding the training features into polynomial terms up to degree 4
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)

# Fitting an ordinary linear model on the expanded features
regressor_pr = LinearRegression()
regressor_pr.fit(X_poly, y_train)

# The test features go through the same (already fitted) expansion
# before being passed to the model
X_test_poly = poly_reg.transform(X_test)
y_pred_pr = regressor_pr.predict(X_test_poly)

print('Polynomial Regression - ', r2_score(y_test, y_pred_pr))

# Support Vector Regression

# Unlike the regression models above, Support Vector Regression
# needs Feature Scaling of both the features and the target

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

sc_X = StandardScaler()
sc_y = StandardScaler()

# The scaled data is stored under new names so that X_train, y_train and
# y_test keep their original contents. Overwriting them meant that
# re-running this section refitted the scalers on already-scaled data
# (giving wrong predictions) and left scaled data behind for any model
# trained afterwards
X_train_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.transform(X_test)

# StandardScaler works on 2-D input, whereas SVR expects a 1-D target:
# the target is reshaped into a column for scaling and flattened again
# before fitting (fitting on a column vector raises a DataConversionWarning)
y_train_scaled = sc_y.fit_transform(y_train.reshape(-1, 1)).ravel()

regressor_svm = SVR(kernel = 'rbf')
regressor_svm.fit(X_train_scaled, y_train_scaled)

# Predictions are made in the scaled target space and converted back to
# the original scale of 'charges' before scoring; the result is flattened
# to 1-D like the predictions of the other models
y_pred_svm = sc_y.inverse_transform(
    regressor_svm.predict(X_test_scaled).reshape(-1, 1)
).ravel()

print('Support Vector Regression - ', r2_score(y_test, y_pred_svm))

# Conclusion

# The best r2 score is about 0.89 (0.887), achieved with
# Support Vector Regression (RBF kernel), followed by Random Forest,
# which shows an r2 score of 0.86




















['southwest' 'southeast' 'northwest' 'northeast']
[[1.0 0.0 0.0 ... 19 27.9 0]
 [0.0 1.0 1.0 ... 18 33.77 1]
 [0.0 1.0 1.0 ... 28 33.0 3]
 ...
 [1.0 0.0 1.0 ... 18 36.85 0]
 [1.0 0.0 1.0 ... 21 25.8 0]
 [1.0 0.0 0.0 ... 61 29.07 0]]
[ 4562.8421 13616.3586  1837.237  ...  5415.6612  1646.4297  4766.022 ]
Multiple Linear Regression -  0.7958786376014414
Decision Tree -  0.7322987900922175
Random Forest -  0.8640082268503408
Polynomial Regression -  0.7940736669107992
Support Vector Regression -  0.8870944309282809


  y = column_or_1d(y, warn=True)
