In [1]:
# Python packages

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Show every column when a DataFrame is rendered (None removes the column limit).
# Call the documented top-level `pd.set_option` rather than reaching it through `pd.pandas`.
pd.set_option('display.max_columns', None)

In [2]:
# Read the insurance data directly from the zipped CSV hosted on GitHub

dataset = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/iamaakashpal/Insurance-Premium-Prediction/main/dataset/archive.zip",
)

In [3]:
# Displaying dataset
# (a bare name as the last expression of a cell is rendered as the cell's output)

dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [4]:
# Rows flagged by duplicated(), i.e. repeats of a row that already appeared earlier

dataset.loc[dataset.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
581,19,male,30.6,0,no,northwest,1639.56


In [5]:
# Remove the repeated rows; the surviving rows keep their original index labels

dataset = dataset.drop_duplicates()

In [6]:
# Shape of dataset as (rows, columns), checked after the duplicate rows were dropped

dataset.shape

(1337, 7)

In [7]:
# Preview the first rows of the de-duplicated dataset
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [8]:
# Splitting Data into Independent and Dependent Variable
# The target is the last column; every column before it is a feature.
# .copy() makes x an independent frame: the encoding cell further down assigns
# to x's columns, and doing that on a bare slice of `dataset` can raise
# SettingWithCopyWarning and leaves it unclear whether `dataset` changes too.

x = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]

In [9]:
# Preview the first rows of the feature frame (target column excluded)
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest


In [10]:
# Preview the first values of the target series
y.head()

0    16884.92
1     1725.55
2     4449.46
3    21984.47
4     3866.86
Name: expenses, dtype: float64

In [11]:
# Inspect the column dtypes to see which features still need numeric encoding
x.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
dtype: object

In [12]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the text-valued columns into integer codes
encoder = LabelEncoder()

In [13]:
# Encode each text column as integer codes.
# A separate fitted encoder is kept per column: refitting a single shared
# encoder discards the earlier mappings, so `sex` and `smoker` could no longer
# be encoded consistently for new data or decoded with inverse_transform.
encoders = {}
for column in ('sex', 'smoker', 'region'):
    encoders[column] = LabelEncoder()
    x[column] = encoders[column].fit_transform(x[column])

# Keep the existing `encoder` name bound to the last fitted (region) encoder,
# which is the state it had when one shared encoder was refit per column.
encoder = encoders['region']

In [14]:
# Display the features after encoding; sex, smoker and region now hold integer codes
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.8,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.7,0,0,1
4,32,1,28.9,0,0,1
...,...,...,...,...,...,...
1333,50,1,31.0,3,0,1
1334,18,0,31.9,0,0,0
1335,18,0,36.9,0,0,2
1336,21,0,25.8,0,0,3


In [15]:
# Display the target series
y

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1337, dtype: float64

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=7,
)

In [17]:
# Training features as (rows, columns)
x_train.shape

(1069, 6)

In [18]:
# Test features as (rows, columns)
x_test.shape

(268, 6)

In [19]:
# Training targets: one value per training row
y_train.shape

(1069,)

In [20]:
# Test targets: one value per test row
y_test.shape

(268,)

In [21]:
from sklearn.preprocessing import StandardScaler
# Scaler that is fitted on the training features and then applied to both splits below
scaler = StandardScaler()

In [22]:
# Fit the scaler on the training split only, then replace x_train with its scaled values
x_train = scaler.fit_transform(x_train)

In [23]:
# Apply the already-fitted scaler to the test split (transform only, no refit)
x_test = scaler.transform(x_test)

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

In [25]:
# regression=LinearRegression()
# Seed the estimator so repeated runs build the same trees and report the same score
# (7 matches the seed already used for the train/test split).
gbr = GradientBoostingRegressor(random_state=7)

In [26]:
# regression.fit(x_train,y_train)
# Train the gradient-boosting model on the scaled training features
gbr.fit(X=x_train, y=y_train)

GradientBoostingRegressor()

In [27]:
# print(regression.coef_)

In [28]:
# print(regression.intercept_)

In [29]:
# regression.get_params()

In [30]:
# reg_pred=regression.predict(x_test)
# Predict expenses for the scaled test features
gbr_pred = gbr.predict(X=x_test)

In [31]:
from sklearn.metrics import r2_score

# score=r2_score(y_test,reg_pred)
# R^2 of the gradient-boosting predictions against the held-out test targets
score = r2_score(y_true=y_test, y_pred=gbr_pred)
print(score)

0.8506951338867863
