In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error
from sklearn import preprocessing
from sklearn import utils
from math import sqrt

# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [5]:
# Location of the insurance dataset. The original machine-specific path stays
# the default; set the INSURANCE_CSV environment variable to load the file from
# somewhere else without editing the notebook.
DATA_PATH = os.environ.get(
    'INSURANCE_CSV',
    r"C:\Users\Arti-PC\Downloads\mldataset\insurance.csv",
)

# A comma is read_csv's default separator, so it is not passed explicitly.
data = pd.read_csv(DATA_PATH)

# Convert the three label columns to the pandas 'category' dtype in one call.
data = data.astype({'sex': 'category', 'region': 'category', 'smoker': 'category'})

data.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
def Bmi_Classifier(row):
    """Map a record's ``bmi`` value to a weight-category label.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row['bmi']`` (for example a DataFrame row
        passed in by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        "underweight" when bmi is below 18.5, "normal" when below 25,
        "overweight" when below 30, and "obese" otherwise.
    """
    bmi = row['bmi']
    # Upper bounds are exclusive and are tested in ascending order.
    for upper_bound, label in ((18.5, "underweight"), (25, "normal"), (30, "overweight")):
        if bmi < upper_bound:
            return label
    return "obese"

def Age_Classifier(row):
    """Map a record's ``age`` value to an age-group label.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row['age']`` (for example a DataFrame row
        passed in by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        "child" when age is below 17, "adult" when below 31,
        "middle-aged" when below 46, and "old-aged" otherwise.
    """
    age = row['age']
    # Exclusive upper bounds in ascending order; the first match wins.
    brackets = ((17, "child"), (31, "adult"), (46, "middle-aged"))
    return next((label for limit, label in brackets if age < limit), "old-aged")

# Derive one label column per classifier by applying it to every row.
for status_column, classifier in (
    ('bmi_status', Bmi_Classifier),
    ('age_status', Age_Classifier),
):
    data[status_column] = data.apply(classifier, axis=1)

data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmi_status,age_status
0,19,female,27.9,0,yes,southwest,16884.924,overweight,adult
1,18,male,33.77,1,no,southeast,1725.5523,obese,adult
2,28,male,33.0,3,no,southeast,4449.462,obese,adult
3,33,male,22.705,0,no,northwest,21984.47061,normal,middle-aged
4,32,male,28.88,0,no,northwest,3866.8552,overweight,middle-aged


In [7]:
# Keep only the smokers, restricted to the columns used for modelling.
smoker_mask = data['smoker'] == 'yes'
model_columns = ['sex', 'children', 'region', 'charges', 'bmi_status', 'age_status']
data2 = data.loc[smoker_mask, model_columns]

# One-hot encode every selected column except the target, 'charges'.
encoded_columns = [name for name in model_columns if name != 'charges']
new_data = pd.get_dummies(data2, columns=encoded_columns)
new_data.head()

Unnamed: 0,charges,sex_female,sex_male,children_0,children_1,children_2,children_3,children_4,children_5,region_northeast,region_northwest,region_southeast,region_southwest,bmi_status_normal,bmi_status_obese,bmi_status_overweight,bmi_status_underweight,age_status_adult,age_status_middle-aged,age_status_old-aged
0,16884.924,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0
11,27808.7251,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
14,39611.7577,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0
19,36837.467,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0
23,37701.8768,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0


In [8]:
# Feature matrix: every column except the target, as a NumPy array.
X = new_data.drop(columns='charges').values
y = new_data['charges']

# 80/20 split with a fixed seed so the partition is repeatable.
split_options = dict(train_size=0.80, test_size=0.20, random_state=15)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# fit() returns the estimator itself, so construction and fitting are chained.
lm = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Predictions on the training partition, reused when the metrics are printed.
y_pred = lm.predict(X_train)

In [9]:
# Test-set predictions are computed once and shared by the RMSE and MAE lines.
y_test_pred = lm.predict(X_test)

# NOTE(review): for a regressor, score() is the R^2 coefficient of determination
# rather than a classification accuracy; the printed labels are left unchanged.
# Builtin round() is used on every line because it accepts both Python floats
# and NumPy scalars, whereas the .round() method exists only on the latter.
print('The accuracy on the training data:', round(lm.score(X_train, y_train), 3))
print('The accuracy on the testing data:', round(lm.score(X_test, y_test), 3))
print(' ')
print('Root Mean Squared Error (RMSE)')
# y_pred holds the training-set predictions produced when the model was fitted.
print('The RMSE on the training dataset:', round(sqrt(mean_squared_error(y_train, y_pred)), 3))
print('The RMSE on the testing dataset:', round(sqrt(mean_squared_error(y_test, y_test_pred)), 3))
print(' ')
print('Mean Absolute Error (MAE)')
print('The MAE on the training dataset:', round(mean_absolute_error(y_train, y_pred), 3))
print('The MAE on the testing dataset:', round(mean_absolute_error(y_test, y_test_pred), 3))

The accuracy on the training data: 0.879
The accuracy on the testing data: 0.802
 
Root Mean Squared Error (RMSE)
The RMSE on the training dataset: 3985.45
The RMSE on the testing dataset: 5201.759
 
Mean Absolute Error (MAE)
The MAE on the training dataset: 2644.327
The MAE on the testing dataset: 3336.527
