In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas_profiling as pp
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
insurance = pd.read_csv('../input/insurance/insurance.csv')

In [4]:
insurance.head()

In [5]:
insurance.info()

In [6]:
insurance_profile = pp.ProfileReport(insurance)
insurance_profile.to_file(output_file="insurance_profile.html")

In [7]:
# HTML Report
#insurance_profile.to_notebook_iframe()

In [8]:
insurance = insurance.drop_duplicates()

In [9]:
# Pull out the two columns once; later plotting cells reuse `age` and `charges`.
age = insurance['age']
charges = insurance['charges']

# Charges against age. Axis labels and a title are set so the figure can be
# read on its own when the notebook is skimmed.
plt.scatter(age, charges)
plt.xlabel('age')
plt.ylabel('charges')
plt.title('Charges by age')

plt.show()

In [10]:
# Encode the two binary text columns as 0/1 flags.
smoker = insurance['smoker'].map({'yes': 1, 'no': 0})
sex = insurance['sex'].map({'female': 1, 'male': 0})

# Colour lookup for a 0/1 flag: 0 -> blue, 1 -> red.
colours = {0: 'blue', 1: 'red'}

# Age vs. charges, coloured by the smoker flag.
# Swap `smoker` for `sex` on the next line to colour by gender instead.
plt.scatter(age, charges, c=smoker.map(colours))

plt.show()


In [11]:
# Bucket BMI into three bands encoded as integers:
#   0 = underweight (0-18.5), 1 = healthy (18.5-25), 2 = overweight (25-200).
# `between` is inclusive at both ends and np.select keeps the first matching
# condition, so a value exactly on a boundary lands in the lower band.
bmi = insurance['bmi']
criteria = [bmi.between(lo, hi) for lo, hi in ((0, 18.5), (18.5, 25), (25, 200))]
values = [0, 1, 2]

# Rows matching none of the bands fall back to 0.
# Note: `bmi` is rebound here from the raw column to the array of band codes.
bmi = np.select(criteria, values, default=0)

insurance['bmiMap'] = bmi

bmiColours = {0: 'blue', 1: 'green', 2: 'red'}

# Age vs. charges, coloured by BMI band.
plt.scatter(age, charges, c=insurance['bmiMap'].map(bmiColours))
plt.show()

In [12]:
#some ways to group data - useful for when you do preprocessing
#insurance['bmi'] = np.where(insurance['bmi'] > 15, 1000, insurance['bmi'])
#insurance.smoker.replace(('yes', 'no'), (1, 0), inplace=True)  how to replace values

In [13]:
# BMI vs. charges, coloured by smoking status, to eyeball any correlation.
# Relies on `charges`, `smoker` and `colours` from earlier cells.
bmi = insurance['bmi']

# Draw the points once, already coloured: a second uncoloured scatter of the
# same points on the same axes would simply be painted over.
plt.scatter(bmi, charges, c=smoker.map(colours))
plt.xlabel('bmi')
plt.ylabel('charges')
plt.show()

In [14]:
# Bucket charges into three bands encoded as integers:
#   0 = low (0-12500), 1 = medium (12500-30000), 2 = high (30000-70000).
charges = insurance['charges']
chargesCriteria = [
    charges.between(0, 12500),
    charges.between(12500, 30000),
    charges.between(30000, 70000),
]
values = [0, 1, 2]

# First matching band wins; rows outside every band fall back to 0.
chargesMap = np.select(chargesCriteria, values, default=0)
insurance['chargesMap'] = chargesMap

chargeLabels = ['low', 'medium', 'high']

# Per-column non-null counts over the rows that fall in the high band.
chargeValues = insurance.loc[insurance['chargesMap'] == 2].count()
chargeValues

In [15]:
# Pie chart: smoker status combined with BMI band, for rows in the HIGH
# charges band (chargesMap == 2).
# Slice names read "<smoker><BMI band>": e.g. 'yesHigh' = smoker with BMI band 2.
label = ['yesHigh', 'yesMed', 'yesLow', 'noHigh',
         'noMed', 'noLow']

# One count per (smoker, BMI band) pair, generated in the same order as `label`.
data = [
    insurance.loc[
        (insurance['smoker'] == smokes)
        & (insurance['bmiMap'] == bmiBand)
        & (insurance['chargesMap'] == 2),
        'smoker',
    ].count()
    for smokes in ('yes', 'no')
    for bmiBand in (2, 1, 0)
]

# Keep the individual counts available under their own names as well.
yesHigh, yesMed, yesLow, noHigh, noMed, noLow = data

# Draw the pie.
fig = plt.figure(figsize=(10, 7))
plt.pie(data, labels=label)

plt.show()


In [16]:
# Pie chart: smoker status combined with BMI band, for rows in the MEDIUM
# charges band (chargesMap == 1).
# Slice names read "<smoker><BMI band>": e.g. 'yesHigh' = smoker with BMI band 2.
label = ['yesHigh', 'yesMed', 'yesLow', 'noHigh',
         'noMed', 'noLow']

# One count per (smoker, BMI band) pair, generated in the same order as `label`.
data = [
    insurance.loc[
        (insurance['smoker'] == smokes)
        & (insurance['bmiMap'] == bmiBand)
        & (insurance['chargesMap'] == 1),
        'smoker',
    ].count()
    for smokes in ('yes', 'no')
    for bmiBand in (2, 1, 0)
]

# Keep the individual counts available under their own names as well.
yesHigh, yesMed, yesLow, noHigh, noMed, noLow = data

# Draw the pie.
fig = plt.figure(figsize=(10, 7))
plt.pie(data, labels=label)

plt.show()

In [17]:
# Pie chart: smoker status combined with BMI band, for rows in the LOW
# charges band (chargesMap == 0).
# Slice names read "<smoker><BMI band>": e.g. 'yesHigh' = smoker with BMI band 2.
label = ['yesHigh', 'yesMed', 'yesLow', 'noHigh',
         'noMed', 'noLow']

# One count per (smoker, BMI band) pair, generated in the same order as `label`.
data = [
    insurance.loc[
        (insurance['smoker'] == smokes)
        & (insurance['bmiMap'] == bmiBand)
        & (insurance['chargesMap'] == 0),
        'smoker',
    ].count()
    for smokes in ('yes', 'no')
    for bmiBand in (2, 1, 0)
]

# Keep the individual counts available under their own names as well.
yesHigh, yesMed, yesLow, noHigh, noMed, noLow = data

# Draw the pie.
fig = plt.figure(figsize=(10, 7))
plt.pie(data, labels=label)

plt.show()

In [18]:
# checking to see how much of the dataset has high bmi
bmiHigh = insurance['smoker'][(insurance['bmiMap'] == 2)].count()
bmiHigh

In [19]:
# Check whether region has any visible effect on charges.
regions = insurance['region']
regions.unique()  # not the cell's last expression, so nothing is displayed

# One colour per region.
regionColours = dict(southwest='blue', southeast='green', northwest='red', northeast='yellow')

# Age vs. charges, coloured by region.
plt.figure(figsize=(10, 10))
plt.scatter(age, charges, c=regions.map(regionColours))
plt.show()



In [20]:
# Check whether the number of children a person has affects charges.
children = insurance['children']

# One DISTINCT colour per child count (0-5). Two counts sharing a colour
# would be indistinguishable in the scatter.
childrenColours = {0: 'blue', 1: 'green', 2: 'red', 3: 'yellow', 4: 'orange', 5: 'purple'}

# Age vs. charges, coloured by number of children.
plt.figure(figsize=(10, 10))

plt.scatter(age, charges, c=children.map(childrenColours))
plt.xlabel('age')
plt.ylabel('charges')
plt.show()





In [21]:
#checks to see if there are any null values
insurance.info()
insurance.isnull().values.any()


In [22]:
#Correlation Analysis

#First on bmi and charges
#Covariance
from numpy import cov
# calculate covariance matrix
bmi = insurance['bmi']
charges = insurance['charges']

covariance = cov(bmi, charges)
print(covariance)

In [23]:
from scipy.stats import pearsonr
# calculate Pearson's correlation
corr, _ = pearsonr(charges, bmi)
print('Pearsons correlation: %.3f' % corr)

In [24]:
from scipy.stats import spearmanr
# calculate spearman's correlation
corr, _ = spearmanr(charges, bmi)
print('Spearmans correlation: %.3f' % corr)

In [25]:
# Now correlation on age and charges

age = insurance['age']
covariance = cov(age, charges)
print(covariance)


In [26]:
from scipy.stats import pearsonr
# calculate Pearson's correlation
corr, _ = pearsonr(age, charges)
print('Pearsons correlation: %.3f' % corr)

In [27]:
from scipy.stats import spearmanr
# calculate spearman's correlation
corr, _ = spearmanr(age, charges)
print('Spearmans correlation: %.3f' % corr)

In [28]:
#Correlation on smoking 
smoker = insurance.smoker.map(dict(yes=1, no=0))

covariance = cov(smoker, charges)
print(covariance)

In [29]:
from scipy.stats import pearsonr
# calculate Pearson's correlation
corr, _ = pearsonr(charges, smoker)
print('Pearsons correlation: %.3f' % corr)

In [30]:
from scipy.stats import spearmanr
# calculate spearman's correlation
corr, _ = spearmanr(charges, smoker)
print('Spearmans correlation: %.3f' % corr)

In [31]:
#Correlation on How many children someone has
children = insurance['children']

covariance = cov(children, charges)
print(covariance)

In [32]:
from scipy.stats import pearsonr
# calculate Pearson's correlation
corr, _ = pearsonr(children, charges)
print('Pearsons correlation: %.3f' % corr)

In [33]:
from scipy.stats import spearmanr
# calculate spearman's correlation
corr, _ = spearmanr(children, charges)
print('Spearmans correlation: %.3f' % corr)

In [34]:
#Correlation for gender and charges
sex = insurance.sex.map(dict(female=1, male=0))

covariance = cov(sex, charges)
print(covariance)

In [35]:
from scipy.stats import pearsonr
# calculate Pearson's correlation
corr, _ = pearsonr(sex, charges)
print('Pearsons correlation: %.3f' % corr)

In [36]:
from scipy.stats import spearmanr
# calculate spearman's correlation
corr, _ = spearmanr(sex, charges)
print('Spearmans correlation: %.3f' % corr)

In [37]:
# Feature selection and model evaluation.
#
# Build the modelling frame from `insurance`:
#   * drop the derived BMI band and the raw charges column,
#   * rename the banded charges column to "target".
# Columns are dropped by keyword (the positional `axis` argument is not
# accepted by newer pandas) and the rename is chained instead of using
# `inplace`, so `insurance` itself is left untouched.
insurance1 = (
    insurance
    .drop(columns=["bmiMap", "charges"])
    .rename(columns={"chargesMap": "target"})
)
insurance1.head()


In [38]:
"""
Scaling and transfer
"""
from sklearn. impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.pipeline import make_pipeline
# Numeric columns: median imputation (with a missing-value indicator), then
# standard scaling. Each transformer is created exactly once.
imp_median = SimpleImputer(strategy='median', add_indicator=True)
scaler = StandardScaler()

# Categorical columns: constant-value imputation, then one-hot encoding.
# Categories unseen during fit are ignored at transform time.
imp_constant = SimpleImputer(strategy='constant')
ohe = OneHotEncoder(handle_unknown='ignore')

# Select columns by dtype: numeric vs. everything else.
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')

# Combine both branches into a single column transformer.
preprocessor = make_column_transformer(
    (make_pipeline(imp_median, scaler), num_cols),
    (make_pipeline(imp_constant, ohe), cat_cols)
)

In [39]:
# Separates a dataset into its feature columns and its target column.
def getXy(dataset, targetname):
    """Split ``dataset`` into features and target.

    Args:
        dataset: DataFrame holding both the feature columns and the target.
        targetname: Label of the target column.

    Returns:
        Tuple ``(X, y)`` where ``X`` is ``dataset`` without the target column
        and ``y`` is the target column cast to ``int``. ``dataset`` itself is
        not modified.
    """
    y = dataset[targetname].astype(int)
    X = dataset.drop(columns=targetname)
    return X, y

In [40]:
# Define the feature-selection pipelines.
def feature_selection_pipeline_model(X, y, mlmodel):
    """Build one preprocessing + RFE pipeline per candidate feature count.

    Args:
        X: Feature frame; only its number of columns is used here.
        y: Target values (accepted but not used when building the pipelines).
        mlmodel: Estimator handed to RFE.

    Returns:
        Dict keyed by the feature count as a string ("2", "3", ...,
        str(len(X.columns))). Each value is a pipeline of the notebook-level
        `preprocessor` followed by an RFE selecting that many features.
    """
    feature_counts = range(2, len(X.columns) + 1)
    return {
        str(k): make_pipeline(preprocessor, RFE(estimator=mlmodel, n_features_to_select=k))
        for k in feature_counts
    }

In [41]:
# Evaluate a given model using cross-validation.
def evaluate_model(model, X, y):
    """Return the accuracy scores of ``model`` under repeated stratified k-fold.

    Uses 10 splits repeated 3 times with a fixed ``random_state`` of 1, runs
    on all available cores (``n_jobs=-1``), and re-raises any error that
    occurs during fitting (``error_score='raise'``).
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
        n_jobs=-1,
        error_score='raise',
    )

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

from numpy import mean
from numpy import std

# RFE sweep with logistic regression as the wrapped estimator.
MLmodel = LogisticRegression()
X, y = getXy(insurance1, 'target')
models = feature_selection_pipeline_model(X, y, MLmodel)

# Cross-validate each pipeline, keep its scores, and print
# mean / max / (standard deviation) per feature count.
results = []
names = []
for name in models:
    model = models[name]
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    print('>%s mean: %.3f, max: %.3f (%.3f)' % (name, mean(scores), max(scores), std(scores)))

In [43]:
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [44]:
# RFE sweep with a perceptron as the wrapped estimator.
MLmodel = Perceptron()
X, y = getXy(insurance1, 'target')
models = feature_selection_pipeline_model(X, y, MLmodel)

# Cross-validate each pipeline, keep its scores, and print
# mean / max / (standard deviation) per feature count.
results = []
names = []
for name in models:
    model = models[name]
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    print('>%s mean: %.3f, max: %.3f (%.3f)' % (name, mean(scores), max(scores), std(scores)))

In [45]:
# RFE sweep with a decision tree as the wrapped estimator.
# The tree is seeded so the reported scores are the same on every run.
MLmodel = DecisionTreeClassifier(random_state=0)
X, y = getXy(insurance1, 'target')
models = feature_selection_pipeline_model(X, y, MLmodel)

# Cross-validate each pipeline, keep its scores, and print
# mean / max / (standard deviation) per feature count.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s mean: %.3f, max: %.3f (%.3f)' % (name, mean(scores), max(scores), std(scores)))

In [46]:
# RFE sweep with a random forest as the wrapped estimator.
# The forest is seeded so the reported scores are the same on every run.
MLmodel = RandomForestClassifier(random_state=0)
X, y = getXy(insurance1, 'target')
models = feature_selection_pipeline_model(X, y, MLmodel)

# Cross-validate each pipeline, keep its scores, and print
# mean / max / (standard deviation) per feature count.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s mean: %.3f, max: %.3f (%.3f)' % (name, mean(scores), max(scores), std(scores)))

In [47]:
# RFE sweep with gradient boosting as the wrapped estimator.
# The booster is seeded so the reported scores are the same on every run.
MLmodel = GradientBoostingClassifier(random_state=0)
X, y = getXy(insurance1, 'target')
models = feature_selection_pipeline_model(X, y, MLmodel)

# Cross-validate each pipeline, keep its scores, and print
# mean / max / (standard deviation) per feature count.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s mean: %.3f, max: %.3f (%.3f)' % (name, mean(scores), max(scores), std(scores)))

In [48]:
insurance1.head()

In [49]:
# One-hot encode `sex` and attach the indicator columns to the modelling
# frame. `get_dummies` already returns a DataFrame; the join is on the index.
sexOHE = pd.get_dummies(insurance1['sex'])
df = insurance1.join(sexOHE)

df.isnull().values.any()  # null check; not the last expression, so not displayed
df


In [50]:
# One-hot encode `smoker`, attach the indicator columns, and give them
# descriptive names. The rename is chained rather than done with `inplace`,
# which expects a real boolean (the string "true" only worked by being truthy).
smokerOHE = pd.get_dummies(df['smoker'])
df = (
    df
    .join(smokerOHE)
    .rename(columns={"no": "non-smoker", "yes": "is-smoker"})
)
df

In [51]:
# One-hot encode `region` and attach the indicator columns.
# The dummies get their own name so they do not overwrite the smoker dummies
# held in `smokerOHE`.
regionOHE = pd.get_dummies(df['region'])
df = df.join(regionOHE)
df


In [52]:
df

In [53]:
# Remove the original text columns, leaving their one-hot indicator columns.
# Reassigning instead of dropping in place, and ignoring labels that are
# already gone, keeps this cell safe to re-run.
df = df.drop(columns=['sex', 'smoker', 'region'], errors='ignore')
df.head()

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
df.head()

x = df.drop('target',axis=1)
y = df['target']

In [56]:
df.head()

In [57]:
# 70/30 hold-out split. The seed is fixed so the tree and forest metrics
# reported below are reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

In [58]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split; seeded so the fitted tree is
# the same on every run.
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(x_train,y_train)

In [59]:
#Predictions
from sklearn.metrics import classification_report,confusion_matrix

predictions = dtree.predict(x_test)

print(classification_report(y_test,predictions))

In [60]:
print(confusion_matrix(y_test,predictions))

In [61]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
# Fit a 100-tree random forest on the training split; seeded so the fitted
# forest is the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(x_train, y_train)

In [62]:
rfc_pred = rfc.predict(x_test)

In [63]:
print(confusion_matrix(y_test,rfc_pred))

In [64]:
print(classification_report(y_test,rfc_pred))

In [65]:
#Logistical Regression
df.columns

In [67]:
x = df[['age', 'bmi', 'children', 'female', 'male', 'non-smoker',
       'is-smoker', 'northeast', 'northwest', 'southeast', 'southwest']]
y = df['target']

In [130]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [131]:
from sklearn.linear_model import LogisticRegression

In [132]:
log = LogisticRegression()

In [133]:
log.fit(x_train,y_train)

In [134]:
# print the intercept
print(log.intercept_)

In [135]:
y_pred = log.predict(x_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log.score(x_test, y_test)))

In [136]:
from sklearn.metrics import confusion_matrix

# Store the matrix under its own name. Assigning it to `confusion_matrix`
# would replace the imported sklearn function with an array, so any later
# call to `confusion_matrix(...)` would fail until it was re-imported.
log_confusion = confusion_matrix(y_test, y_pred)
print(log_confusion)

In [137]:
print(classification_report(y_test, y_pred))

In [151]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

In [153]:
# Split from the full feature/target set. Splitting `x_train`/`y_train`
# again would discard the real hold-out set, evaluate on a slice of the
# training data, and shrink both sets further every time the cell is re-run.
x_train, x_test, y_train, y_test = train_test_split(x, y,
    test_size=0.3, random_state=12)

In [173]:
grad = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
grad.fit(x_train, y_train)
preds = grad.predict(x_test)

In [174]:
from sklearn.metrics import confusion_matrix
print("Confusion Matrix:")
# Score the gradient-boosting predictions (`preds`). `predictions` holds the
# decision-tree output from an earlier cell, made against a different test
# split. The result is kept under its own name so the imported
# `confusion_matrix` function is not shadowed.
grad_confusion = confusion_matrix(y_test, preds)
print(grad_confusion)




In [175]:
print("Classification Report")
# Report on the gradient-boosting predictions (`preds`), not the decision-tree
# `predictions` left over from an earlier cell.
print(classification_report(y_test, preds))