In [1]:
# Importing of important libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Finding location of file
os.getcwd()
os.chdir('/Users/jmunizbecerra/Desktop')

In [5]:
# Loading in insurance data and creating dataframe
df = pd.read_csv('insurance.csv')

In [7]:
# Analyzing size of dataframe
df.shape

(1338, 7)

In [8]:
# Checking for any missing values
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
# Getting a glimpse at data's columns
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [24]:
# Importing of labelencoder to convert categorical data into numerical
# data for analysis
from sklearn.preprocessing import LabelEncoder

In [25]:
# Creation of LabelEncoder objects for categorical data
le_sex = LabelEncoder()
le_smoker = LabelEncoder()
le_region = LabelEncoder()

In [26]:
# Creation of a new dataframe to conserve old dataframe
new_df = df

In [27]:
# Addition of transformed categorical data into dataframe
new_df['sex_n'] = le_sex.fit_transform(new_df['sex'])
new_df['smoker_n'] = le_smoker.fit_transform(new_df['smoker'])
new_df['region_n'] = le_region.fit_transform(new_df['region'])

In [28]:
new_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_n,smoker_n,region_n
0,19,female,27.900,0,yes,southwest,16884.92400,0,1,3
1,18,male,33.770,1,no,southeast,1725.55230,1,0,2
2,28,male,33.000,3,no,southeast,4449.46200,1,0,2
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1
4,32,male,28.880,0,no,northwest,3866.85520,1,0,1
...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,1,0,1
1334,18,female,31.920,0,no,northeast,2205.98080,0,0,0
1335,18,female,36.850,0,no,southeast,1629.83350,0,0,2
1336,21,female,25.800,0,no,southwest,2007.94500,0,0,3


In [None]:
# Smoking: Yes = 1, No = 0
# Sex: Male = 1, Female = 0
# Region: Southwest = 3, Southeast = 2, Northwest = 1, Northeast = 0

In [35]:
# Removal of categorical data columns
df_final = new_df.drop(columns=['sex', 'smoker', 'region'])

Unnamed: 0,age,bmi,children,charges,sex_n,smoker_n,region_n
0,19,27.900,0,16884.92400,0,1,3
1,18,33.770,1,1725.55230,1,0,2
2,28,33.000,3,4449.46200,1,0,2
3,33,22.705,0,21984.47061,1,0,1
4,32,28.880,0,3866.85520,1,0,1
...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1
1334,18,31.920,0,2205.98080,0,0,0
1335,18,36.850,0,1629.83350,0,0,2
1336,21,25.800,0,2007.94500,0,0,3


# Understanding What Our Data Consists Of 

In [10]:
# Breakdown of sex into groups (males, females)
gender = df.groupby('sex')

In [13]:
# Getting averages for each group
gender.mean()

Unnamed: 0_level_0,age,bmi,children,charges
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,39.503021,30.377749,1.074018,12569.578844
male,38.91716,30.943129,1.115385,13956.751178


In [14]:
# Number of males
len(df.loc[df['sex'] == 'male'])

676

In [15]:
# Number of females
len(df.loc[df['sex'] == 'female'])

662

In [17]:
# Grouping data by region
regions = df.groupby('region')

In [19]:
# Obtaining means based off regions
regions.mean()

Unnamed: 0_level_0,age,bmi,children,charges
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
northeast,39.268519,29.173503,1.046296,13406.384516
northwest,39.196923,29.199785,1.147692,12417.575374
southeast,38.93956,33.355989,1.049451,14735.411438
southwest,39.455385,30.596615,1.141538,12346.937377


In [36]:
# Obtaining max values based off regions
regions.max()

Unnamed: 0_level_0,age,sex,bmi,children,smoker,charges
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
northeast,64,male,48.07,5,yes,58571.07448
northwest,64,male,42.94,5,yes,60021.39897
southeast,64,male,53.13,5,yes,63770.42801
southwest,64,male,47.6,5,yes,52590.82939


In [37]:
# Obtaining min values based off regions
regions.min()

Unnamed: 0_level_0,age,sex,bmi,children,smoker,charges
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
northeast,18,female,15.96,0,no,1694.7964
northwest,19,female,17.385,0,no,1621.3402
southeast,18,female,19.8,0,no,1121.8739
southwest,19,female,17.4,0,no,1241.565


In [20]:
# Number of clients from NORTHEAST region
len(df.loc[df['region'] == 'northeast'])

324

In [21]:
# Number of clients from NORTHWEST region
len(df.loc[df['region'] == 'northwest'])

325

In [22]:
# Number of clients from SOUTHEAST region
len(df.loc[df['region'] == 'southeast'])

364

In [23]:
# Number of clients from SOUTHWEST region
len(df.loc[df['region'] == 'southwest'])

325

# Multi-Variate Regression Analysis

In [38]:
x = df_final
y = df.charges

In [92]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [93]:
from sklearn import linear_model

model = linear_model.LinearRegression()
model.fit(x[['age', 'bmi', 'region_n', 'smoker_n', 'children', 'sex_n']], x.charges)

LinearRegression()

In [99]:
model.predict([[35, 27, 3, 0, 1, 1]])



array([5456.36263359])

In [95]:
x_test

Unnamed: 0,age,bmi,children,charges,sex_n,smoker_n,region_n
338,50,32.300,1,41919.09700,1,1,0
620,30,31.400,1,3659.34600,1,0,3
965,35,27.100,1,4746.34400,1,0,3
128,32,17.765,2,32734.18630,0,1,1
329,52,36.700,0,9144.56500,1,0,3
...,...,...,...,...,...,...,...
580,59,25.460,1,12913.99240,1,0,0
786,60,36.955,0,12741.16745,1,0,0
321,26,29.640,4,24671.66334,0,0,0
903,49,36.850,0,8125.78450,1,0,2


In [96]:
model.score(x[['age', 'bmi', 'region_n', 'smoker_n', 'children', 'sex_n']], x.charges)

0.7507372027994939