# U.S. Medical Insurance Costs

# Look over the dataset

In [113]:
import pandas as pd

# Load the insurance dataset from a path relative to the notebook so the
# analysis can be re-run on any machine (no user-specific absolute path).
# NOTE(review): assumes insurance.csv sits in the kernel's working directory
# (normally the notebook's own folder) — adjust if it lives elsewhere.
file_path = 'insurance.csv'

# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper is
# needed. `data` is kept as the untouched raw load; `df` is an independent
# copy that the rest of the notebook is free to extend with derived columns.
data = pd.read_csv(file_path)
df = data.copy()

In [114]:
# Count missing values per column: isna() flags each missing cell, sum() totals the flags column by column
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [115]:
# Print a structural summary of the frame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


___________________________________________________________

# Scoping the project

In [116]:
# General: overall mean age and mean BMI across the whole dataset
for label, column in (("Moyenne d'age :", 'age'), ("Moyenne BMI :", 'bmi')):
    print(label, df[column].mean())

Moyenne d'age : 39.20702541106129
Moyenne BMI : 30.66339686098655


In [117]:
# Average of each numeric column, grouped by sex.
# The columns are selected explicitly before aggregating: the first positional
# parameter of GroupBy.mean() is `numeric_only`, not a column list, so passing
# a list there only worked because a non-empty list is truthy. Selecting the
# columns also keeps this table stable if derived numeric columns are later
# added to `df`.
df.groupby('sex')[['age', 'bmi', 'children', 'charges']].mean()

Unnamed: 0_level_0,age,bmi,children,charges
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,39.503021,30.377749,1.074018,12569.578844
male,38.91716,30.943129,1.115385,13956.751178


In [118]:
# Average of each numeric column, grouped by region.
# The columns are selected explicitly before aggregating: the first positional
# parameter of GroupBy.mean() is `numeric_only`, not a column list, so passing
# a list there only worked because a non-empty list is truthy. Selecting the
# columns also keeps this table stable if derived numeric columns are later
# added to `df`.
df.groupby('region')[['age', 'bmi', 'children', 'charges']].mean()

Unnamed: 0_level_0,age,bmi,children,charges
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
northeast,39.268519,29.173503,1.046296,13406.384516
northwest,39.196923,29.199785,1.147692,12417.575374
southeast,38.93956,33.355989,1.049451,14735.411438
southwest,39.455385,30.596615,1.141538,12346.937377


In [119]:
# Average of each numeric column, grouped by smoking habit (smoker yes/no).
# The columns are selected explicitly before aggregating: the first positional
# parameter of GroupBy.mean() is `numeric_only`, not a column list, so passing
# a list there only worked because a non-empty list is truthy. Selecting the
# columns also keeps this table stable if derived numeric columns are later
# added to `df`.
df.groupby('smoker')[['age', 'bmi', 'children', 'charges']].mean()

Unnamed: 0_level_0,age,bmi,children,charges
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,39.385338,30.651795,1.090226,8434.268298
yes,38.514599,30.708449,1.113139,32050.231832


In [120]:
# Average of each numeric column for every (smoking habit, region) pair.
# The columns are selected explicitly before aggregating: the first positional
# parameter of GroupBy.mean() is `numeric_only`, not a column list, so passing
# a list there only worked because a non-empty list is truthy. Selecting the
# columns also keeps this table stable if derived numeric columns are later
# added to `df`.
df.groupby(['smoker', 'region'])[['age', 'bmi', 'children', 'charges']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,bmi,children,charges
smoker,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,northeast,39.536965,29.332082,1.046693,9165.531672
no,northwest,39.168539,29.212678,1.11985,8556.463715
no,southeast,38.673993,33.442418,1.065934,8032.216309
no,southwest,40.183521,30.507865,1.127341,8019.284513
yes,northeast,38.238806,28.565224,1.044776,29673.536473
yes,northwest,39.327586,29.140431,1.275862,30192.003182
yes,southeast,39.736264,33.096703,1.0,34844.996824
yes,southwest,36.103448,31.005172,1.206897,32269.063494


In [121]:
# How many individuals fall in each region (most frequent region first)
df.value_counts(subset=['region'])

region   
southeast    364
northwest    325
southwest    325
northeast    324
Name: count, dtype: int64

In [122]:
# Mean age of the individuals who have at least one child
has_children = df['children'].gt(0)
child = df.loc[has_children]

print("Moyenne d'âge des individus avec au moins 1 enfant :", child['age'].mean())

Moyenne d'âge des individus avec au moins 1 enfant : 39.78010471204188


___________________________________________________________

# Analysis

In [146]:
import plotly.express as px
import plotly.graph_objects as go

In [161]:
# General comparison by sex: mean of each numeric column, with `sex` as a regular column.
# The columns are selected explicitly before aggregating: the first positional
# parameter of GroupBy.mean() is `numeric_only`, not a column list, so passing
# a list there only worked because a non-empty list is truthy. Selecting the
# columns also keeps `genre` identical when this cell is re-run after derived
# numeric columns have been added to `df`.
genre = df.groupby('sex')[['age', 'bmi', 'children', 'charges']].mean().reset_index()
genre

Unnamed: 0,sex,age,bmi,children,charges
0,female,39.503021,30.377749,1.074018,12569.578844
1,male,38.91716,30.943129,1.115385,13956.751178


# BMI

In [162]:
# Mean BMI for each sex, as a flat frame with columns `sex` and `bmi`
bmi_sex = df.groupby('sex', as_index=False)['bmi'].mean()
bmi_sex

Unnamed: 0,sex,bmi
0,female,30.377749
1,male,30.943129


In [211]:
# Mean BMI at every age, drawn as a line with one marker per age
bmi_age = (
    df.groupby('age')['bmi']
    .mean()
    .reset_index()
)

fig_bmi_age = px.line(
    bmi_age,
    x='age',
    y='bmi',
    markers=True,
    title='Average BMI by age',
)
fig_bmi_age

In [225]:
# Encode smoking status as an integer flag: 1 when `smoker` is 'yes', 0 otherwise.
# A vectorised comparison replaces the row-by-row apply/lambda; the explicit
# int64 cast keeps the dtype the original apply produced.
df['smoker_count'] = df['smoker'].eq('yes').astype('int64')

# Keep only the columns used for the BMI / smoking comparison
int_smoker_count = df[['age', 'bmi', 'smoker_count']]
int_smoker_count

Unnamed: 0,age,bmi,smoker_count
0,19,27.900,1
1,18,33.770,0
2,28,33.000,0
3,33,22.705,0
4,32,28.880,0
...,...,...,...
1333,50,30.970,0
1334,18,31.920,0
1335,18,36.850,0
1336,21,25.800,0


In [229]:
# Mean BMI for every (age, smoker flag) pair, flattened back into regular columns
group_keys = ['age', 'smoker_count']
bmi_smoker = (
    int_smoker_count
    .groupby(group_keys)[['bmi']]
    .mean()
    .reset_index()
)
bmi_smoker

Unnamed: 0,age,smoker_count,bmi
0,18,0,31.701228
1,18,1,29.544583
2,19,0,27.653500
3,19,1,31.217500
4,20,0,31.591750
...,...,...,...
89,62,1,28.968750
90,63,0,31.918333
91,63,1,31.942000
92,64,0,34.393333


In [232]:
# Mean BMI by age, one line per smoker flag.
# The title legend now uses the digit 0 (it previously used the letter "O"),
# matching the actual values of `smoker_count`.
fig_bmi_smoker = px.line(
    bmi_smoker,
    x='age',
    y='bmi',
    color='smoker_count',
    markers=True,
    title='BMI by Age and Smoking Habit (0 for non-smokers & 1 for smokers)',
)

fig_bmi_smoker.show()

In [235]:
# Mean BMI for each number of children, drawn as a line with markers
bmi_children = df.groupby('children', as_index=False)['bmi'].mean()

fig_bmi_children = px.line(
    bmi_children,
    x='children',
    y='bmi',
    markers=True,
    title='Average BMI by number of children',
)
fig_bmi_children

In [247]:
# Mean BMI per region as a bar chart, bars ordered from highest to lowest total
bmi_region = df.groupby('region', as_index=False)['bmi'].mean()

fig_bmi_region = px.bar(
    bmi_region,
    x='region',
    y='bmi',
    title='Average BMI by region',
    text_auto='.4s',  # value labels on the bars, using the '.4s' number format
)
fig_bmi_region.update_layout(xaxis=dict(categoryorder='total descending'))
fig_bmi_region.show()

# Charges

In [None]:
#Charges with smoking habits

In [None]:
#Charges by age

In [None]:
#Charges with/without children

In [None]:
#Charges by region

# Region

In [None]:
#Age average by region

In [None]:
#Children by region

In [None]:
#Smoking habits by region