In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
init_notebook_mode(connected=True)
import plotly.graph_objects as go

In [2]:
# Read the insurance dataset from the project-relative CSV file and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer='data/insurance.csv')
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Summary statistics of the numeric columns, transposed so that each column
# of the dataset becomes one row, rounded to two decimals.
df.describe().transpose().round(decimals=2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.21,14.05,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.66,6.1,15.96,26.3,30.4,34.69,53.13
children,1338.0,1.09,1.21,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.42,12110.01,1121.87,4740.29,9382.03,16639.91,63770.43


In [5]:
# Count missing values per column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Only the last expression of a cell is rendered, so the duplicate count has
# to be printed explicitly or it is silently discarded.
print(f"Duplicated rows: {df.duplicated().sum()}")
# keep=False marks every member of a duplicate group, so all copies are shown.
df[df.duplicated(keep=False)]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
195,19,male,30.59,0,no,northwest,1639.5631
581,19,male,30.59,0,no,northwest,1639.5631


In [7]:
# Keep the first occurrence of every duplicated row, then confirm that no
# duplicates remain.
df = df.drop_duplicates(keep='first')
df.duplicated().sum()

0

In [8]:
# plt.style.use('seaborn-v0_8-darkgrid')
# plt.rcParams['figure.figsize'] = [17,6]

In [9]:
# table showing that there are no missing values

In [10]:
# Pairwise correlation of the numeric columns only. sex, smoker and region hold
# strings, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = df.select_dtypes(include="number").corr()

In [11]:
# Heatmap of the correlation matrix.
# Tick labels are derived from the matrix itself: x follows corr.columns and
# y follows corr.index, so rows and columns cannot be labelled in the wrong
# order (a hand-written reversed y list mislabels every off-diagonal cell).
pretty_names = {"age": "Age", "bmi": "BMI", "children": "Children", "charges": "Charges"}
fig = px.imshow(corr,
                labels=dict(x="Correlation Matrix", color="Correlation"),
                x=[pretty_names.get(col, col) for col in corr.columns],
                y=[pretty_names.get(row, row) for row in corr.index],
                color_continuous_scale='RdBu_r'
               )
# Put the column labels above the heatmap.
fig.update_xaxes(side="top")
fig.show()

In [26]:
# Histogram of the charges column.
fig = px.histogram(
    df,
    x='charges',
    title="Distribution of charges",
    labels={"charges": "Charges", "count": "Count"},
)
fig.show()

In [28]:
# Histogram of charges coloured by sex, with a marginal box plot per group.
# The bars are grouped by `sex`, so the title names that grouping.
fig = px.histogram(
    df,
    x="charges",
    color="sex",
    marginal="box",
    title="Distribution of charges by sex",
    labels={"charges": "Charges", "count": "Count", "sex": "Sex"},
)
fig.show()

In [30]:
# Number of rows per sex.
px.histogram(
    df,
    x="sex",
    title="Distribution of sex",
    labels={"count": "Count", "sex": "Sex"},
)

In [32]:
# Number of rows per smoker status.
px.histogram(
    df,
    x="smoker",
    title="Distribution of smoker",
    labels={"count": "Count", "smoker": "Smoker"},
)

In [34]:
# Number of rows for each value of `children`.
children_cat = df.groupby(by=["children"]).size().reset_index(name="counts")
px.bar(
    data_frame=children_cat,
    x="children",
    y="counts",
    title="Distribution of children",
    # Label keys must match column names: the y column is "counts", so a
    # "count" key would never be applied to the axis.
    labels={"counts": "Count", "children": "How many children"},
)

In [35]:
# Number of rows for each region.
region_cat = df.groupby(by=["region"]).size().reset_index(name="counts")
px.bar(
    data_frame=region_cat,
    x="region",
    y="counts",
    title="Distribution of region",
    # Label keys must match column names: the y column is "counts", so a
    # "count" key would never be applied to the axis.
    labels={"counts": "Count", "region": "Region"},
)

In [37]:
# Histogram of charges coloured by number of children, with a marginal box
# plot per group.
fig = px.histogram(
    df,
    x="charges",
    color="children",
    marginal="box",
    title="Distribution of charges vs children",
    labels={"count": "Count", "charges": "Charges", 'children': 'Children'},
)
fig.show()

In [38]:
# Histogram of charges coloured by smoker status, with a marginal box plot
# per group.
fig = px.histogram(
    df,
    x="charges",
    color='smoker',
    marginal="box",
    title="Distribution of charges vs smoker",
    labels={"count": "Count", "charges": "Charges", 'smoker': 'Smoker'},
)
fig.show()

In [39]:
# Histogram of charges coloured by region, with a marginal box plot per group.
fig = px.histogram(
    df,
    x="charges",
    color='region',
    marginal="box",
    title="Distribution of charges vs region",
    labels={"count": "Count", "charges": "Charges", 'region': 'Region'},
)
fig.show()

In [43]:
# Charges against BMI, one facet per smoker status, coloured by sex, with an
# OLS trendline per group.
fig = px.scatter(
    df,
    x='bmi',
    y='charges',
    facet_col="smoker",
    color="sex",
    trendline="ols",
    title="Scatter plot of smoker, BMI and sex",
    labels={"bmi": "BMI", "charges": "Charges", 'sex': 'Sex'},
)
fig.show()

In [44]:
# Charges against age, one facet per smoker status, coloured by sex, with an
# OLS trendline per group.
fig = px.scatter(
    df,
    x='age',
    y='charges',
    facet_col="smoker",
    color="sex",
    trendline="ols",
    title="Scatter plot of smoker, age and sex",
    labels={"age": "Age", "charges": "Charges", 'sex': 'Sex'},
)
fig.show()

In [45]:
# BMI against age, one facet per smoker status, coloured by sex, with an OLS
# trendline per group.
fig = px.scatter(
    df,
    x='age',
    y='bmi',
    facet_col="smoker",
    color="sex",
    trendline="ols",
    title="Scatter plot of smoker, BMI and age",
    labels={"bmi": "BMI", "age": "Age", 'sex': 'Sex'},
)
fig.show()

In [46]:
# Row counts per smoker status, with side-by-side bars for each sex.
fig = px.histogram(
    df,
    x="smoker",
    color='sex',
    barmode='group',
    title='Distribution of sex and smoker',
    labels={"smoker": "Smoker", "count": "Count", 'sex': 'Sex'},
)
fig.show()

In [47]:
# Violin plot (with inner box) of charges per sex, coloured by smoker status.
fig = px.violin(
    df,
    x='sex',
    y='charges',
    box=True,
    color='smoker',
    title='Violin plot of charges vs sex',
    labels={"smoker": "Smoker", "charges": "Charges", 'sex': 'Sex'},
)
fig.show()

In [48]:
# Violin plot (with inner box) of charges per region, coloured by sex.
fig = px.violin(
    df,
    x='region',
    y='charges',
    box=True,
    color='sex',
    title='Violin plot of charges vs sex and region',
    labels={"sex": "Sex", "charges": "Charges", 'region': 'Region'},
)
fig.show()

In [49]:
# Horizontal box plot of charges per smoker status, coloured by sex.
fig = px.box(
    df,
    x="charges",
    y="smoker",
    color='sex',
    title='Box plot of charges vs sex and smoker',
    labels={"sex": "Sex", "charges": "Charges", 'smoker': 'Smoker'},
)
fig.show()

In [53]:
# Distribution plot of age built with the figure factory.
hist_data = [df['age']]
# The group label becomes the legend entry, so name the plotted variable
# instead of using a generic placeholder.
group_labels = ['Age']

fig = ff.create_distplot(hist_data, group_labels)
# Give the figure a title so it can be read on its own.
fig.update_layout(title_text='Distribution of age')
fig.show()

In [44]:
# Distribution plot of BMI built with the figure factory.
hist_data = [df['bmi']]
# The group label becomes the legend entry, so name the plotted variable
# instead of using a generic placeholder.
group_labels = ['BMI']

fig = ff.create_distplot(hist_data, group_labels)
# Give the figure a title so it can be read on its own.
fig.update_layout(title_text='Distribution of BMI')
fig.show()

In [54]:
# Strip plot of charges per smoker status; points of both sexes are overlaid.
fig = px.strip(
    df,
    x='smoker',
    y='charges',
    color='sex',
    stripmode='overlay',
    title='Relationship between Smokers and Charges',
    labels={"sex": "Sex", "charges": "Charges", 'smoker': 'Smoker'},
)
fig.show()

In [55]:
# Strip plot of charges per sex; smokers and non-smokers are overlaid.
fig = px.strip(
    df,
    x='sex',
    y='charges',
    color='smoker',
    stripmode='overlay',
    title='Relationship between Sex and Charges',
    labels={"sex": "Sex", "charges": "Charges", 'smoker': 'Smoker'},
)
fig.show()

In [56]:
# Strip plot of charges per region; smokers and non-smokers are overlaid.
fig = px.strip(
    df,
    x='region',
    y='charges',
    color='smoker',
    stripmode='overlay',
    title='Relationship between Region and Charges',
    labels={"smoker": "Smoker", "charges": "Charges", 'region': 'Region'},
)
fig.show()