# U.S. Medical Insurance Costs

## Importing Data / Libraries

In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the insurance dataset; the relative path is resolved against the
# current working directory.
data = pd.read_csv("insurance.csv")

# Preview the first rows to check the column layout.
print(data.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


## Variables

In [2]:
# Bind every column of `data` to its own notebook-level name so later cells
# can refer to a column directly.
(age, sex, bmi, children, smoker, region, charges) = (
    data[column]
    for column in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges')
)

## Exploratory Data Analysis

### Average Age

In [3]:
# Calculating the overall average age
def average_age(ages):
    """Return a formatted sentence with the arithmetic mean of ``ages``.

    Args:
        ages: A sized iterable of numeric ages (for example a list or a
            pandas Series).

    Returns:
        The string ``"Average Patient Age: <mean> years"``, where ``<mean>``
        is the mean rounded to two decimal places.
    """
    # Read the argument, not the notebook-level `age` variable, so the result
    # depends only on what the caller passes in.
    patient_count = len(ages)
    age_sum = sum(ages)
    return ("Average Patient Age: " + str(round(age_sum/patient_count, 2)) + " years")

# Apply the helper to the `age` column and print the formatted sentence.
avg_age = average_age(age)
print(avg_age)

Average Patient Age: 39.21 years


### Average Age For A Person With At Least 1 Child

In [4]:
# Keep only the rows for patients who have at least one child.
one_child = data[data['children'] >= 1]
print(one_child.head())

# Ages of those patients; this is the Series passed to avg_age_one_child.
one_child_age = one_child['age']

# Sum of the selected ages. Nothing in this cell reads it afterwards:
# avg_age_one_child computes its own local sum.
counter = one_child_age.sum()
def avg_age_one_child(data):
    """Format the arithmetic mean of ``data`` as a patient-age sentence.

    Args:
        data: An object that supports ``.sum()`` and ``len()``; the notebook
            passes a pandas Series of ages.

    Returns:
        The string ``"Average Patient Age with one child: <mean> years"``,
        where ``<mean>`` is the mean rounded to two decimal places.
    """
    mean_age = data.sum() / len(data)
    rounded_mean = str(round(mean_age, 2))
    return "Average Patient Age with one child: " + rounded_mean + " years"

# Build the summary sentence for patients with at least one child. The bare
# name on the last line lets the notebook show it as the cell's output.
average_age_one_child = avg_age_one_child(one_child_age)
average_age_one_child

   age     sex    bmi  children smoker     region    charges
1   18    male  33.77         1     no  southeast  1725.5523
2   28    male  33.00         3     no  southeast  4449.4620
6   46  female  33.44         1     no  southeast  8240.5896
7   37  female  27.74         3     no  northwest  7281.5056
8   37    male  29.83         2     no  northeast  6406.4107


'Average Patient Age with one child: 39.78 years'

### Region Count

In [5]:
# See how many regions are in this data 
def unique_regions(regions):
    """Return the distinct values of ``regions`` in first-seen order.

    Args:
        regions: An iterable of hashable values (for example a list of region
            names or a pandas Series of them).

    Returns:
        A list containing each distinct value once, ordered by first
        occurrence in ``regions``.
    """
    # dict keys are unique and keep insertion order, so this removes
    # duplicates in a single pass without a local that shadows the function name.
    return list(dict.fromkeys(regions))

# Collect the distinct values of the `region` column and print them.
# Despite its name, `region_counter` holds the list of region names, not a count.
region_counter = unique_regions(region)
print(region_counter)

['southwest', 'southeast', 'northwest', 'northeast']


In [6]:
# Count the rows per region. The default ordering (largest count first) puts
# the region with the most data on the first line of the result.
data['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

### Cost Difference: Smoker vs. Non-Smoker

In [7]:
# Rows where the `smoker` column is 'yes'.
# NOTE: this rebinds `smoker`, which the "Variables" cell assigned to the
# data['smoker'] column; from here on the name refers to the filtered DataFrame.
smoker = data[data['smoker'] == 'yes']
print(smoker)

# Rows where the `smoker` column is 'no'.
non_smoker = data[data['smoker'] == 'no']
print(non_smoker)

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
11     62  female  26.290         0    yes  southeast  27808.72510
14     27    male  42.130         0    yes  southeast  39611.75770
19     30    male  35.300         0    yes  southwest  36837.46700
23     34  female  31.920         1    yes  northeast  37701.87680
...   ...     ...     ...       ...    ...        ...          ...
1313   19  female  34.700         2    yes  southwest  36397.57600
1314   30  female  23.655         3    yes  northwest  18765.87545
1321   62    male  26.695         0    yes  northeast  28101.33305
1323   42  female  40.370         2    yes  southeast  43896.37630
1337   61  female  29.070         0    yes  northwest  29141.36030

[274 rows x 7 columns]
      age     sex     bmi  children smoker     region      charges
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3   

In [8]:
# Charges of the smoker rows selected in the previous cell.
smoker_cost = smoker['charges']

def avg_smoker_cost(data):
    """Return the arithmetic mean of ``data`` rounded to two decimals.

    Args:
        data: An object that supports ``.sum()`` and ``len()``; the notebook
            passes a pandas Series of charges.

    Returns:
        ``data.sum() / len(data)`` rounded to two decimal places.
    """
    return round(data.sum() / len(data), 2)

# Mean charge across the smoker rows, rounded to two decimals, then reported.
average_smoker_cost = avg_smoker_cost(smoker_cost)
print("Average Smoker Cost: " + str(average_smoker_cost) + " dollars")

Average Smoker Cost: 32050.23 dollars


In [9]:
# Charges of the non-smoker rows selected earlier.
non_smoker_cost = non_smoker['charges']

def avg_nosmoke_cost(data):
    """Compute the mean of ``data`` and round it to two decimal places.

    Args:
        data: An object that supports ``.sum()`` and ``len()``; the notebook
            passes a pandas Series of charges.

    Returns:
        The sum of ``data`` divided by its length, rounded to two decimals.
    """
    mean_charge = data.sum() / len(data)
    return round(mean_charge, 2)

# Mean charge across the non-smoker rows, rounded to two decimals, then reported.
average_nosmoke_cost = avg_nosmoke_cost(non_smoker_cost)
print("Average No-Smoker Cost: " + str(average_nosmoke_cost) + " dollars")

Average No-Smoker Cost: 8434.27 dollars


In [10]:
# Derive the gap from the two averages computed in the cells above instead of
# re-typing their printed values, so it stays correct if the data changes.
# Rounding keeps the result at two decimals despite floating-point subtraction.
difference = round(average_smoker_cost - average_nosmoke_cost, 2)
print("An average smoker pays: " + str(difference) + " dollars more in insurance costs.")

An average smoker pays: 23615.96 dollars more in insurance costs.


## Summary

The average person in our data is about 39.21 years old.

The average person with at least one child is 39.78 years old. That is only 0.57 years older than the overall average of 39.21 years (this notebook does not compute a separate average for people without children). Personally, I expected the average age of people with children to be higher.

In our data we have 4 regions: Northwest, Northeast, Southwest, Southeast

The Southeast region is the largest group in our data, with 364 people.

In our data we have 274 smokers and 1,064 non-smokers. An average smoker pays about 32,050.23 dollars for their insurance, while a non-smoker pays an average of 8,434.27 dollars. That's a difference of 23,615.96 dollars!