# U.S. Medical Insurance Costs

In this project, we will be using Python fundamentals to explore a CSV file containing medical insurance costs. The objective is to analyze different attributes in the "insurance.csv" file, extract information about the patients, and discover potential applications for the dataset.

By examining this data, we aim to gain insights and better understand the various aspects of insurance costs and patient characteristics.

In [71]:
import csv
import numpy
import statistics

In [72]:
ages = []
sexes = []
bmis = []
num_children = []
smoker = []
regions = []
costs = []

In [73]:
def populate_list(list, source, column):
    with open(source, newline='') as file:
        csvreader = csv.DictReader(file)
        for row in csvreader:
            list.append(row[column])
        return list

Using the helper function populate_list() we can load up our lists with data.

In [74]:
populate_list(ages, 'insurance.csv', 'age')
populate_list(sexes, 'insurance.csv', 'sex')
populate_list(bmis, 'insurance.csv', 'bmi')
populate_list(num_children, 'insurance.csv', 'children')
populate_list(smoker, 'insurance.csv', 'smoker')
populate_list(regions, 'insurance.csv', 'region')
populate_list(costs, 'insurance.csv', 'charges')

['16884.924',
 '1725.5523',
 '4449.462',
 '21984.47061',
 '3866.8552',
 '3756.6216',
 '8240.5896',
 '7281.5056',
 '6406.4107',
 '28923.13692',
 '2721.3208',
 '27808.7251',
 '1826.843',
 '11090.7178',
 '39611.7577',
 '1837.237',
 '10797.3362',
 '2395.17155',
 '10602.385',
 '36837.467',
 '13228.84695',
 '4149.736',
 '1137.011',
 '37701.8768',
 '6203.90175',
 '14001.1338',
 '14451.83515',
 '12268.63225',
 '2775.19215',
 '38711',
 '35585.576',
 '2198.18985',
 '4687.797',
 '13770.0979',
 '51194.55914',
 '1625.43375',
 '15612.19335',
 '2302.3',
 '39774.2763',
 '48173.361',
 '3046.062',
 '4949.7587',
 '6272.4772',
 '6313.759',
 '6079.6715',
 '20630.28351',
 '3393.35635',
 '3556.9223',
 '12629.8967',
 '38709.176',
 '2211.13075',
 '3579.8287',
 '23568.272',
 '37742.5757',
 '8059.6791',
 '47496.49445',
 '13607.36875',
 '34303.1672',
 '23244.7902',
 '5989.52365',
 '8606.2174',
 '4504.6624',
 '30166.61817',
 '4133.64165',
 '14711.7438',
 '1743.214',
 '14235.072',
 '6389.37785',
 '5920.1041',
 '176

Next we will want to consider if cleaning data and reformatting is necessary. It looks like rounding BMI to one decimal and rounding costs to two decimals makes sense.

In [75]:
# clean bmis list, converting strings to float and rounding nearest decimal
bmis = [round(float(num), 1) for num in bmis]
# clean costs list, converting strings to float and rounding nearest 2 decimals
costs= [round(float(num), 2) for num in costs]
# convert ages strings to ints
ages= [int(age) for age in ages]

Build a class to run initial analyses of data including average age, how many men and women, what region most patients are from, and costs related to smoker status.

In [76]:
class analyse_patient_data:
    def __init__(self, ages, sexes, bmis, num_children, smoker, regions, costs):
        self.ages = ages
        self.sexes = sexes
        self.bmis = bmis
        self.num_children = num_children
        self.smoker = smoker
        self.regions = regions
        self.costs = costs

    def average_age(self):
        total = 0
        for age in self.ages:
            total+=int(age)
        print(f"Average age of patients is {total // len(self.ages)} years")
        
    def total_by_sex(self):
        total_women = 0
        total_men = 0
        for sex in self.sexes:
            if sex == "female":
                total_women+=1
            else:
                total_men+=1
        print(f"Total records for women: {total_women}\nTotal records for men: {total_men}")
    
    def unique_regions(self):
        regions_list=[]
        for region in self.regions:
            if region not in regions_list:
                regions_list.append(region)
        regions_list.sort()
        return regions_list
    
    # Helper functions 
    def find_outliers(self, data, lower_fence, upper_fence):
        outliers = [value for value in data if value < lower_fence or value > upper_fence]
        return outliers

    def calculate_iqr(self, data):
        sorted_data = sorted(data)
        n = len(sorted_data)
        q1 = sorted_data[int(n * 0.25)]
        q3 = sorted_data[int(n * 0.75)]
        iqr = round(q3 - q1)
        return sorted_data, q1, q3, iqr
        
    # Find median, standard deviation, IQR and identify outliers
    def analyse_costs(self, data_dict):
        results = {}
        for bracket in data_dict:
            if len(data_dict[bracket]) > 0:
                costs = []
                for pair in data_dict[bracket]:
                    costs.append(pair[0])

                # get IQR and quartiles
                sorted_data, q1, q3, iqr = self.calculate_iqr(costs)

                # define the outlier criteria
                lower_fence = q1 - 1.5 * iqr
                upper_fence = q3 + 1.5 * iqr
                outliers = self.find_outliers(sorted_data, lower_fence, upper_fence)

                results[bracket] = {
                    'Mean': round(numpy.mean(costs), 1),
                    'Median': round(numpy.median(costs), 1),
                    'Standard Deviation': round(statistics.stdev(costs), 1),
                    'IQR': iqr,
                    'Outliers': outliers
                }
        return results
    
    # Returns dictionary containing all data from insurance.csv by column
    def create_dict(self):
        patient_dict = {}
        patient_dict['Ages'] = self.ages
        patient_dict['Sexes'] = self.sexes
        patient_dict['BMIs'] = self.bmis
        patient_dict['Number of Children'] = self.num_children
        patient_dict['Smoker Status'] = self.smoker
        patient_dict['Regions'] = self.regions
        patient_dict['Total Costs'] = self.costs
        return patient_dict

In [77]:
# Create instance of analyse_patient_data with data from insrance.csv
patients = analyse_patient_data(ages, sexes, bmis, num_children, smoker, regions, costs)
average_age = patients.average_age()
total_by_sex = patients.total_by_sex()
unique_regions = patients.unique_regions()
patients_dict = patients.create_dict()
print(unique_regions)

Average age of patients is 39 years
Total records for women: 662
Total records for men: 676
['northeast', 'northwest', 'southeast', 'southwest']


Sorting costs by different variables, starting with cost by region:

In [78]:
costs_by_bmi=list(zip(costs,bmis))
costs_by_age=list(zip(costs,ages))
costs_by_num_children=list(zip(costs,num_children))
costs_by_region=list(zip(regions,costs)) 

#Insurance costs sorted by region, I started with this because I suspected it might be particularly tricky.
regions_list=[]
for region in regions:
    if region not in regions_list:
        regions_list.append(region)
regions_list.sort()
print(regions_list)

costs_by_region_dict={region:[] for region in regions_list}

for patient in costs_by_region:
    for region in regions_list:
        if patient[0]==region:
            costs_by_region_dict[region].append(patient[1])

print(costs_by_region_dict)

#Could we identify regions which are the 'most expensive'?

['northeast', 'northwest', 'southeast', 'southwest']
{'northeast': [6406.41, 2721.32, 10797.34, 2395.17, 13228.85, 37701.88, 14451.84, 2198.19, 39774.28, 3046.06, 6079.67, 3393.36, 2211.13, 13607.37, 8606.22, 6799.46, 2755.02, 4441.21, 7935.29, 30184.94, 22412.65, 3645.09, 21344.85, 11488.32, 30260.0, 1705.62, 39556.49, 3385.4, 12815.44, 13616.36, 2457.21, 27375.9, 3490.55, 6334.34, 19964.75, 7077.19, 15518.18, 10407.09, 4827.9, 1694.8, 8538.29, 4005.42, 43753.34, 14901.52, 4337.74, 20984.09, 6610.11, 10564.88, 7358.18, 9225.26, 38511.63, 5354.07, 29523.17, 4040.56, 12829.46, 41097.16, 13047.33, 24869.84, 14590.63, 9282.48, 9617.66, 9715.84, 22331.57, 48549.18, 4237.13, 11879.1, 9432.93, 47896.79, 20277.81, 1704.57, 6746.74, 24873.38, 11944.59, 9722.77, 10435.07, 4667.61, 24671.66, 11566.3, 6600.21, 48517.56, 11658.38, 19144.58, 41919.1, 13217.09, 13981.85, 8334.46, 12404.88, 10043.25, 9778.35, 13430.26, 3481.87, 12029.29, 7639.42, 21659.93, 15006.58, 42303.69, 8302.54, 10736.87, 8964.

At least now we can find the average cost per region

In [79]:
def regional_average_cost(costs_by_region_dict,region):
    total_cost=0
    for cost in costs_by_region_dict[region]:
        total_cost+=float(cost)
    return round(total_cost/len(costs_by_region_dict[region]), 2)

#Testing the function
for region in regions_list:
    print('The average insurance cost in the {} region is'.format(region),regional_average_cost(costs_by_region_dict,region),'dollars')
    

The average insurance cost in the northeast region is 13406.38 dollars
The average insurance cost in the northwest region is 12417.58 dollars
The average insurance cost in the southeast region is 14735.41 dollars
The average insurance cost in the southwest region is 12346.94 dollars


Can we find the median cost per region?

In [80]:
# We could use the numpy module to reduce lines of code here
def regional_median_costs(costs_by_region_dict):
    median_per_region = {}
    for region in regions_list:
        sorted_costs = sorted(costs_by_region_dict[region])
        n = len(sorted_costs)
        middle = n // 2
        
        if n % 2 == 0:
            median = (sorted_costs[middle - 1] + sorted_costs[middle]) // 2
        else:
            median = sorted_costs[middle]
            
        median_per_region[region] = median
    return median_per_region

# Let modules do the calculation
def alternate_regional_median_costs(costs_by_region_dict):
    median_per_region = {}
    for region in regions_list:
        median_per_region[region]=numpy.median(costs_by_region_dict[region])
    return median_per_region
        
median_costs = regional_median_costs(costs_by_region_dict)
alt_median_costs = alternate_regional_median_costs(costs_by_region_dict)
print(median_costs)
print(alt_median_costs)
        

{'northeast': 10057.0, 'northwest': 8965.8, 'southeast': 9294.0, 'southwest': 8798.59}
{'northeast': 10057.654999999999, 'northwest': 8965.8, 'southeast': 9294.130000000001, 'southwest': 8798.59}


Can we find the standard deviation and the IQR? (Comparing these will at least suggest existence of outliers in the dataset)

In [81]:
# Find standard deviation of each region using 'statistics' module
cost_values = {}
for region in regions_list:
    print(round(statistics.stdev(costs_by_region_dict[region])))
    

11256
11072
13971
11557


In [82]:
# Display IQR, Standard Deviation and Outliers for each region
for region in regions_list:
    region_data = costs_by_region_dict[region]
    stdev = round(statistics.stdev(region_data))
    sorted_data, q1, q3, iqr = patients.calculate_iqr(region_data)
    
    # define the outlier criteria
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    outliers = patients.find_outliers(sorted_data, lower_fence, upper_fence)
    
    print(region.capitalize())
    print(f"standard deviation: {stdev}")
    print(f"IQR: {iqr}")
    print(f"difference: {round(abs(stdev - iqr))}")
    print(f"outliers: {outliers}")
    print("\n")

Northeast
standard deviation: 11256
IQR: 11567
difference: 311
outliers: [34254.05, 34617.84, 35069.37, 35147.53, 36189.1, 37270.15, 37607.53, 37701.88, 38511.63, 39047.29, 39125.33, 39556.49, 39597.41, 39774.28, 40904.2, 41034.22, 41097.16, 41919.1, 42111.66, 42303.69, 43254.42, 43753.34, 44641.2, 45710.21, 46255.11, 47896.79, 48517.56, 48549.18, 58571.07]


Northwest
standard deviation: 11072
IQR: 9992
difference: 1080
outliers: [30063.58, 30166.62, 30284.64, 32734.19, 32787.46, 33307.55, 33471.97, 33750.29, 33907.55, 36219.41, 36898.73, 37465.34, 38746.36, 39725.52, 39983.43, 40003.33, 40720.55, 42760.5, 42983.46, 43578.94, 43921.18, 43943.88, 45702.02, 46130.53, 46661.44, 46718.16, 47496.49, 55135.4, 60021.4]


Southeast
standard deviation: 13971
IQR: 15090
difference: 1119
outliers: [42211.14, 42560.43, 42969.85, 43813.87, 43896.38, 44202.65, 44260.75, 44400.41, 44423.8, 44501.4, 45008.96, 45863.21, 46151.12, 46200.99, 46599.11, 46889.26, 47055.53, 47269.85, 47462.89, 48673.56, 48

In [83]:
#Using BMI scale suggested by CDC
bmi_scale = {0:0.0,1:18.5,2:24.9,3:30} 

def sort_costs_by_bmi_bracket(costs,bmi_scale):
    costs_by_bmi_dict={'Underweight':[], 'Healthy':[], 'Overweight':[], 'Obese':[]}
    for bmi_cost_pair in costs_by_bmi:
        if bmi_cost_pair[1] <= bmi_scale[1]:
            costs_by_bmi_dict['Underweight'].append(bmi_cost_pair)
        elif bmi_cost_pair[1] > bmi_scale[1] and bmi_cost_pair[1] <= bmi_scale[2]:
            costs_by_bmi_dict['Healthy'].append(bmi_cost_pair)
        elif bmi_cost_pair[1] > bmi_scale[2] and bmi_cost_pair[1] <= bmi_scale[3]:
            costs_by_bmi_dict['Overweight'].append(bmi_cost_pair)
        elif bmi_cost_pair[1] > bmi_scale[3]:
            costs_by_bmi_dict['Obese'].append(bmi_cost_pair)
    return costs_by_bmi_dict

costs_by_bmi_dict= sort_costs_by_bmi_bracket(costs,bmi_scale)
print(costs_by_bmi_dict)

{'Underweight': [(2775.19, 17.4), (32734.19, 17.8), (1694.8, 16.0), (9644.25, 18.1), (1727.79, 17.8), (12829.46, 17.3), (15006.58, 18.0), (1621.34, 17.5), (14455.64, 17.2), (3167.46, 16.8), (2585.27, 17.4), (4766.02, 18.5), (2680.95, 17.7), (11534.87, 18.3), (6877.98, 17.3), (13204.29, 18.3), (19023.26, 18.3), (9991.04, 18.3), (5116.5, 17.9), (6640.54, 16.8), (3732.63, 17.3)], 'Healthy': [(21984.47, 22.7), (1837.24, 24.6), (2395.17, 23.8), (14451.84, 23.1), (1625.43, 20.4), (2302.3, 20.8), (6272.48, 21.8), (12629.9, 24.5), (23244.79, 22.9), (30166.62, 24.7), (14711.74, 22.4), (17663.14, 24.0), (16577.78, 24.8), (21098.55, 22.9), (10942.13, 24.8), (22412.65, 19.9), (15820.7, 19.3), (6686.43, 23.4), (1705.62, 23.8), (13616.36, 22.0), (27375.9, 22.4), (5125.22, 24.1), (19964.75, 23.4), (21223.68, 24.4), (4827.9, 18.9), (4005.42, 19.9), (13012.21, 24.0), (7147.1, 22.3), (2483.74, 23.1), (25081.77, 23.2), (19515.54, 24.6), (6710.19, 24.5), (19444.27, 22.2), (5354.07, 22.1), (1832.09, 20.9),

In [84]:
#Find average cost for each BMI category
def bmi_average_cost(costs_by_bmi_dict,bmi_category):
    total_cost=0
    for bmi_cost_pair in costs_by_bmi_dict[bmi_category]:
        total_cost+=bmi_cost_pair[0]
    return round(total_cost/len(costs_by_region_dict[region]), 2)

#Testing the function
for bmi_category in costs_by_bmi_dict.keys():
    print('The average insurance cost for {} people in the dataset is'.format(bmi_category),bmi_average_cost(costs_by_bmi_dict,bmi_category),'dollars')

underweight_avg = bmi_average_cost(costs_by_bmi_dict, 'Underweight')
healthy_avg = bmi_average_cost(costs_by_bmi_dict, 'Healthy')
overweight_avg = bmi_average_cost(costs_by_bmi_dict, 'Overweight')
obese_avg = bmi_average_cost(costs_by_bmi_dict, 'Obese')
print(f"\nThe difference between underweight and overweight average is {overweight_avg - underweight_avg}")

The average insurance cost for Underweight people in the dataset is 559.42 dollars
The average insurance cost for Healthy people in the dataset is 7075.33 dollars
The average insurance cost for Overweight people in the dataset is 13525.11 dollars
The average insurance cost for Obese people in the dataset is 33473.45 dollars

The difference between underweight and overweight average is 12965.69


Well that certainly suggests what we were expecting... Let's process the data a bit more to help provide more context to people analyzing this dataset.

In [85]:
bmi_costs_analysis_dict = patients.analyse_costs(costs_by_bmi_dict)
for item in bmi_costs_analysis_dict:
    bracket = bmi_costs_analysis_dict[item]
    mean = bracket['Mean']
    median = bracket['Median']
    stdev = bracket['Standard Deviation']
    iqr = bracket['IQR']
    outliers = bracket['Outliers']
    print(item)
    print(f"average cost: ${mean}")
    print(f"median cost: ${median}")
    print(f"standard deviation: {stdev}")
    print(f"IQR: {iqr}")
    print(f"outliers: {outliers}\n")

Underweight
average cost: $8657.6
median cost: $6640.5
standard deviation: 7591.7
IQR: 10054
outliers: [32734.19]

Healthy
average cost: $10404.9
median cost: $8604.5
standard deviation: 7508.2
IQR: 11327
outliers: [35069.37]

Overweight
average cost: $11044.4
median cost: $8659.4
standard deviation: 8109.0
IQR: 11168
outliers: [32787.46, 33307.55, 35147.53, 35160.13, 37829.72, 38245.59]

Obese
average cost: $15585.8
median cost: $10003.7
standard deviation: 14593.4
IQR: 14433
outliers: [41661.6, 41676.08, 41919.1, 41949.24, 41999.52, 42111.66, 42112.24, 42124.52, 42211.14, 42303.69, 42560.43, 42760.5, 42856.84, 42969.85, 42983.46, 43254.42, 43578.94, 43753.34, 43813.87, 43896.38, 43921.18, 43943.88, 44202.65, 44260.75, 44400.41, 44423.8, 44501.4, 44585.46, 44641.2, 45008.96, 45702.02, 45710.21, 45863.21, 46113.51, 46130.53, 46151.12, 46200.99, 46255.11, 46599.11, 46661.44, 46718.16, 46889.26, 47055.53, 47269.85, 47291.06, 47305.31, 47403.88, 47462.89, 47496.49, 47896.79, 47928.03, 481

In [86]:
#Age groups:
age_scale = {0:12,1:19,2:45,3:65} 
costs_by_age=list(zip(patients.costs,patients.ages))

def sort_costs_by_age_bracket(costs_by_age,age_scale):
    costs_by_age_dict={'Children':[], 'Adolescents':[], 'Young adults':[], 'Middle adults':[],'Older adults':[]}
    for age_cost_pair in costs_by_age:
        if age_cost_pair[1] <= age_scale[0]:
            costs_by_age_dict['Children'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[0] and age_cost_pair[1] <= age_scale[1]:
            costs_by_age_dict['Adolescents'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[1] and age_cost_pair[1] <= age_scale[2]:
            costs_by_age_dict['Young adults'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[2] and age_cost_pair[1] <= age_scale[3]:
            costs_by_age_dict['Middle adults'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[3]:
            costs_by_age_dict['Older adults'].append(age_cost_pair)
    return costs_by_age_dict

costs_by_age_dict= sort_costs_by_age_bracket(costs_by_age,age_scale)
print(costs_by_age_dict['Middle adults']) #Testing the code

[(8240.59, 46), (28923.14, 60), (27808.73, 62), (11090.72, 56), (10797.34, 52), (10602.39, 56), (13228.85, 60), (14001.13, 59), (14451.84, 63), (12268.63, 55), (13770.1, 63), (15612.19, 62), (48173.36, 60), (20630.28, 55), (12629.9, 60), (23568.27, 48), (47496.49, 58), (13607.37, 58), (23244.79, 53), (30166.62, 64), (14235.07, 61), (11741.73, 53), (11946.63, 58), (11356.66, 57), (11033.66, 48), (43578.94, 57), (11073.18, 56), (8026.67, 46), (11082.58, 55), (10942.13, 53), (30184.94, 59), (47291.06, 64), (12105.32, 54), (10226.28, 55), (22412.65, 56), (30942.19, 61), (47055.53, 63), (10825.25, 54), (11881.36, 55), (11488.32, 52), (30260.0, 60), (11381.33, 58), (8601.33, 49), (10115.01, 47), (9634.54, 52), (12815.44, 59), (13616.36, 61), (11163.57, 53), (27322.73, 54), (40720.55, 46), (9877.61, 51), (10959.69, 53), (7789.64, 48), (21223.68, 48), (19749.38, 50), (10450.55, 54), (10407.09, 47), (13405.39, 63), (8116.68, 49), (48824.45, 63), (10436.1, 54), (8823.28, 46), (11735.88, 58), (12

In [87]:
age_costs_analysis_dict = patients.analyse_costs(costs_by_age_dict)
for item in age_costs_analysis_dict:
    bracket = age_costs_analysis_dict[item]
    if len(bracket)>=2:
        mean = bracket['Mean']
        median = bracket['Median']
        stdev = bracket['Standard Deviation']
        iqr = bracket['IQR']
        outliers = bracket['Outliers']
        print(item)
        print(f"average cost: ${mean}")
        print(f"median: {median}")
        print(f"standard deviation: {stdev}")
        print(f"IQR: {iqr}")
        print(f"outliers: {outliers}\n")
    else:
        print(item)
        print('Data insufficient')

Adolescents
average cost: $8407.3
median: 2138.1
standard deviation: 11418.8
IQR: 12113
outliers: [32548.34, 33307.55, 33732.69, 33750.29, 34303.17, 34439.86, 34617.84, 34779.61, 34828.65, 36149.48, 36219.41, 36307.8, 36397.58, 36898.73, 38792.69, 39722.75]

Young adults
average cost: $11417.7
median: 6272.5
standard deviation: 11897.8
IQR: 12581
outliers: [35491.64, 35585.58, 35595.59, 36021.01, 36085.22, 36124.57, 36189.1, 36197.7, 36837.47, 36950.26, 37079.37, 37133.9, 37165.16, 37270.15, 37465.34, 37484.45, 37607.53, 37701.88, 37742.58, 37829.72, 38126.25, 38245.59, 38282.75, 38344.57, 38415.47, 38511.63, 38709.18, 38711.0, 38746.36, 38998.55, 39047.29, 39125.33, 39241.44, 39556.49, 39597.41, 39611.76, 39725.52, 39774.28, 39836.52, 39871.7, 39983.43, 40003.33, 40103.89, 40182.25, 40273.65, 40419.02, 40904.2, 40932.43, 40941.29, 41034.22, 41949.24, 42112.24, 42124.52, 42560.43, 42760.5, 42983.46, 43753.34, 43896.38, 43943.88, 44501.4, 44585.46, 45863.21, 46113.51, 46200.99, 48885.14

Now we can compare costs between smokers and non-smokers

In [88]:
costs_by_smoker=list(zip(patients.costs,patients.smoker))

def sort_costs_by_smoker(costs_by_smoker):
    costs_by_smoker_dict={'Smoker':[],'Non-smoker':[]}
    for cost_smoker_pair in costs_by_smoker:
        if cost_smoker_pair[1]=='yes':
            costs_by_smoker_dict['Smoker'].append(cost_smoker_pair)
        elif cost_smoker_pair[1]=='no':
            costs_by_smoker_dict['Non-smoker'].append(cost_smoker_pair)
    return costs_by_smoker_dict


        
costs_by_smoker_dict=sort_costs_by_smoker(costs_by_smoker)       

smoking_costs_analysis_dict = patients.analyse_costs(costs_by_smoker_dict)
for item in smoking_costs_analysis_dict:
    bracket = smoking_costs_analysis_dict[item]
    if len(bracket)>=2:
        mean = bracket['Mean']
        median = bracket['Median']
        stdev = bracket['Standard Deviation']
        iqr = bracket['IQR']
        outliers = bracket['Outliers']
        print(item)
        print(f"average cost: ${mean}")
        print(f"median: ${median}")
        print(f"standard deviation: {stdev}")
        print(f"IQR: {iqr}")
        print(f"outliers: {outliers}\n")
    else:
        print(item)
        print('Data insufficient')

Smoker
average cost: $32050.2
median: $34456.4
standard deviation: 11541.5
IQR: 20261
outliers: []

Non-smoker
average cost: $8434.3
median: $7345.4
standard deviation: 5993.8
IQR: 7375
outliers: [22493.66, 23045.57, 23082.96, 23241.47, 23288.93, 23563.02, 24059.68, 24227.34, 24476.48, 24513.09, 24603.05, 24671.66, 24915.05, 25081.77, 25333.33, 25517.11, 25656.58, 25992.82, 26018.95, 26140.36, 26236.58, 26392.26, 26467.1, 27000.98, 27117.99, 27322.73, 27346.04, 27375.9, 27724.29, 27941.29, 28287.9, 28340.19, 28468.92, 28476.73, 28923.14, 29186.48, 30063.58, 30166.62, 30260.0, 30284.64, 31620.0, 32108.66, 33471.97, 35160.13, 36580.28, 36910.61]



In [93]:
#Create dictionary keys by iterating over distinct numbers of kids found in data.
num_children_dict_unsorted={}
for num in patients.num_children:
    if num_children_dict_unsorted.get(num) is None:
        num_children_dict_unsorted[num]=[]

num_children_dict = {key:num_children_dict_unsorted[key] for key in sorted(num_children_dict_unsorted.keys())}

costs_by_children=list(zip(patients.costs,patients.num_children))

def sort_costs_by_children_bracket(costs_by_children,num_children_dict):
    num_children_dict_completed=num_children_dict
    for child_cost_pair in costs_by_children:
        for num in num_children_dict.keys():
            if child_cost_pair[1] == num:
                num_children_dict_completed[num].append(child_cost_pair)
    return num_children_dict_completed

costs_by_children_dict_completed=sort_costs_by_children_bracket(costs_by_children,num_children_dict)
costs_by_children_analysis_dict=patients.analyse_costs(costs_by_children_dict_completed)

for item in costs_by_children_analysis_dict:
    bracket = costs_by_children_analysis_dict[item]
    if len(bracket)>=2:
        mean = bracket['Mean']
        median = bracket['Median']
        stdev = bracket['Standard Deviation']
        iqr = bracket['IQR']
        outliers = bracket['Outliers']
        print(item,'kids')
        print(f"average cost: ${mean}")
        print(f"median: ${median}")
        print(f"standard deviation: {stdev}")
        print(f"IQR: {iqr}")
        print(f"outliers: {outliers}\n")
    else:
        print(item)
        print('Data insufficient')



0 kids
average cost: $12366.0
median: $9857.0
standard deviation: 12023.3
IQR: 11718
outliers: [32548.34, 33307.55, 33475.82, 33732.69, 33750.29, 33900.65, 33907.55, 34166.27, 34254.05, 34439.86, 34472.84, 34617.84, 34672.15, 34779.61, 34828.65, 34838.87, 35069.37, 35147.53, 35491.64, 35585.58, 36149.48, 36197.7, 36219.41, 36307.8, 36837.47, 36898.73, 36950.26, 37079.37, 37133.9, 37270.15, 37742.58, 37829.72, 38126.25, 38792.69, 39611.76, 39722.75, 39727.61, 40419.02, 40974.16, 41097.16, 41676.08, 42111.66, 42303.69, 42983.46, 43254.42, 43578.94, 43921.18, 44400.41, 44423.8, 45008.96, 45710.21, 45863.21, 46599.11, 46889.26, 47055.53, 48173.36, 48673.56, 48824.45, 48885.14, 52590.83, 55135.4, 62592.87, 63770.43]

1 kids
average cost: $12731.2
median: $8483.9
standard deviation: 11823.6
IQR: 11655
outliers: [34806.47, 35160.13, 37165.16, 37607.53, 37701.88, 38245.59, 38282.75, 38709.18, 39047.29, 39125.33, 39556.49, 39725.52, 39774.28, 39871.7, 40273.65, 40904.2, 41034.22, 41661.6, 41919