# U.S. Medical Insurance Costs

In this project, we will be using Python fundamentals to explore a CSV file containing medical insurance costs. The objective is to analyze different attributes in the "insurance.csv" file, extract information about the patients, and discover potential applications for the dataset.

By examining this data, we aim to gain insights and better understand the various aspects of insurance costs and patient characteristics.

In [1]:
import csv
import numpy
import statistics

In [2]:
ages = []
sexes = []
bmis = []
num_children = []
smoker = []
regions = []
costs = []

In [3]:
def populate_list(list, source, column):
    with open(source, newline='') as file:
        csvreader = csv.DictReader(file)
        for row in csvreader:
            list.append(row[column])
        return list

Using the helper function populate_list() we can load up our lists with data.

In [4]:
populate_list(ages, 'insurance.csv', 'age')
populate_list(sexes, 'insurance.csv', 'sex')
populate_list(bmis, 'insurance.csv', 'bmi')
populate_list(num_children, 'insurance.csv', 'children')
populate_list(smoker, 'insurance.csv', 'smoker')
populate_list(regions, 'insurance.csv', 'region')
populate_list(costs, 'insurance.csv', 'charges')

['16884.924',
 '1725.5523',
 '4449.462',
 '21984.47061',
 '3866.8552',
 '3756.6216',
 '8240.5896',
 '7281.5056',
 '6406.4107',
 '28923.13692',
 '2721.3208',
 '27808.7251',
 '1826.843',
 '11090.7178',
 '39611.7577',
 '1837.237',
 '10797.3362',
 '2395.17155',
 '10602.385',
 '36837.467',
 '13228.84695',
 '4149.736',
 '1137.011',
 '37701.8768',
 '6203.90175',
 '14001.1338',
 '14451.83515',
 '12268.63225',
 '2775.19215',
 '38711',
 '35585.576',
 '2198.18985',
 '4687.797',
 '13770.0979',
 '51194.55914',
 '1625.43375',
 '15612.19335',
 '2302.3',
 '39774.2763',
 '48173.361',
 '3046.062',
 '4949.7587',
 '6272.4772',
 '6313.759',
 '6079.6715',
 '20630.28351',
 '3393.35635',
 '3556.9223',
 '12629.8967',
 '38709.176',
 '2211.13075',
 '3579.8287',
 '23568.272',
 '37742.5757',
 '8059.6791',
 '47496.49445',
 '13607.36875',
 '34303.1672',
 '23244.7902',
 '5989.52365',
 '8606.2174',
 '4504.6624',
 '30166.61817',
 '4133.64165',
 '14711.7438',
 '1743.214',
 '14235.072',
 '6389.37785',
 '5920.1041',
 '176

Next we will want to consider if cleaning data and reformatting is necessary. It looks like rounding BMI to one decimal and rounding costs to two decimals makes sense.

In [5]:
# clean bmis list, converting strings to float and rounding nearest decimal
bmis = [round(float(num), 1) for num in bmis]
# clean costs list, converting strings to float and rounding nearest 2 decimals
costs= [round(float(num), 2) for num in costs]
# convert ages strings to ints
ages= [int(age) for age in ages]

Build a class to run initial analyses of data including average age, how many men and women, what region most patients are from, and costs related to smoker status.

In [6]:
class analyse_patient_data:
    def __init__(self, ages, sexes, bmis, num_children, smoker, regions, costs):
        self.ages = ages
        self.sexes = sexes
        self.bmis = bmis
        self.num_children = num_children
        self.smoker = smoker
        self.regions = regions
        self.costs = costs

    def average_age(self):
        total = 0
        for age in self.ages:
            total+=int(age)
        print(f"Average age of patients is {total // len(self.ages)} years")
        
    def total_by_sex(self):
        total_women = 0
        total_men = 0
        for sex in self.sexes:
            if sex == "female":
                total_women+=1
            else:
                total_men+=1
        print(f"Total records for women: {total_women}\nTotal records for men: {total_men}")
    
    def unique_regions(self):
        regions_list=[]
        for region in self.regions:
            if region not in regions_list:
                regions_list.append(region)
        regions_list.sort()
        return regions_list
        
    # Not sure this will go here; may be better lower down in further analysis
    def most_popular_region(self):
        pass
    
    # Returns dictionary containing all data from insurance.csv by column
    def create_dict(self):
        patient_dict = {}
        patient_dict['Ages'] = self.ages
        patient_dict['Sexes'] = self.sexes
        patient_dict['BMIs'] = self.bmis
        patient_dict['Number of Children'] = self.num_children
        patient_dict['Smoker Status'] = self.smoker
        patient_dict['Regions'] = self.regions
        patient_dict['Total Costs'] = self.costs
        return patient_dict

In [7]:
# Create instance of analyse_patient_data with data from insrance.csv
patients = analyse_patient_data(ages, sexes, bmis, num_children, smoker, regions, costs)
patients.average_age()
patients.total_by_sex()
patients.unique_regions()
patients.create_dict()

Average age of patients is 39 years
Total records for women: 662
Total records for men: 676


{'Ages': [19,
  18,
  28,
  33,
  32,
  31,
  46,
  37,
  37,
  60,
  25,
  62,
  23,
  56,
  27,
  19,
  52,
  23,
  56,
  30,
  60,
  30,
  18,
  34,
  37,
  59,
  63,
  55,
  23,
  31,
  22,
  18,
  19,
  63,
  28,
  19,
  62,
  26,
  35,
  60,
  24,
  31,
  41,
  37,
  38,
  55,
  18,
  28,
  60,
  36,
  18,
  21,
  48,
  36,
  40,
  58,
  58,
  18,
  53,
  34,
  43,
  25,
  64,
  28,
  20,
  19,
  61,
  40,
  40,
  28,
  27,
  31,
  53,
  58,
  44,
  57,
  29,
  21,
  22,
  41,
  31,
  45,
  22,
  48,
  37,
  45,
  57,
  56,
  46,
  55,
  21,
  53,
  59,
  35,
  64,
  28,
  54,
  55,
  56,
  38,
  41,
  30,
  18,
  61,
  34,
  20,
  19,
  26,
  29,
  63,
  54,
  55,
  37,
  21,
  52,
  60,
  58,
  29,
  49,
  37,
  44,
  18,
  20,
  44,
  47,
  26,
  19,
  52,
  32,
  38,
  59,
  61,
  53,
  19,
  20,
  22,
  19,
  22,
  54,
  22,
  34,
  26,
  34,
  29,
  30,
  29,
  46,
  51,
  53,
  19,
  35,
  48,
  32,
  42,
  40,
  44,
  48,
  18,
  30,
  50,
  42,
  18,
  54,
  32,
  37,
  

Sorting costs by different variables, starting with cost by region:

In [8]:
costs_by_region=list(zip(patients.regions,patients.costs)) 

#Insurance costs sorted by region, I started with this because I suspected it might be particularly tricky.
#regions_list=[]
#for region in regions:
#    if region not in regions_list:
#        regions_list.append(region)
#regions_list.sort()
#print(regions_list)

costs_by_region_dict={region:[] for region in patients.unique_regions()}

for patient in costs_by_region:
    for region in patients.unique_regions():
        if patient[0]==region:
            costs_by_region_dict[region].append(patient[1])

print(costs_by_region_dict)

#Could we identify regions which are the 'most expensive'?

{'northeast': [6406.41, 2721.32, 10797.34, 2395.17, 13228.85, 37701.88, 14451.84, 2198.19, 39774.28, 3046.06, 6079.67, 3393.36, 2211.13, 13607.37, 8606.22, 6799.46, 2755.02, 4441.21, 7935.29, 30184.94, 22412.65, 3645.09, 21344.85, 11488.32, 30260.0, 1705.62, 39556.49, 3385.4, 12815.44, 13616.36, 2457.21, 27375.9, 3490.55, 6334.34, 19964.75, 7077.19, 15518.18, 10407.09, 4827.9, 1694.8, 8538.29, 4005.42, 43753.34, 14901.52, 4337.74, 20984.09, 6610.11, 10564.88, 7358.18, 9225.26, 38511.63, 5354.07, 29523.17, 4040.56, 12829.46, 41097.16, 13047.33, 24869.84, 14590.63, 9282.48, 9617.66, 9715.84, 22331.57, 48549.18, 4237.13, 11879.1, 9432.93, 47896.79, 20277.81, 1704.57, 6746.74, 24873.38, 11944.59, 9722.77, 10435.07, 4667.61, 24671.66, 11566.3, 6600.21, 48517.56, 11658.38, 19144.58, 41919.1, 13217.09, 13981.85, 8334.46, 12404.88, 10043.25, 9778.35, 13430.26, 3481.87, 12029.29, 7639.42, 21659.93, 15006.58, 42303.69, 8302.54, 10736.87, 8964.06, 9290.14, 9411.0, 11396.9, 19594.81, 14455.64, 391

At least now we can find the average cost per region

In [9]:
def regional_average_cost(costs_by_region_dict,region):
    total_cost=0
    for cost in costs_by_region_dict[region]:
        total_cost+=float(cost)
    return round(total_cost/len(costs_by_region_dict[region]), 2)

#Testing the function
for region in patients.unique_regions():
    print('The average insurance cost in the {} region is'.format(region),regional_average_cost(costs_by_region_dict,region),'dollars')
    

The average insurance cost in the northeast region is 13406.38 dollars
The average insurance cost in the northwest region is 12417.58 dollars
The average insurance cost in the southeast region is 14735.41 dollars
The average insurance cost in the southwest region is 12346.94 dollars


Can we find the median cost per region?

In [10]:
# We could use the numpy module to reduce lines of code here
def regional_median_costs(costs_by_region_dict):
    median_per_region = {}
    for region in patients.unique_regions():
        sorted_costs = sorted(costs_by_region_dict[region])
        n = len(sorted_costs)
        middle = n // 2
        
        if n % 2 == 0:
            median = (sorted_costs[middle - 1] + sorted_costs[middle]) // 2
        else:
            median = sorted_costs[middle]
            
        median_per_region[region] = median
    return median_per_region

def alternate_regional_median_costs(costs_by_region_dict):
    median_per_region = {}
    for region in patients.unique_regions():
        median_per_region[region]=numpy.median(costs_by_region_dict[region])
    return median_per_region
        
median_costs = regional_median_costs(costs_by_region_dict)
alt_median_costs = alternate_regional_median_costs(costs_by_region_dict)
print(median_costs)
print(alt_median_costs)
        

{'northeast': 10057.0, 'northwest': 8965.8, 'southeast': 9294.0, 'southwest': 8798.59}
{'northeast': 10057.654999999999, 'northwest': 8965.8, 'southeast': 9294.130000000001, 'southwest': 8798.59}


Can we find the standard deviation and the IQR? (Comparing these will at least suggest existence of outliers in the dataset)

In [11]:
# Find standard deviation of each region using 'statistics' module
cost_values = {}
for region in patients.unique_regions():
    print(round(statistics.stdev(costs_by_region_dict[region])))
    

11256
11072
13971
11557


In [12]:
# Helper functions 
def find_outliers(data, lower_fence, upper_fence):
    outliers = [value for value in data if value < lower_fence or value > upper_fence]
    return outliers

def calculate_iqr(data):
    sorted_data = sorted(data)
    n = len(sorted_data)
    q3,q1=numpy.percentile(n,[75 ,25])
    #q1 = sorted_data[int(n * 0.25)]
    #q3 = sorted_data[int(n * 0.75)]
    iqr = round(q3 - q1)
    return sorted_data, q1, q3, iqr


In [13]:
# Display IQR, Standard Deviation and Outliers for each region
for region in patients.unique_regions():
    region_data = costs_by_region_dict[region]
    stdev = round(statistics.stdev(region_data))
    sorted_data, q1, q3, iqr = calculate_iqr(region_data)
    
    # define the outlier criteria
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    outliers = find_outliers(sorted_data, lower_fence, upper_fence)
    
    print(region.capitalize())
    print(f"standard deviation: {stdev}")
    print(f"IQR: {iqr}")
    print(f"difference: {round(abs(stdev - iqr))}")
    print(f"outliers: {outliers}")
    print("\n")

Northeast
standard deviation: 11256
IQR: 0
difference: 11256
outliers: [1694.8, 1702.46, 1704.57, 1704.7, 1705.62, 1708.0, 1708.93, 1712.23, 1967.02, 1984.45, 2102.26, 2104.11, 2196.47, 2198.19, 2200.83, 2203.47, 2203.74, 2205.98, 2207.7, 2211.13, 2217.47, 2217.6, 2250.84, 2254.8, 2395.17, 2396.1, 2457.21, 2459.72, 2585.85, 2721.32, 2727.4, 2731.91, 2741.95, 2755.02, 2897.32, 2899.49, 3044.21, 3046.06, 3070.81, 3167.46, 3206.49, 3213.62, 3268.85, 3279.87, 3309.79, 3385.4, 3393.36, 3481.87, 3490.55, 3558.62, 3645.09, 3732.63, 3857.76, 3861.21, 3925.76, 3943.6, 4005.42, 4040.56, 4137.52, 4237.13, 4337.74, 4347.02, 4428.89, 4435.09, 4438.26, 4441.21, 4500.34, 4518.83, 4544.23, 4561.19, 4564.19, 4661.29, 4667.61, 4718.2, 4719.52, 4827.9, 4846.92, 4915.06, 4992.38, 5031.27, 5148.55, 5209.58, 5227.99, 5354.07, 5428.73, 5458.05, 5469.01, 5594.85, 5757.41, 6067.13, 6079.67, 6113.23, 6334.34, 6402.29, 6406.41, 6551.75, 6555.07, 6600.21, 6610.11, 6640.54, 6746.74, 6748.59, 6753.04, 6799.46, 6837

In [14]:
#Using BMI scale suggested by CDC
bmi_scale = {0:0.0,1:18.5,2:24.9,3:30} 
costs_by_bmi=list(zip(patients.costs,patients.bmis))

def sort_costs_by_bmi_bracket(costs_by_bmi,bmi_scale):
    costs_by_bmi_dict={'Underweight':[], 'Healthy':[], 'Overweight':[], 'Obese':[]}
    for bmi_cost_pair in costs_by_bmi:
        if bmi_cost_pair[1] <= bmi_scale[1]:
            costs_by_bmi_dict['Underweight'].append(bmi_cost_pair)
        elif bmi_cost_pair[1] > bmi_scale[1] and bmi_cost_pair[1] <= bmi_scale[2]:
            costs_by_bmi_dict['Healthy'].append(bmi_cost_pair)
        elif bmi_cost_pair[1] > bmi_scale[2] and bmi_cost_pair[1] <= bmi_scale[3]:
            costs_by_bmi_dict['Overweight'].append(bmi_cost_pair)
        elif bmi_cost_pair[1] > bmi_scale[3]:
            costs_by_bmi_dict['Obese'].append(bmi_cost_pair)
    return costs_by_bmi_dict

costs_by_bmi_dict= sort_costs_by_bmi_bracket(costs_by_bmi,bmi_scale)
print(costs_by_bmi)
print(costs_by_bmi_dict['Obese'])

[(16884.92, 27.9), (1725.55, 33.8), (4449.46, 33.0), (21984.47, 22.7), (3866.86, 28.9), (3756.62, 25.7), (8240.59, 33.4), (7281.51, 27.7), (6406.41, 29.8), (28923.14, 25.8), (2721.32, 26.2), (27808.73, 26.3), (1826.84, 34.4), (11090.72, 39.8), (39611.76, 42.1), (1837.24, 24.6), (10797.34, 30.8), (2395.17, 23.8), (10602.39, 40.3), (36837.47, 35.3), (13228.85, 36.0), (4149.74, 32.4), (1137.01, 34.1), (37701.88, 31.9), (6203.9, 28.0), (14001.13, 27.7), (14451.84, 23.1), (12268.63, 32.8), (2775.19, 17.4), (38711.0, 36.3), (35585.58, 35.6), (2198.19, 26.3), (4687.8, 28.6), (13770.1, 28.3), (51194.56, 36.4), (1625.43, 20.4), (15612.19, 33.0), (2302.3, 20.8), (39774.28, 36.7), (48173.36, 39.9), (3046.06, 26.6), (4949.76, 36.6), (6272.48, 21.8), (6313.76, 30.8), (6079.67, 37.0), (20630.28, 37.3), (3393.36, 38.7), (3556.92, 34.8), (12629.9, 24.5), (38709.18, 35.2), (2211.13, 35.6), (3579.83, 33.6), (23568.27, 28.0), (37742.58, 34.4), (8059.68, 28.7), (47496.49, 37.0), (13607.37, 31.8), (34303.1

In [15]:
#Find average cost for each BMI category
def bmi_average_cost(costs_by_bmi_dict,bmi_category):
    total_cost=0
    if len(costs_by_bmi_dict[bmi_category])!=0:
        for bmi_cost_pair in costs_by_bmi_dict[bmi_category]:
            total_cost+=bmi_cost_pair[0]
        return round(total_cost/len(costs_by_bmi_dict[bmi_category]), 2)
    else:
        return 'N/A'

#Testing the function
for bmi_category in costs_by_bmi_dict.keys():
    print('The average insurance cost for {} people in the dataset is'.format(bmi_category),bmi_average_cost(costs_by_bmi_dict,bmi_category),'dollars')

underweight_avg = bmi_average_cost(costs_by_bmi_dict, 'Underweight')
healthy_avg = bmi_average_cost(costs_by_bmi_dict, 'Healthy')
overweight_avg = bmi_average_cost(costs_by_bmi_dict, 'Overweight')
obese_avg = bmi_average_cost(costs_by_bmi_dict, 'Obese')
print(f"\nThe difference between underweight and overweight average is {overweight_avg - underweight_avg}")

The average insurance cost for Underweight people in the dataset is 8657.62 dollars
The average insurance cost for Healthy people in the dataset is 10404.9 dollars
The average insurance cost for Overweight people in the dataset is 11044.37 dollars
The average insurance cost for Obese people in the dataset is 15585.78 dollars

The difference between underweight and overweight average is 2386.75


Well that certainly suggests what we were expecting... Let's process the data a bit more to help provide more context to people analyzing this dataset.

In [16]:
# Find median, standard deviation, IQR and identify outliers

def analyse_bmi_costs(costs_by_bmi):
    brackets = {}
    for bracket in costs_by_bmi:
        costs = []
        for cost_bmi_pair in costs_by_bmi_dict[bracket]:
            costs.append(cost_bmi_pair[0])
            
        # get IQR and quartiles
        sorted_data, q1, q3, iqr = calculate_iqr(costs)
        
        # define the outlier criteria
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        outliers = find_outliers(sorted_data, lower_fence, upper_fence)
        
        brackets[bracket] = {
            'Median': round(numpy.median(costs), 1),
            'Standard Deviation': round(statistics.stdev(costs), 1),
            'IQR': iqr,
            'Outliers': outliers
        }
    return brackets

bmi_costs_analysis_dict = analyse_bmi_costs(costs_by_bmi_dict)
for item in bmi_costs_analysis_dict:
    bracket = bmi_costs_analysis_dict[item]
    median = bracket['Median']
    stdev = bracket['Standard Deviation']
    iqr = bracket['IQR']
    outliers = bracket['Outliers']
    print(item)
    print(f"median: {median}")
    print(f"standard deviation: {stdev}")
    print(f"IQR: {iqr}")
    print(f"outliers: {outliers}\n")

Underweight
median: 6640.5
standard deviation: 7591.7
IQR: 0
outliers: [1621.34, 1694.8, 1727.79, 2585.27, 2680.95, 2775.19, 3167.46, 3732.63, 4766.02, 5116.5, 6640.54, 6877.98, 9644.25, 9991.04, 11534.87, 12829.46, 13204.29, 14455.64, 15006.58, 19023.26, 32734.19]

Healthy
median: 8604.5
standard deviation: 7508.2
IQR: 0
outliers: [1121.87, 1241.57, 1242.26, 1242.82, 1515.34, 1607.51, 1625.43, 1627.28, 1628.47, 1702.46, 1704.57, 1704.7, 1705.62, 1711.03, 1728.9, 1731.68, 1737.38, 1832.09, 1837.24, 1964.78, 1969.61, 2103.08, 2117.34, 2150.47, 2201.1, 2302.3, 2352.97, 2395.17, 2396.1, 2457.5, 2483.74, 2527.82, 2585.85, 2709.11, 2709.24, 2731.91, 2803.7, 2842.76, 2913.57, 2974.13, 3077.1, 3176.29, 3176.82, 3180.51, 3206.49, 3208.79, 3260.2, 3353.47, 3378.91, 3484.33, 3561.89, 3594.17, 3847.67, 3861.21, 4005.42, 4032.24, 4134.08, 4185.1, 4296.27, 4391.65, 4428.89, 4500.34, 4529.48, 4544.23, 4718.2, 4719.52, 4719.74, 4827.9, 4906.41, 4931.65, 4992.38, 5080.1, 5125.22, 5209.58, 5257.51, 535

The next step: finding the average insurance cost for each age group.

In [17]:
#Age groups:
age_scale = {0:12,1:19,2:45,3:65} 
costs_by_age=list(zip(patients.costs,patients.ages))

def sort_costs_by_age_bracket(costs_by_age,age_scale):
    costs_by_age_dict={'Children':[], 'Adolescents':[], 'Young adults':[], 'Middle adults':[],'Older adults':[]}
    for age_cost_pair in costs_by_age:
        if age_cost_pair[1] <= age_scale[0]:
            costs_by_age_dict['Children'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[0] and age_cost_pair[1] <= age_scale[1]:
            costs_by_age_dict['Adolescents'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[1] and age_cost_pair[1] <= age_scale[2]:
            costs_by_age_dict['Young adults'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[2] and age_cost_pair[1] <= age_scale[3]:
            costs_by_age_dict['Middle adults'].append(age_cost_pair)
        elif age_cost_pair[1] > age_scale[3]:
            costs_by_age_dict['Older adults'].append(age_cost_pair)
    return costs_by_age_dict

costs_by_age_dict= sort_costs_by_age_bracket(costs_by_age,age_scale)
print(costs_by_age_dict['Middle adults']) #Testing the code

[(8240.59, 46), (28923.14, 60), (27808.73, 62), (11090.72, 56), (10797.34, 52), (10602.39, 56), (13228.85, 60), (14001.13, 59), (14451.84, 63), (12268.63, 55), (13770.1, 63), (15612.19, 62), (48173.36, 60), (20630.28, 55), (12629.9, 60), (23568.27, 48), (47496.49, 58), (13607.37, 58), (23244.79, 53), (30166.62, 64), (14235.07, 61), (11741.73, 53), (11946.63, 58), (11356.66, 57), (11033.66, 48), (43578.94, 57), (11073.18, 56), (8026.67, 46), (11082.58, 55), (10942.13, 53), (30184.94, 59), (47291.06, 64), (12105.32, 54), (10226.28, 55), (22412.65, 56), (30942.19, 61), (47055.53, 63), (10825.25, 54), (11881.36, 55), (11488.32, 52), (30260.0, 60), (11381.33, 58), (8601.33, 49), (10115.01, 47), (9634.54, 52), (12815.44, 59), (13616.36, 61), (11163.57, 53), (27322.73, 54), (40720.55, 46), (9877.61, 51), (10959.69, 53), (7789.64, 48), (21223.68, 48), (19749.38, 50), (10450.55, 54), (10407.09, 47), (13405.39, 63), (8116.68, 49), (48824.45, 63), (10436.1, 54), (8823.28, 46), (11735.88, 58), (12

In [18]:
#Find average cost for each age category
def age_average_cost(costs_by_age_dict,age_category):
    total_cost=0
    if len(costs_by_age_dict[age_category]) !=0:
        for age_cost_pair in costs_by_age_dict[age_category]:
            total_cost+=age_cost_pair[0]
        return round(total_cost/len(costs_by_age_dict[age_category]), 2)
    else:
        return 'N/A'

#Testing the function
for age_category in costs_by_age_dict.keys():
    print('The average insurance cost for {} in the dataset is'.format(age_category),age_average_cost(costs_by_age_dict,age_category),'dollars')

child_avg = age_average_cost(costs_by_age_dict, 'Children')
adolescent_avg = age_average_cost(costs_by_age_dict, 'Adolescents')
young_adult_avg = age_average_cost(costs_by_age_dict, 'Young adults')
middle_adult_avg = age_average_cost(costs_by_age_dict, 'Middle adults')
older_adult_avg = age_average_cost(costs_by_age_dict, 'Older adults')

The average insurance cost for Children in the dataset is N/A dollars
The average insurance cost for Adolescents in the dataset is 8407.35 dollars
The average insurance cost for Young adults in the dataset is 11417.69 dollars
The average insurance cost for Middle adults in the dataset is 17200.43 dollars
The average insurance cost for Older adults in the dataset is N/A dollars


In [19]:
# Display IQR, Standard Deviation and Outliers for each age category
for age_category in costs_by_age_dict.keys():
    age_data = costs_by_age_dict[age_category]
    if len(age_data)<2:
        print(age_category.capitalize())
        print('Insufficient data')
        
    else:
        #something that works please help
        pass


Children
Insufficient data
Older adults
Insufficient data


Now for comparing smoker vs non-smoker

In [20]:
costs_by_smoker_dict={'Smoker':[],'Non smoker':[]}
costs_by_smoker=list(zip(patients.costs,patients.smoker))
for cost_pair in costs_by_smoker:
    if cost_pair[1]=='yes':
        costs_by_smoker_dict['Smoker'].append(cost_pair[0])
    elif cost_pair[1]=='no':
        costs_by_smoker_dict['Non smoker'].append(cost_pair[0])
print('Smokers',costs_by_smoker_dict['Smoker']) 
print('Non-smokers',costs_by_smoker_dict['Non smoker']) 
total_cost_smokers=0
total_cost_non_smokers=0
for cost in costs_by_smoker_dict['Smoker']:
    total_cost_smokers+=cost
for cost in costs_by_smoker_dict['Non smoker']:
    total_cost_non_smokers+=cost
print('The average cost for smokers is',round(total_cost_smokers/len(costs_by_smoker_dict['Smoker'])),'dollars')
print('The average cost for non smokers is',round(total_cost_non_smokers/len(costs_by_smoker_dict['Non smoker'])),'dollars')



Smokers [16884.92, 27808.73, 39611.76, 36837.47, 37701.88, 38711.0, 35585.58, 51194.56, 39774.28, 48173.36, 38709.18, 23568.27, 37742.58, 47496.49, 34303.17, 23244.79, 14711.74, 17663.14, 16577.78, 37165.16, 39836.52, 21098.55, 43578.94, 30184.94, 47291.06, 22412.65, 15820.7, 30942.19, 17560.38, 47055.53, 19107.78, 39556.49, 17081.08, 32734.19, 18972.49, 20745.99, 40720.55, 19964.75, 21223.68, 15518.18, 36950.26, 21348.71, 36149.48, 48824.45, 43753.34, 37133.9, 20984.09, 34779.61, 19515.54, 19444.27, 17352.68, 38511.63, 29523.17, 12829.46, 47305.31, 44260.75, 41097.16, 43921.18, 33750.29, 17085.27, 24869.84, 36219.41, 46151.12, 17179.52, 42856.84, 22331.57, 48549.18, 47896.79, 42112.24, 16297.85, 21978.68, 38746.36, 24873.38, 42124.52, 34838.87, 35491.64, 42760.5, 47928.03, 48517.56, 24393.62, 41919.1, 13844.51, 36085.22, 18033.97, 21659.93, 38126.25, 15006.58, 42303.69, 19594.81, 14455.64, 18608.26, 28950.47, 46889.26, 46599.11, 39125.33, 37079.37, 26109.33, 22144.03, 19521.97, 25382.

In [21]:
# Display IQR, Standard Deviation and Outliers for smokers vs non smokers