# U.S. Medical Insurance Costs

In this project, we will be using Python fundamentals to explore a CSV file containing medical insurance costs. The objective is to analyze different attributes in the "insurance.csv" file, extract information about the patients, and discover potential applications for the dataset.

By examining this data, we aim to gain insights and better understand the various aspects of insurance costs and patient characteristics.

In [183]:
import csv
import numpy as np
import statistics

In [184]:
ages = []
sexes = []
bmis = []
num_children = []
smoker = []
regions = []
costs = []

In [185]:
def populate_list(list, source, column):
    with open(source, newline='') as file:
        csvreader = csv.DictReader(file)
        for row in csvreader:
            list.append(row[column])
        return list

Using the helper function populate_list() we can load up our lists with data.

In [186]:
populate_list(ages, 'insurance.csv', 'age')
populate_list(sexes, 'insurance.csv', 'sex')
populate_list(bmis, 'insurance.csv', 'bmi')
populate_list(num_children, 'insurance.csv', 'children')
populate_list(smoker, 'insurance.csv', 'smoker')
populate_list(regions, 'insurance.csv', 'region')
populate_list(costs, 'insurance.csv', 'charges')

['16884.924',
 '1725.5523',
 '4449.462',
 '21984.47061',
 '3866.8552',
 '3756.6216',
 '8240.5896',
 '7281.5056',
 '6406.4107',
 '28923.13692',
 '2721.3208',
 '27808.7251',
 '1826.843',
 '11090.7178',
 '39611.7577',
 '1837.237',
 '10797.3362',
 '2395.17155',
 '10602.385',
 '36837.467',
 '13228.84695',
 '4149.736',
 '1137.011',
 '37701.8768',
 '6203.90175',
 '14001.1338',
 '14451.83515',
 '12268.63225',
 '2775.19215',
 '38711',
 '35585.576',
 '2198.18985',
 '4687.797',
 '13770.0979',
 '51194.55914',
 '1625.43375',
 '15612.19335',
 '2302.3',
 '39774.2763',
 '48173.361',
 '3046.062',
 '4949.7587',
 '6272.4772',
 '6313.759',
 '6079.6715',
 '20630.28351',
 '3393.35635',
 '3556.9223',
 '12629.8967',
 '38709.176',
 '2211.13075',
 '3579.8287',
 '23568.272',
 '37742.5757',
 '8059.6791',
 '47496.49445',
 '13607.36875',
 '34303.1672',
 '23244.7902',
 '5989.52365',
 '8606.2174',
 '4504.6624',
 '30166.61817',
 '4133.64165',
 '14711.7438',
 '1743.214',
 '14235.072',
 '6389.37785',
 '5920.1041',
 '176

Next we will want to consider if cleaning data and reformatting is necessary. It looks like rounding BMI to one decimal and rounding costs to two decimals makes sense.

In [187]:
# clean bmis list, converting strings to float and rounding nearest decimal
bmis = [round(float(num), 1) for num in bmis]
# clean costs list, converting strings to float and rounding nearest 2 decimals
costs= [round(float(num), 2) for num in costs]

Sorting costs by different variables, starting with cost by region:

In [188]:
costs_by_bmi=list(zip(costs,bmis))
costs_by_age=list(zip(costs,ages))
costs_by_num_children=list(zip(costs,num_children))
costs_by_region=list(zip(regions,costs)) 

#Insurance costs sorted by region, I started with this because I suspected it might be particularly tricky.
regions_list=[]
for region in regions:
    if region not in regions_list:
        regions_list.append(region)
regions_list.sort()
print(regions_list)

costs_by_region_dict={region:[] for region in regions_list}

for patient in costs_by_region:
    for region in regions_list:
        if patient[0]==region:
            costs_by_region_dict[region].append(patient[1])

print(costs_by_region_dict)

#Could we identify regions which are the 'most expensive'?

['northeast', 'northwest', 'southeast', 'southwest']
{'northeast': [6406.41, 2721.32, 10797.34, 2395.17, 13228.85, 37701.88, 14451.84, 2198.19, 39774.28, 3046.06, 6079.67, 3393.36, 2211.13, 13607.37, 8606.22, 6799.46, 2755.02, 4441.21, 7935.29, 30184.94, 22412.65, 3645.09, 21344.85, 11488.32, 30260.0, 1705.62, 39556.49, 3385.4, 12815.44, 13616.36, 2457.21, 27375.9, 3490.55, 6334.34, 19964.75, 7077.19, 15518.18, 10407.09, 4827.9, 1694.8, 8538.29, 4005.42, 43753.34, 14901.52, 4337.74, 20984.09, 6610.11, 10564.88, 7358.18, 9225.26, 38511.63, 5354.07, 29523.17, 4040.56, 12829.46, 41097.16, 13047.33, 24869.84, 14590.63, 9282.48, 9617.66, 9715.84, 22331.57, 48549.18, 4237.13, 11879.1, 9432.93, 47896.79, 20277.81, 1704.57, 6746.74, 24873.38, 11944.59, 9722.77, 10435.07, 4667.61, 24671.66, 11566.3, 6600.21, 48517.56, 11658.38, 19144.58, 41919.1, 13217.09, 13981.85, 8334.46, 12404.88, 10043.25, 9778.35, 13430.26, 3481.87, 12029.29, 7639.42, 21659.93, 15006.58, 42303.69, 8302.54, 10736.87, 8964.

At least now we can find the average cost per region

In [189]:
def regional_average_cost(costs_by_region_dict,region):
    total_cost=0
    for cost in costs_by_region_dict[region]:
        total_cost+=float(cost)
    return round(total_cost/len(costs_by_region_dict[region]), 2)
#Testing the function
for region in regions_list:
    print('The average insurance cost in the {} region is'.format(region),regional_average_cost(costs_by_region_dict,region),'dollars')
    

The average insurance cost in the northeast region is 13406.38 dollars
The average insurance cost in the northwest region is 12417.58 dollars
The average insurance cost in the southeast region is 14735.41 dollars
The average insurance cost in the southwest region is 12346.94 dollars


Can we find the median cost per region?

In [190]:
# We could use the numpy module to reduce lines of code here
def regional_median_costs(costs_by_region_dict):
    median_per_region = {}
    for region in regions_list:
        sorted_costs = sorted(costs_by_region_dict[region])
        n = len(sorted_costs)
        middle = n // 2
        
        if n % 2 == 0:
            median = sorted_costs[middle - 1] + sorted_costs[middle] // 2
        else:
            median = sorted_costs[middle]
            
        median_per_region[region] = median
    return median_per_region
        
regional_median_costs = regional_median_costs(costs_by_region_dict)
print(regional_median_costs)

{'northeast': 15079.25, 'northwest': 8965.8, 'southeast': 13935.56, 'southwest': 8798.59}


Can we find the standard deviation and the IQR? (Comparing these will at least suggest existence of outliers in the dataset)

In [191]:
# Find standard deviation of each region using 'statistics' module
cost_values = {}
for region in regions_list:
    print(round(statistics.stdev(costs_by_region_dict[region])))
    

11256
11072
13971
11557


In [192]:
# Find IQR of each region
def calculate_iqr(region_costs):
    sorted_costs = sorted(region_costs)
    n = len(sorted_costs)
    q1 = sorted_costs[int(n * 0.25)]
    q3 = sorted_costs[int(n * 0.75)]
    iqr = q3 - q1
    return iqr

# Print mean and IQR along with absolute difference (non-negative number)
for region in regions_list:
    mean = round(statistics.stdev(costs_by_region_dict[region]))
    iqr = round(calculate_iqr(costs_by_region_dict[region]))
    print(region, {'median': mean, 'IQR': iqr, 'diff': abs(mean - iqr)})


northeast {'median': 11256, 'IQR': 11567, 'diff': 311}
northwest {'median': 11072, 'IQR': 9992, 'diff': 1080}
southeast {'median': 13971, 'IQR': 15090, 'diff': 1119}
southwest {'median': 11557, 'IQR': 8711, 'diff': 2846}
