# U.S. Medical Insurance Costs

### Look over data in csv file.
#### Import csv for parsing and saving data to lists

In [63]:
# import python csv module to read and write tabular data in CSV format
import csv

In [367]:
# create empty list to store dict rows of medical insurance data
medical_insurance_data = []

# open csv file and parse data with Dictreader
with open('insurance.csv', newline='') as insurance_data:
    reader = csv.DictReader(insurance_data)
    for row in reader:
        medical_insurance_data.append(row)

# sort data by lowest cost to highest       
medical_insurance_data = sorted(medical_insurance_data, key=lambda y: round(float(y['charges'])))        
                      
# Almost created a bunch of lists to store corresponding data, then got bored and created global 
# lists based on keys, probably not supposed to do that.        
for item in medical_insurance_data:
    for key, value in item.items():
        # try to append to a list if exists, if not handle error and create list
        try:
            globals()[key].append(value)
        except:
            globals()[key] = [value]

In [366]:
# list of keys
for v in medical_insurance_data[0].keys():
    print(f'{v} = []')

age = []
sex = []
bmi = []
children = []
smoker = []
region = []
charges = []


In [133]:
# code illustrating that lists were succesfully created
print(age[0], sex[0], bmi[0], children[0], smoker[0], region[0], charges[0])
print(medical_insurance_data[0])

19 female 27.9 0 yes southwest 16884.924
{'age': '18', 'sex': 'male', 'bmi': '23.21', 'children': '0', 'smoker': 'no', 'region': 'southeast', 'charges': '1121.8739'}


### Brainstorm some possible ideas for analysis.

- Look at average cost and region.
- Look at correlation between age and smoker.
- Look at bmi, see if there are any correlations between bmi and smoker.
- Cost based on gender, age, smoker.
- Find out the average age of the patients in the dataset.
- Analyze where a majority of the individuals are from.
- Look at the different costs between smokers vs. non-smokers.
- Figure out what the average age is for someone who has at least one child in this dataset.

In [134]:
# method for summing up the total of charges after changing string into float
def charges_total(charges):
    total_charges = 0
    for value in charges:
        total_charges += float(value)
    return total_charges

In [663]:
# method for finding average cost of insurance

def average_insurance_cost(charges):
    return round(charges_total(charges) / len(charges), 2)
        
average_insurance_cost(charges)

print(f'''
The average cost of insurance is ${average_insurance_cost(charges)} from {len(charges)} customers. This does not
take into account the outliers.
''')


The average cost of insurance is $13270.42 from 10704 customers. This does not
take into account the outliers.



In [107]:
# Find average cost of health insurace for male vs female to see if there is a bias  

def average_gender_cost(charges, sex):
    
    gender_dataset = {
        "male": {"total": 0, "total_charge": 0}, 
        "female": {"total": 0, "total_charge": 0}, 
    }
    
    for index, gender in enumerate(sex):
        gender_dataset[gender]['total'] += 1
        gender_dataset[gender]['total_charge'] += float(charges[index])
    
    male_total, male_total_charge = gender_dataset['male'].values()
    female_total, female_total_charge = gender_dataset['female'].values()
    
    gender_dataset.get('male')["male_average_cost"] = male_total_charge / male_total
   
    gender_dataset.get('female')["female_average_cost"] = female_total_charge / female_total
    
    return gender_dataset
        
male, female = average_gender_cost(charges, sex).values() 

male_total, male_total_charge, male_average_cost = male.values()
female_total, female_total_charge, female_average_cost = female.values()

print(f'''
The average cost for being male is {round(male_average_cost, 2)} from {male_total} males
The average cost for being female is {round(female_average_cost, 2)} from {female_total} females
''')
    


The average cost for being male is 13956.75 from 2028 males
The average cost for being female is 12569.58 from 1986 females



In [150]:
# find most expensive region 
def most_expensive_region(medical_insurance_data):
    charges_by_region = {}
    for row in medical_insurance_data:
        if row['region'] in charges_by_region:
            charges_by_region[row['region']]['charges'] += round(float(row['charges']))
            charges_by_region[row['region']]['count'] += 1
        else: 
            charges_by_region[row['region']] = {'charges': float(row['charges']), 'count': 1}
            
    return charges_by_region
           
most_expensive_region(medical_insurance_data)



{'southeast': {'charges': 5363690.8739, 'count': 364},
 'southwest': {'charges': 4012752.26, 'count': 325},
 'northwest': {'charges': 4035713.3402, 'count': 325},
 'northeast': {'charges': 4343658.796399999, 'count': 324}}

In [92]:
def smoker_correlations(smoker, bmi, charges):
    '''Function to find if smoking and bmi have any correlation using smoker, bmi, charges lists'''
#   create dictionaries to hold information on smokers and non smokers
    smoker_totals = {"smoker_cost": 0, "smoker_bmi": 0, "smoker_count": 0}
    non_smoker_totals = {"non_smoker_cost": 0, "non_smoker_bmi": 0, "non_smoker_count": 0}
    
#   iterate over lists and collect data
    for index, item in enumerate(smoker):
        if item == 'yes':
            smoker_totals['smoker_cost'] += float(charges[index])
            smoker_totals['smoker_bmi'] += float(bmi[index])
            smoker_totals['smoker_count'] += 1
        else:
            non_smoker_totals['non_smoker_cost'] += float(charges[index])
            non_smoker_totals['non_smoker_bmi'] += float(bmi[index])
            non_smoker_totals['non_smoker_count'] += 1
    
    st = smoker_totals
    nst = non_smoker_totals
    
    return st, nst
  
st, nst = smoker_correlations(smoker, bmi, charges)

# unpack and assign
smoker_cost, smoker_bmi, smoker_count = st.values()
non_smoker_cost, non_smoker_bmi, non_smoker_count = nst.values()

print(f'''
Although there is a large increase in cost with smoking, from this data there doesn't appear to be an association
between BMI and smoking.
Out of {smoker_count} smokers the average bmi is {round(smoker_bmi / smoker_count)} with an average insurance cost of {round(smoker_cost / smoker_count)}.
Out of {non_smoker_count} non-smokers the average bmi is {round(non_smoker_bmi / non_smoker_count)} with an average insurance cost of {round(non_smoker_cost / non_smoker_count)}.
Smoking does not affect BMI, Smoking does affect insurance cost presumably due to its toxic nature and the assumption
that smokers will require more medical attention in their lifetime.
The smoking population makes up %{round(smoker_cost / charges_total(charges) * 100)} of the total cost.
''')



Although there is a large increase in cost with smoking from this data there doesnt appear to be an association
between BMI and smoking.
Out of 822 smokers the average bmi is 31 with an average insurance cost of 32050.
Out of 3192 non-smokers the average bmi is 31 with an average insurance cost of 8434.
Smoking does not affect BMI, Smoking does affect insurance cost presumably due to its toxic nature and the assumption
that smokers will require more medical attention in their lifetime.
The smoking population makes up %49 of the total cost.



In [667]:
def base_costs_by_region_and_bmi(reg, sex):
    
    bmi_constants = []
    base_amounts = []
    for num in range(18, 60):
        lst = filter(lambda x: (x['smoker'] == 'no' 
                          and x['age'] == str(num) 
                          and x['children'] == '0' 
                          and x['sex'] == sex
                          and x['region'] == reg), medical_insurance_data)
    
    
        srt_lst = sorted([*lst], key=lambda y: round(float(y['charges'])))
        
        if len(srt_lst) > 1:
            first_charges = float(srt_lst[0]['charges'])
            last_charges = float(srt_lst[1]['charges'])
            first_bmi = float(srt_lst[0]['bmi'])
            last_bmi = float(srt_lst[1]['bmi'])
            
            bmi_increase = round((last_charges - first_charges)/(last_bmi - first_bmi), 5)
            bmi_constants.append(bmi_increase)
            base_amount = first_charges - (first_bmi * bmi_increase)
            base_amounts.append(base_amount)
            
    return reg, sorted(bmi_constants)[round(len(bmi_constants)/2)], base_amounts[0]


def organize_bmi_data(sex):    
    regions = set(region)
    data = {}
    for area in regions:
        reg, constant, base = base_costs_by_region_and_bmi(area,sex)
        data[reg] = {"constant": constant, "base": base}
    return data

male_organized_data = organize_bmi_data('male')
print(male_organized_data)
mswadd = round(20.0 * organized_data['southeast']['constant'],2)
mswbase = organized_data['southeast']['base']

female_organized_data = organize_bmi_data('female')
print(female_organized_data)
fswadd = round(20.0 * organized_data['southeast']['constant'],2)
fswbase = organized_data['southeast']['base']

print(f'''
This algorithm concludes with charge increase constant for BMI with the only other factor being region, the bmi 
constant of {bmi_constant[0]} per 0.1 increase in BMI. For example an 18 year old with a BMI of 20.0 with have an
additional {mswadd} for a total of charge of {mswadd + mswbase}. Furthermore what this data also concludes is an 
difference in base charge by region, the northeast: {round(organized_data['northeast']['base'], 2)} being the most
while the southeast: {round(organized_data['southeast']['base'], 2)} being the least, overall the north showing higher
base charges overall, one could study into economics of these areas to better understand why.
''')

{'southwest': {'constant': 1.39, 'base': 1214.043}, 'northwest': {'constant': 1.39, 'base': 1597.0430000000001}, 'northeast': {'constant': 1.39, 'base': 1672.6119999999999}, 'southeast': {'constant': 1.39, 'base': 1089.612}}
{'southwest': {'constant': 1.39, 'base': 1703.0430000000001}, 'northwest': {'constant': 1.39, 'base': 2086.043}, 'northeast': {'constant': 1.39, 'base': 2161.612}, 'southeast': {'constant': 1.39, 'base': 1578.612}}

This algorithm concludes with charge increase constant for BMI with the only other factor being region, the bmi 
constant of 1.39 per 0.1 increase in BMI. For example an 18 year old with a BMI of 20.0 with have an
additional 27.8 for a total of charge of 1117.412. Furthermore what this data also concludes is an 
difference in base charge by region, the northeast: 1672.61 being the most
while the southeast: 1089.61 being the least, overall the north showing higher
base charges overall, one could study into economics of these areas to better understand wh