# U.S. Medical Insurance Costs

#### Requirements
 * Create a Patient Class with methods that return their information
 * Create a Patient Dictionary

#### Goals

1. What is the average age of the patients in this dataset?
2. What is the difference in average insurance costs between:
    * males & females
    * smokers & non-smokers
    * age groups
    * regions
3. Which region has the highest average BMI?
4. What is the average insurance cost by number of children?

#### Data Prep

In [65]:
# Import and load columns as Python lists

import csv

age = []
sex = []
bmi = []
children = []
smoker = []
region = []
charges = []
data_list = [age, sex, bmi, children, smoker, region, charges]

# opening insurance.csv file
with open('insurance.csv') as ins_data:
    # reading insurance.csv
    ins_data_output = csv.DictReader(ins_data)
    # iterating through rows and appending row data to each parameter list
    for data in ins_data_output:
        age.append(int(data['age']))
        sex.append(data['sex'])
        bmi.append(float(data['bmi']))
        children.append(data['children'])
        smoker.append(data['smoker'])
        region.append(data['region'])
        charges.append(float(data['charges']))

# Total number of rows: 1338
print("Total number of rows:", len(age))

#Check type
for data in data_list:
    print(type(data[0]))

# 1 int; 5 str; 1 float


Total number of rows: 1338
<class 'int'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>


##### Create Patient Class

Creating a Patient class with the following methods to easily retrieve information:
 - `average_ages()`
 - `analyze_sexes()`
 - `analyze_regions()`
 - `average_charges()`
 - `analyze_smokers()`
 - `cost_by_children()`

In [66]:
class Patient:
    # initializing class and adding in parameters
    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        self.age = age
        self.sex = sex
        self.bmi = bmi
        self.children = children
        self.smoker = smoker
        self.region = region
        self.charges = charges
    
    # finding the average age of patients in the dataset
    def average_ages(self):
        # set the total amount to 0
        total_age = 0
        # iterate through the ages in the age list
        for age in self.age:
            # add each age to the total
            total_age += age
        # assigning average age to a variable
        average_age = total_age / len(self.age)
        # return average age
        return "Patient average age: {:.0f}".format(average_age)
    
    # finding the total number of males and females
    def analyze_sexes(self):
        # initializing counters as 0
        males = 0
        females = 0
        # iterating through data in the sex list
        for sex in self.sex:
            # increasing counter for each sex
            if sex == 'male':
                males += 1
            else:
                females += 1
        # calculating for the percentage share of each sex
        males_perc = (males / (males + females)) * 100
        females_perc = (females / (males + females)) * 100
        # printing the total count and percentage share for males and females
        return "The number of males are: {}, or {:.1f}% of the patients\nThe number of females are: {}, or {:.1f}% of the patients".format(males, males_perc, females, females_perc)

    # finding the total number of patients in each region
    def analyze_regions(self):
        # initializing counters as dictionary keys
        region_dict = {'southwest': 0, 'southeast': 0, 'northwest': 0, 'northeast': 0}
        # iterating through rows in the region list
        for reg in self.region:
            # creating condition to find the data's matching key in region_dict
            if reg in region_dict.keys():
                # adding to appropriate key counter
                region_dict[reg] += 1
        # return dictionary with total amounts
        return "The breakdown per region is as follows: " + str(region_dict)

    # finding the average insurance cost for the dateset
    def average_charges(self):
        # set the total amount to 0
        total_charges = 0
        # iterate through each charge in charges list
        for charge in self.charges:
            # add each cost to the total amount
            total_charges += charge
        # calculate for average insurance cost
        average_charges = total_charges / len(self.charges)
        # return average cost
        return "Patient average insurance cost: {:.2f}".format(average_charges)

        
    # create method to count number of smokers and non-smokers
    def analyze_smokers(self):
        # initialize counters to 0
        smokers = 0
        non_smokers = 0
        # iterate through each patient in the smoker list
        for patient in self.smoker:
            # create conditional to update the appropriate counters
            if patient == 'yes':
                smokers += 1
            else:
                non_smokers += 1
        dict = {'Smokers': smokers, 'Non-smokers': non_smokers}
        # return results as total number of smokers and non-smokers
        return dict

    # finding average insurance cost using number of children as categories 
    def cost_by_children(self):
        # extracting unique values using set()
        category = sorted(set(self.children))
        # creating dictionary with number of children as keys and count, total insurance cost as values
        dict = {x: [0, 0] for x in category}
        # iterating through the children list to update dictionary counter
        for children, charge in zip(self.children, self.charges):
            dict[children][0] += 1
            dict[children][1] += int(charge)
        # iterating through dictionary values to get average
        for values in dict.keys():
            dict[values] = '$' + str(int(dict[values][1] / dict[values][0]))
        # return total for each number of children
        return dict
        





#### Analysis


In [67]:
# Instantiate the Patient class
patient_info = Patient(age, sex, bmi, children, smoker, region, charges)


Average Age


In [68]:
# Using class method to find the average age of patients in the insurance.csv dataset

patient_info.average_ages()


'Patient average age: 39'

Difference in average insurance costs between males and females

In [69]:
# Determining the gender distribution in the dataset

print(patient_info.analyze_sexes())
# Gender ratio is almost 50-50

# Initializing variables required for analysis
male_count = 0
female_count = 0
male_charges = 0
female_charges = 0

# Iterating through the sex and charges list, counting by category and charges
for x, y in zip(sex, charges):
    if x == 'male':
        male_count += 1
        male_charges += y
    else:
        female_count += 1
        female_charges += y

# Calculating average amounts for each sex
avg_males = male_charges / male_count
avg_females = female_charges / female_count

print("Average insurance cost for males: ${:.2f}".format(avg_males))
print("Average insurance cost for females: ${:.2f}".format(avg_females))

# Calculating difference
if avg_males > avg_females:
    print("Average insurance costs for males are higher by ${:.2f}.".format(avg_males - avg_females))
elif avg_females == avg_males:
    print("Average insurance costs for males and females are equal.")
else:
    print("Average insurance costs for females are higher by ${:.2f}.".format(avg_females - avg_males))


The number of males are: 676, or 50.5% of the patients
The number of females are: 662, or 49.5% of the patients
Average insurance cost for males: $13956.75
Average insurance cost for females: $12569.58
Average insurance costs for males are higher by $1387.17.


Difference in average insurance costs between smokers and non-smokers

In [70]:
# Use class method to count total number of smokers and non-smokers
patient_info.analyze_smokers()


{'Smokers': 274, 'Non-smokers': 1064}

Difference in average insurance costs between age groups

In [71]:
# Create a function that takes group size as an argument and returns age groups from 0-100 
def create_age_group(size):
    # Create list 0-100 incremented by size
    group = list(range(0, 101, size))
    age_group = []
    #iterate through list and append the age groups
    for i in range(len(group) - 1):
        age_group.append(range(group[i] + 1, group[i + 1] + 1))
    if 100 % size != 0:
        age_group.append(range(100 - (100 % size), 101))
    age_group_dict = {x: 0 for x in age_group}
    return age_group_dict

create_age_group(20)

{range(1, 21): 0,
 range(21, 41): 0,
 range(41, 61): 0,
 range(61, 81): 0,
 range(81, 101): 0}

Age groups: size 10

In [72]:
age_group = create_age_group(10)

age_group_count = age_group

# Get total counts per age group
for x in age:
    for y in age_group_count.keys():
        if x in y:
            age_group_count[y] += 1

# Get total charges per age group
age_group_cost = age_group

for x, y in zip(age, charges):
    for z in age_group_count.keys():
        if x in z:
            age_group_cost[z] += int(y)

# Get average
avg_cost = {x[0]: int(x[1] / y[1]) for x, y in zip(age_group_cost.items(), age_group_count)}

avg_cost

{range(1, 11): 0,
 range(11, 21): 120543,
 range(21, 31): 123918,
 range(31, 41): 93482,
 range(41, 51): 98902,
 range(51, 61): 86954,
 range(61, 71): 30916,
 range(71, 81): 0,
 range(81, 91): 0,
 range(91, 101): 0}

Age groups: size 15

In [73]:
# Using the same for loops as above
age_group = create_age_group(15)

age_group_count = age_group

# Get total counts per age group
for x in age:
    for y in age_group_count.keys():
        if x in y:
            age_group_count[y] += 1

# Get total charges per age group
age_group_cost = age_group

for x, y in zip(age, charges):
    for z in age_group_count.keys():
        if x in z:
            age_group_cost[z] += int(y)

# Get average
avg_cost = {x[0]: int(x[1] / y[1]) for x, y in zip(age_group_cost.items(), age_group_count)}

avg_cost

{range(1, 16): 0,
 range(16, 31): 245454,
 range(31, 46): 155728,
 range(46, 61): 142205,
 range(61, 76): 30916,
 range(76, 91): 0,
 range(90, 101): 0}

Difference in average insurance costs between regions

In [74]:
# Using class method, get the total count of patients per region
patient_info.analyze_regions()

region_count = {'southwest': 325, 'southeast': 364, 'northwest': 325, 'northeast': 324}

region_charges = {'southwest': 0, 'southeast': 0, 'northwest': 0, 'northeast': 0}
for reg, charge in zip(region, charges):
    region_charges[reg] += charge

region_avg = {'southwest': 0, 'southeast': 0, 'northwest': 0, 'northeast': 0}

for x, y in zip(region_charges.items(), region_count.values()):
    region_avg[x[0]] = int(x[1] // y)

region_avg

{'southwest': 12346,
 'southeast': 14735,
 'northwest': 12417,
 'northeast': 13406}

Region with highest average BMI

In [75]:
# Used variale region_count from previous analysis to determine total BMIs per region
region_bmis = {'southwest': 0, 'southeast': 0, 'northwest': 0, 'northeast': 0}
for reg, bmis in zip(region, bmi):
    region_bmis[reg] += bmis

avg_bmis = {'southwest': 0, 'southeast': 0, 'northwest': 0, 'northeast': 0}

for x, y in zip(region_bmis.items(), region_count.values()):
    avg_bmis[x[0]] = int(x[1] // y)

#Region with highest average BMI is the Southeast region
avg_bmis

{'southwest': 30, 'southeast': 33, 'northwest': 29, 'northeast': 29}

Average insurance cost by number of children

In [76]:
patient_info.cost_by_children()

{'0': '$12365',
 '1': '$12730',
 '2': '$15073',
 '3': '$15354',
 '4': '$13850',
 '5': '$8785'}