# U.S. Medical Insurance Costs

This project works with the U.S. Medical Insurance Costs data from Kaggle.
Here I identify all the necessary variables for analysis and sort the data into individual categories. 

In [1]:
#import csv module to work with the csv data file:
import csv

In [2]:
# One empty list per CSV column; the loading cell further down fills them.
# Each name gets its own independent list object.
ages, sexes, bmis = [], [], []
childrens, smoker_status = [], []
regions, charges = [], []

In [3]:
#acquire the data from the csv file to the empty lists:
def data_acquisition(csv_file, data_name, data_list=None):
    """Append one column of a CSV file to a list and return that list.

    Parameters
    ----------
    csv_file : str or path-like
        Path of a CSV file whose first row holds the column names.
    data_name : str
        Name of the column to extract.
    data_list : list, optional
        List that receives the values. It is extended in place and its
        existing entries are kept, so passing an already-filled list appends
        to it. When omitted, a new list is created.

    Returns
    -------
    list
        ``data_list`` (or the newly created list) with one value per data
        row appended, in file order. Values are not converted; they are
        whatever ``csv.DictReader`` yields for that column.

    Raises
    ------
    KeyError
        If ``data_name`` is not one of the file's column names.
    """
    if data_list is None:
        data_list = []
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back exactly as written.
    with open(csv_file, newline='') as csv_data:
        for row in csv.DictReader(csv_data):
            data_list.append(row[data_name])
    return data_list

In [4]:
# Load every column of the data file into its list and print each list.
# data_acquisition appends to the list it is given, so each list is emptied
# first: without that, running this cell a second time would store every
# patient twice and inflate all later counts.
columns_to_load = [
    ('age', ages),
    ('sex', sexes),
    ('bmi', bmis),
    ('children', childrens),
    ('smoker', smoker_status),
    ('region', regions),
    ('charges', charges),
]
for column_name, column_values in columns_to_load:
    column_values.clear()
    print(data_acquisition('insurance.csv', column_name, column_values))

['19', '18', '28', '33', '32', '31', '46', '37', '37', '60', '25', '62', '23', '56', '27', '19', '52', '23', '56', '30', '60', '30', '18', '34', '37', '59', '63', '55', '23', '31', '22', '18', '19', '63', '28', '19', '62', '26', '35', '60', '24', '31', '41', '37', '38', '55', '18', '28', '60', '36', '18', '21', '48', '36', '40', '58', '58', '18', '53', '34', '43', '25', '64', '28', '20', '19', '61', '40', '40', '28', '27', '31', '53', '58', '44', '57', '29', '21', '22', '41', '31', '45', '22', '48', '37', '45', '57', '56', '46', '55', '21', '53', '59', '35', '64', '28', '54', '55', '56', '38', '41', '30', '18', '61', '34', '20', '19', '26', '29', '63', '54', '55', '37', '21', '52', '60', '58', '29', '49', '37', '44', '18', '20', '44', '47', '26', '19', '52', '32', '38', '59', '61', '53', '19', '20', '22', '19', '22', '54', '22', '34', '26', '34', '29', '30', '29', '46', '51', '53', '19', '35', '48', '32', '42', '40', '44', '48', '18', '30', '50', '42', '18', '54', '32', '37', '47', '20

In [5]:
# The patient count is the number of entries in the age column.
num_data = len(ages)
print('Total number of patients surveyed: ', num_data, sep='')

Total number of patients surveyed: 1338


In [6]:
#add class for data analysis
class Patients_Data:
    """Summary statistics over the columns of the medical insurance data.

    Each constructor argument is one column of the data set, given as a
    sequence with one entry per patient. Numeric columns may hold strings;
    they are converted with ``int``/``float`` when a statistic is computed.
    Every ``*_info`` method prints its result and returns ``None``.
    """

    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        """Store the seven data columns on the instance (no copies are made)."""
        self.age = age
        self.sex = sex
        self.bmi = bmi
        self.children = children
        self.smoker = smoker
        self.region = region
        self.charges = charges

    @staticmethod
    def _mean(values, convert):
        """Return the mean of ``convert(value)`` over ``values``, or ``None`` if empty.

        The divisor is the number of values actually seen, so the result
        depends only on the data passed in, not on any outside variable.
        """
        total = 0
        count = 0
        for value in values:
            total += convert(value)
            count += 1
        if count == 0:
            return None
        return total / count

    def age_info(self):
        """Print the average of ``self.age``, rounded to a whole number of years."""
        average_age = self._mean(self.age, int)
        if average_age is None:
            print('Average age of surveyed patients: no data')
            return
        # round(x, 0) returns a float, so the value prints with a trailing ".0".
        print('Average age of surveyed patients: ' + str(round(average_age, 0)) + ' years old')

    def sex_info(self):
        """Print how many entries of ``self.sex`` are 'male' and how many 'female'."""
        num_male = 0
        num_female = 0
        for sex in self.sex:
            if sex == 'male':
                num_male += 1
            elif sex == 'female':
                num_female += 1
        print('Number of male patients: ' + str(num_male))
        print('Number of female patients: ' + str(num_female))

    def bmi_info(self):
        """Print the number of patients in each BMI range.

        Ranges: below 18.5, 18.5 up to (not including) 25, 25 up to
        (not including) 30, and 30 or above.
        """
        num_underweight = 0
        num_healthyweight = 0
        num_overweight = 0
        num_obese = 0
        for bmi in self.bmi:
            value = float(bmi)  # parse once instead of once per comparison
            if value < 18.5:
                num_underweight += 1
            elif value < 25:
                num_healthyweight += 1
            elif value < 30:
                num_overweight += 1
            elif value >= 30:
                # Explicit test rather than a bare else so that a NaN value
                # is counted in no range at all.
                num_obese += 1
        print('Underweight patients: ' + str(num_underweight))
        print('Healthy weight patients: ' + str(num_healthyweight))
        print('Overweight patients: ' + str(num_overweight))
        print('Obese patients: ' + str(num_obese))

    def smoker_info(self):
        """Print how many entries of ``self.smoker`` are 'yes' and how many 'no'."""
        num_smoker = 0
        num_nonsmoker = 0
        for smoker in self.smoker:
            if smoker == 'yes':
                num_smoker += 1
            elif smoker == 'no':
                num_nonsmoker += 1
        print('Number of smokers: ' + str(num_smoker))
        print('Number of non-smokers: ' + str(num_nonsmoker))

    def region_info(self):
        """Print the number of distinct regions and list them in first-seen order."""
        unique_regions = []
        for region in self.region:
            if region not in unique_regions:
                unique_regions.append(region)
        print('Number of different regions: ' + str(len(unique_regions)))
        print('All surveyed patients came from: ' + str(unique_regions))

    def charges_info(self):
        """Print the average of ``self.charges``, rounded to two decimals."""
        average_charge = self._mean(self.charges, float)
        if average_charge is None:
            print('Average charge for health insurance: no data')
            return
        print('Average charge for health insurance: ' + str(round(average_charge, 2)) + ' dollars')

In [7]:
#run the summary statistics

# One instance is enough: every method reads only the column it needs, and
# all six original objects were built from the same seven lists.
patients_summary = Patients_Data(ages, sexes, bmis, childrens, smoker_status, regions, charges)

# The per-statistic names are kept as aliases of that single instance so any
# other cell that refers to them keeps working.
patients_avg_age = patients_sex_info = patients_bmi_info = patients_summary
patients_smoker_info = patients_region_info = patients_charge_info = patients_summary

#print each statistic
patients_summary.age_info()
patients_summary.sex_info()
patients_summary.bmi_info()
patients_summary.smoker_info()
patients_summary.region_info()
patients_summary.charges_info()

Average age of surveyed patients: 39.0 years old
Number of male patients: 676
Number of female patients: 662
Underweight patients: 20
Healthy weight patients: 225
Overweight patients: 386
Obese patients: 707
Number of smokers: 274
Number of non-smokers: 1064
Number of different regions: 4
All surveyed patients came from: ['southwest', 'southeast', 'northwest', 'northeast']
Average charge for health insurance: 13270.42 dollars


In [8]:
#create a dictionary to store the information of each individual patient

#change the smoker status from yes/no to smoker/non-smoker for easy access
smoker_labels = {'yes': 'smoker', 'no': 'non-smoker'}
# A value that is neither 'yes' nor 'no' is kept unchanged instead of being
# skipped, so this list always lines up index by index with the other columns.
smoker_status_new = [smoker_labels.get(smoker, smoker) for smoker in smoker_status]

# Every column must describe the same patients; fail loudly if they do not.
patient_columns = [ages, sexes, bmis, childrens, smoker_status_new, regions, charges]
assert len({len(column) for column in patient_columns}) == 1, 'columns have different lengths'

#create an id for each patient (1-based, sized from the data rather than a
#hard-coded row count) and map each id, as a string, to that patient's values
ids = list(range(1, len(ages) + 1))
patients_data = {}
for patient_id, record in zip(ids, zip(*patient_columns)):
    patients_data[str(patient_id)] = list(record)
print(patients_data)

{'1': ['19', 'female', '27.9', '0', 'smoker', 'southwest', '16884.924'], '2': ['18', 'male', '33.77', '1', 'non-smoker', 'southeast', '1725.5523'], '3': ['28', 'male', '33', '3', 'non-smoker', 'southeast', '4449.462'], '4': ['33', 'male', '22.705', '0', 'non-smoker', 'northwest', '21984.47061'], '5': ['32', 'male', '28.88', '0', 'non-smoker', 'northwest', '3866.8552'], '6': ['31', 'female', '25.74', '0', 'non-smoker', 'southeast', '3756.6216'], '7': ['46', 'female', '33.44', '1', 'non-smoker', 'southeast', '8240.5896'], '8': ['37', 'female', '27.74', '3', 'non-smoker', 'northwest', '7281.5056'], '9': ['37', 'male', '29.83', '2', 'non-smoker', 'northeast', '6406.4107'], '10': ['60', 'female', '25.84', '0', 'non-smoker', 'northwest', '28923.13692'], '11': ['25', 'male', '26.22', '0', 'non-smoker', 'northeast', '2721.3208'], '12': ['62', 'female', '26.29', '0', 'smoker', 'southeast', '27808.7251'], '13': ['23', 'male', '34.4', '0', 'non-smoker', 'southwest', '1826.843'], '14': ['56', 'fem