# U.S. Medical Insurance Costs
In this project, a CSV file with medical insurance costs will be investigated using Python fundamentals. The goal of this project will be to analyse various attributes within **insurance.csv** to learn more about the patient information in the file and gain insight into potential use cases for the dataset.

In [5]:
# import csv library
import csv

There are other potential libraries that could help with this project; however, for this analysis, I will only be using the csv module.

**insurance.csv** contains the following columns:
* Patient Age
* Patient Sex 
* Patient BMI
* Patient Number of Children
* Patient Smoking Status
* Patient U.S. Geographical Region
* Patient Yearly Medical Insurance Cost

There are no signs of missing data.

In [9]:
# Loading data from insurance.csv to Python using a function
def load_csv_data(csv_file, numeric_columns=('age', 'bmi', 'children', 'charges')):
    """Load a CSV file into a dictionary of column lists.

    Args:
        csv_file: Path of the CSV file to read; its first row is the header.
        numeric_columns: Names of the columns whose values are converted to
            ``float``. Names that are not in the header are ignored.

    Returns:
        A dict mapping each header name to the list of that column's values,
        in file order. A file with no header row yields an empty dict.

    Raises:
        ValueError: If a value in a numeric column is not a valid float.
    """
    with open(csv_file, newline='') as csv_info:
        reader = csv.DictReader(csv_info)
        # fieldnames is None when the file has no header row at all
        fieldnames = reader.fieldnames or []

        # one empty list per column, then fill them row by row
        data_dict = {column: [] for column in fieldnames}
        for row in reader:
            for column in fieldnames:
                data_dict[column].append(row[column])

    # every row has been read, so the conversion runs after the file is closed
    for column in numeric_columns:
        if column in data_dict:  # skip numeric names this file does not have
            data_dict[column] = [float(x) for x in data_dict[column]]

    return data_dict
                
# Read the dataset once, then bind each column's list to its own name
insurance_data = load_csv_data('insurance.csv')

(
    ages,
    sexes,
    bmis,
    num_children,
    smoker_statuses,
    regions,
    insurance_charges,
) = (
    insurance_data[column]
    for column in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges')
)

# Classes and Functions

## Class with Descriptive Statistics
Finding summary statistics

In [13]:
class DescriptiveStats:
    """Average, minimum, maximum and median of one numeric column.

    ``data`` is the sequence of numbers to summarise and ``name`` is the
    label used in the printed report, e.g. 'Age' or 'BMI'. Every statistic
    raises ``ValueError`` when ``data`` is empty.
    """

    def __init__(self, data, name):
        self.data = data
        self.name = name  # label used in the report, e.g. 'Age', 'BMI'

    def _require_data(self):
        """Raise ValueError when there is nothing to summarise."""
        if len(self.data) == 0:
            raise ValueError(f"Cannot compute statistics for {self.name}: no data")

    def find_avg(self):
        """Return the arithmetic mean of the data."""
        self._require_data()
        return sum(self.data) / len(self.data)

    def find_min(self):
        """Return the smallest value in the data."""
        self._require_data()
        return min(self.data)

    def find_max(self):
        """Return the largest value in the data."""
        self._require_data()
        return max(self.data)

    def find_median(self):
        """Return the middle value, or the mean of the two middle values."""
        self._require_data()
        sorted_data = sorted(self.data)
        mid = len(sorted_data) // 2
        if len(sorted_data) % 2 == 0:
            # even count: no single middle element, so average the two around it
            return (sorted_data[mid - 1] + sorted_data[mid]) / 2
        return sorted_data[mid]

    def report(self):
        """Print the four statistics; only the average is rounded (2 decimals)."""
        avg = self.find_avg()
        min_value = self.find_min()
        max_value = self.find_max()
        median = self.find_median()

        print(f"Report for {self.name} descriptive statistics: ")
        print(f"Average {self.name}: {avg:.2f}")  # 2 decimal places
        print(f"Minimum {self.name}: {min_value}")
        print(f"Maximum {self.name}: {max_value}")
        print(f"Median {self.name}: {median}")

# One DescriptiveStats object per numeric column of the dataset
age_stats = DescriptiveStats(ages, "Age")
bmi_stats = DescriptiveStats(bmis, "BMI")
children_stats = DescriptiveStats(num_children, "Number of Children")
charges_stats = DescriptiveStats(insurance_charges, "Insurance Charges")

# Print the reports in the same order the objects were created
for column_stats in (age_stats, bmi_stats, children_stats, charges_stats):
    column_stats.report()

Report for Age descriptive statistics: 
Average Age: 39.21
Minimum Age: 18.0
Maximum Age: 64.0
Median Age: 39.0
Report for BMI descriptive statistics: 
Average BMI: 30.66
Minimum BMI: 15.96
Maximum BMI: 53.13
Median BMI: 30.4
Report for Number of Children descriptive statistics: 
Average Number of Children: 1.09
Minimum Number of Children: 0.0
Maximum Number of Children: 5.0
Median Number of Children: 1.0
Report for Insurance Charges descriptive statistics: 
Average Insurance Charges: 13270.42
Minimum Insurance Charges: 1121.8739
Maximum Insurance Charges: 63770.42801
Median Insurance Charges: 9382.033


## Class with Groups 
To analyze how different factors influence insurance costs, we group patients by age, sex, smoking status, region, and number of children.

In [16]:
class Groups:
    """Split one column of patient data into labelled groups.

    Each ``group_by_*`` method returns a dict mapping a group label to the
    list of original values from ``data`` that belong to that group. A value
    that matches none of the labelled groups is left out instead of being
    pushed into the last group. Text values are matched case-insensitively,
    ignoring surrounding whitespace.
    """

    # (label, inclusive lower bound, exclusive upper bound)
    _AGE_BANDS = (
        ("18-19", 18, 20),
        ("20-29", 20, 30),
        ("30-39", 30, 40),
        ("40-49", 40, 50),
        ("50-59", 50, 60),
        ("60+", 60, float("inf")),
    )

    # normalised raw value -> group label, in report order
    _SEX_LABELS = {"female": "Female", "male": "Male"}
    _SMOKER_LABELS = {"yes": "Smoker", "no": "Non-Smoker"}
    _REGION_LABELS = {
        "northeast": "NorthEast",
        "northwest": "NorthWest",
        "southeast": "SouthEast",
        "southwest": "SouthWest",
    }

    def __init__(self, data, name):
        self.data = data
        self.name = name  # label used in the report heading

    def _group_by_label(self, labels):
        """Bucket the data by looking each normalised value up in ``labels``."""
        groups = {label: [] for label in labels.values()}
        for value in self.data:
            label = labels.get(str(value).strip().lower())
            if label is not None:
                groups[label].append(value)  # keep the value as it appeared
        return groups

    def group_by_age(self):
        """Group ages into the bands 18-19, 20-29, ..., 50-59 and 60+.

        Bands are half-open (e.g. 20 <= age < 30), so a fractional age such
        as 19.5 falls in "18-19". Ages below 18 are not placed in any band.
        """
        age_groups = {label: [] for label, _, _ in self._AGE_BANDS}
        for age in self.data:
            for label, lower, upper in self._AGE_BANDS:
                if lower <= age < upper:
                    age_groups[label].append(age)
                    break
        return age_groups

    def group_by_sex(self):
        """Group values into "Female" and "Male"."""
        return self._group_by_label(self._SEX_LABELS)

    def group_by_num_children(self):
        """Group child counts into "0", "1", "2" and "3+".

        Counts that are negative or not a whole number below 3 are left out.
        """
        children_groups = {"0": [], "1": [], "2": [], "3+": []}
        for num_children in self.data:
            if num_children >= 3:
                children_groups["3+"].append(num_children)
            elif num_children in (0, 1, 2):
                # 1.0 and 1 both map to the key "1"
                children_groups[str(int(num_children))].append(num_children)
        return children_groups

    def group_by_smoker_status(self):
        """Group "yes" values under "Smoker" and "no" values under "Non-Smoker"."""
        return self._group_by_label(self._SMOKER_LABELS)

    def group_by_region(self):
        """Group values into NorthEast, NorthWest, SouthEast and SouthWest."""
        return self._group_by_label(self._REGION_LABELS)

    def report(self, group_function):
        """Call ``group_function`` and print the size of every group it returns.

        Args:
            group_function: A zero-argument callable returning a dict of
                label -> list, e.g. one of this object's ``group_by_*`` methods.
        """
        grouped_data = group_function()
        print(f"\nGrouping Report for {self.name}:")
        for group, items in grouped_data.items():
            print(f"{group}: {len(items)} people")

# For each column: build a Groups object, keep the grouped dictionary for
# later cells, and print the size of every group.
age_groups = Groups(ages, "Age")
age_data = age_groups.group_by_age()  # dict: age band -> ages
age_groups.report(age_groups.group_by_age)

sex_groups = Groups(sexes, "Sex")
sex_data = sex_groups.group_by_sex()  # dict: sex label -> values
sex_groups.report(sex_groups.group_by_sex)

smoker_groups = Groups(smoker_statuses, "Smoker Status")
smoking_data = smoker_groups.group_by_smoker_status()  # dict: smoker label -> values
smoker_groups.report(smoker_groups.group_by_smoker_status)

children_groups = Groups(num_children, "Number of Children")
children_data = children_groups.group_by_num_children()  # dict: child-count label -> counts
children_groups.report(children_groups.group_by_num_children)

# The Groups object gets its own name: rebinding `regions` would replace the
# region column list, and running this cell a second time would then fail.
region_groups = Groups(regions, "Region")
regions_data = region_groups.group_by_region()  # dict: region label -> values
region_groups.report(region_groups.group_by_region)



Grouping Report for Age:
18-19: 137 people
20-29: 280 people
30-39: 257 people
40-49: 279 people
50-59: 271 people
60+: 114 people

Grouping Report for Sex:
Female: 662 people
Male: 676 people

Grouping Report for Smoker Status:
Smoker: 274 people
Non-Smoker: 1064 people

Grouping Report for Number of Children:
0: 574 people
1: 324 people
2: 240 people
3+: 200 people

Grouping Report for Region:
NorthEast: 324 people
NorthWest: 325 people
SouthEast: 364 people
SouthWest: 325 people


# Analysis
## Drawing insights

There are a number of ways in which this dataset could be further explored. I will focus on answering three basic questions and one slightly more advanced question, all listed below.

First, I create a dictionary with all patients' data based on their ID numbers.

**Basic questions**:
- Where is the majority of people in this dataset from?
- What is the difference in insurance costs between smokers and non-smokers?
- What is the average age of someone who has at least 1 child?

**Additional question**:
- What region has the lowest insurance charges for a young female (aged 20-29)?

### Create a dictionary with all patients' data based on their ID numbers

In [20]:
import pprint  # only needed for the optional full dump at the end of this cell

# Fields copied into every patient record, in this order
patient_fields = ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges')
patient_columns = [insurance_data[field] for field in patient_fields]

# The 1-based row position serves as the unique patient ID
patients_data = {}
for patient_id, values in enumerate(zip(*patient_columns), start=1):
    patients_data[patient_id] = dict(zip(patient_fields, values))

# Show the record stored for the patient with ID 1
print(patients_data[1])

# Uncomment the line below to see the full list:
#pprint.pprint(patients_data)

{'age': 19.0, 'sex': 'female', 'bmi': 27.9, 'children': 0.0, 'smoker': 'yes', 'region': 'southwest', 'charges': 16884.924}


### Where is the majority of people in this dataset from?

In [23]:
# regions_data (region label -> list of region values) was already created

# Region whose group holds the most people; max() keeps the first one on a tie
majority_region = max(regions_data, key=lambda region: len(regions_data[region]))

# Number of people in that region
majority_count = len(regions_data[majority_region])

# Printing the result
print(f"The majority of people in this dataset are from the {majority_region}, with {majority_count} people.")

The majority of poeple in this dataset are from the SouthEast, with 364 people.


### What is the difference in insurance costs between smokers and non-smokers?

In [26]:
# smoking_data is the grouping dictionary produced earlier with the Groups class
smoker_status = smoking_data

# Pair each smoker flag with the charge at the same position, then split the
# charges into two lists depending on the flag
smokers_charges = [
    charge
    for status, charge in zip(smoker_statuses, insurance_charges)
    if status == "yes"
]
non_smokers_charges = [
    charge
    for status, charge in zip(smoker_statuses, insurance_charges)
    if status != "yes"
]

# Mean charge of each group and the gap between them
avg_smoker_charge = sum(smokers_charges) / len(smokers_charges)
avg_non_smoker_charge = sum(non_smokers_charges) / len(non_smokers_charges)
charge_difference = avg_smoker_charge - avg_non_smoker_charge

# Show all three figures rounded to 2 decimal places
print(f"Average charge for smokers: {avg_smoker_charge:.2f}")
print(f"Average charge for non-smokers: {avg_non_smoker_charge:.2f}")
print(f"Difference in charges: {charge_difference:.2f}")

Average charge for smokers: 32050.23
Average charge for non-smokers: 8434.27
Difference in charges: 23615.96


### What is the average age of someone who has at least 1 child?

In [29]:
# Accessing ages of people who have at least one child using the dictionary with all patients' data
def get_age_with_children(patients_data, min_children=1):
    """Return the ages of patients who have at least ``min_children`` children.

    Args:
        patients_data: Dict of patient ID -> record, where each record has
            an 'age' and a 'children' entry.
        min_children: Smallest number of children a patient must have to be
            included. The default of 1 selects every patient with a child.

    Returns:
        List of ages, in the iteration order of ``patients_data``.
    """
    return [
        patient['age']
        for patient in patients_data.values()
        if patient['children'] >= min_children
    ]

# Feed the filtered ages into DescriptiveStats and read off the average:
# first collect the ages of every patient with at least one child ...
ages_with_children = get_age_with_children(patients_data)

# ... then wrap them in a DescriptiveStats object labelled for this subgroup ...
age_stats_with_children = DescriptiveStats(
    ages_with_children,
    "Age of Patients with At Least One Child",
)

# ... and finally compute and show the mean, rounded to 2 decimal places
average_age = age_stats_with_children.find_avg()
print(f"Average age of patients with at least one child: {average_age:.2f}")


Average age of patients with at least one child: 39.78


_**Improvement suggestion**: Make this function reusable, so that the average age could be easily calculated for other conditions._ 

### What region has the lowest insurance charges for a young female (aged 20-29)?
This analysis can help understand which regions have lower healthcare costs for young females, possibly due to regional health trends or cost of care.

In [33]:
# Keep only the records of female patients aged 20 to 29
young_females_data = []
for patient in patients_data.values():
    if patient['sex'] != 'female':
        continue
    if not 20 <= patient['age'] <= 29:
        continue
    young_females_data.append(patient)

# Collect the charges per region; a region's list is created on first use
region_charges = {}
for patient in young_females_data:
    region_charges.setdefault(patient['region'], []).append(patient['charges'])

# Mean charge within each region
average_charges = {
    region_name: sum(charge_list) / len(charge_list)
    for region_name, charge_list in region_charges.items()
}

# Region whose mean charge is the smallest
lowest_charge_region = min(average_charges, key=lambda name: average_charges[name])

print(f"The region with the lowest insurance charges for young females (aged 20-29) is: {lowest_charge_region}")


The region with the lowest insurance charges for young females (aged 20-29) is: northwest


_**Improvement suggestion**: Use the Groups class to group the patients by region._