# U.S. Medical Insurance Costs

In [1]:
# U.S. Medical Insurance Costs

In [2]:
# Import necessary python libraries
import os
import csv
import numpy  as np
import pandas as pd
from typing import Dict, Any

In [3]:
# Load up dataset from csv file
df = pd.read_csv('insurance.csv')

In [4]:
# Convert numeric columns from strings to appropriate types
df['age'] = df['age'].astype(int)
df['bmi'] = df['bmi'].astype(float)
df['children'] = df['children'].astype(int)
df['charges'] = df['charges'].astype(float)

# Basic statistics
total_num_of_patients = len(df)
youngest_patient = df['age'].min()
oldest_patient = df['age'].max()

# Smokers and non-smokers
total_num_of_smokers = (df['smoker'] == 'yes').sum()
total_num_of_nonsmokers = (df['smoker'] == 'no').sum()

diff_in_total_smokers_nonsmokers = total_num_of_smokers - total_num_of_nonsmokers

# With children vs without children
total_num_with_children = (df['children'] > 0).sum()
total_num_without_children = (df['children'] == 0).sum()
diff_in_total_with_without_children = total_num_with_children - total_num_without_children

# Male vs Female patients
num_of_male_patients = (df['sex'] == 'male').sum()
num_of_female_patients = (df['sex'] == 'female').sum()
diff_in_males_to_females = num_of_male_patients - num_of_female_patients

# Male and Female Smokers
num_of_male_smokers = df[(df['sex'] == 'male') & (df['smoker'] == 'yes')].shape[0]
num_of_female_smokers = df[(df['sex'] == 'female') & (df['smoker'] == 'yes')].shape[0]
diff_in_male_to_female_smokers = num_of_male_smokers - num_of_female_smokers

# Male and Female with children
num_of_males_with_children = df[(df['sex'] == 'male') & (df['children'] > 0)].shape[0]
num_of_females_with_children = df[(df['sex'] == 'female') & (df['children'] > 0)].shape[0]
diff_in_male_to_female_with_children = num_of_males_with_children - num_of_females_with_children

# Print results
print(f"Total number of patients: {total_num_of_patients}")
print(f"Patients ages range from: {youngest_patient} - {oldest_patient}\n")

print(f"There are {num_of_male_patients} male patients.")
print(f"{num_of_male_smokers} of those males are smokers and {num_of_males_with_children} of them have at least 1 child.\n")

print(f"There are {num_of_female_patients} female patients.")
print(f"{num_of_female_smokers} of those females are smokers and {num_of_females_with_children} of them have at least 1 child.\n")


Total number of patients: 1338
Patients ages range from: 18 - 64

There are 676 male patients.
159 of those males are smokers and 391 of them have at least 1 child.

There are 662 female patients.
115 of those females are smokers and 373 of them have at least 1 child.



In [5]:
# Averages
age_average = df['age'].mean()
smoker_age_average = df[df['smoker'] == 'yes']['age'].mean()
nonsmoker_age_average = df[df['smoker'] == 'no']['age'].mean()
with_children_age_average = df[df['children'] > 0]['age'].mean()
no_children_age_average = df[df['children'] == 0]['age'].mean()

# Round and print
print(f"Average age of patients overall: {int(age_average)}\n")
print(f"Average age of patients that smoke: {int(smoker_age_average)}")
print(f"Average age of patients that don't smoke: {int(nonsmoker_age_average)}\n")
print(f"Average age of patients with at least 1 child: {int(with_children_age_average)}")
print(f"Average age of patients with 0 children: {int(no_children_age_average)}")

Average age of patients overall: 39

Average age of patients that smoke: 38
Average age of patients that don't smoke: 39

Average age of patients with at least 1 child: 39
Average age of patients with 0 children: 38


In [6]:
# Calculate total and average insurance costs overall
total_insurance_cost = df['charges'].sum()
average_total_cost = df['charges'].mean()

# Rounding
rounded_total_cost = round(total_insurance_cost, 2)
rounded_average_cost = round(average_total_cost, 2)

# Print results
print(f"The total amount spent on insurance is: ${rounded_total_cost}")
print(f"The average insurance cost of overall costs is: ${rounded_average_cost}")

The total amount spent on insurance is: $17755824.99
The average insurance cost of overall costs is: $13270.42


In [7]:
# Group by sex and sum/mean the charges
costs_by_sex = df.groupby('sex')['charges'].agg(['sum', 'mean'])

# Extract values
total_male_cost = costs_by_sex.loc['male', 'sum']
total_female_cost = costs_by_sex.loc['female', 'sum']

average_male_cost = costs_by_sex.loc['male', 'mean']
average_female_cost = costs_by_sex.loc['female', 'mean']

# Differences
difference_in_total_costs = total_male_cost - total_female_cost
difference_in_average_costs = average_male_cost - average_female_cost

# Rounding
rounded_total_male_cost = round(total_male_cost, 2)
rounded_total_female_cost = round(total_female_cost, 2)
rounded_average_male_cost = round(average_male_cost, 2)
rounded_average_female_cost = round(average_female_cost, 2)
rounded_total_difference = round(difference_in_total_costs, 2)
rounded_average_difference = round(difference_in_average_costs, 2)

# Print results
print(f"Total insurance costs for males is: ${rounded_total_male_cost}")
print(f"Total insurance costs for females is: ${rounded_total_female_cost}\n")
print(f"Difference of total costs between males and females is: ${rounded_total_difference}\n")
print(f"Average insurance costs for males is: ${rounded_average_male_cost}")
print(f"Average insurance costs for females is: ${rounded_average_female_cost}\n")
print(f"Difference of average costs between males and females is: ${rounded_average_difference}\n")

Total insurance costs for males is: $9434763.8
Total insurance costs for females is: $8321061.19

Difference of total costs between males and females is: $1113702.6

Average insurance costs for males is: $13956.75
Average insurance costs for females is: $12569.58

Difference of average costs between males and females is: $1387.17



In [8]:
# Group by smoker and sex, then calculate total and average charges
smoker_costs_summary = df.groupby(['smoker', 'sex'])['charges'].agg(total='sum', average='mean').round(2)

# Smokers vs Non-Smokers overall
overall_smoker_costs = df.groupby('smoker')['charges'].agg(total='sum', average='mean').round(2)
print(overall_smoker_costs)
rounded_total_smoker_cost = overall_smoker_costs.loc['yes', 'total']
rounded_total_nonsmoker_cost = overall_smoker_costs.loc['no', 'total']
rounded_average_smoker_cost = overall_smoker_costs.loc['yes', 'average']
rounded_average_nonsmoker_cost = overall_smoker_costs.loc['no', 'average']

# Differences
rounded_smoker_total_difference = round(rounded_total_nonsmoker_cost - rounded_total_smoker_cost, 2)
rounded_smoker_average_difference = round(rounded_average_smoker_cost - rounded_average_nonsmoker_cost, 2)

# Male Smokers vs Male Non-Smokers
total_male_smoker_cost = smoker_costs_summary.loc[('yes', 'male'), 'total']
total_male_nonsmoker_cost = smoker_costs_summary.loc[('no', 'male'), 'total']
average_male_smoker_cost = smoker_costs_summary.loc[('yes', 'male'), 'average']
average_male_nonsmoker_cost = smoker_costs_summary.loc[('no', 'male'), 'average']

rounded_male_smoker_total_difference = round(total_male_smoker_cost - total_male_nonsmoker_cost, 2)
rounded_male_smoker_average_difference = round(average_male_smoker_cost - average_male_nonsmoker_cost, 2)

# Female Smokers vs Female Non-Smokers
total_female_smoker_cost = smoker_costs_summary.loc[('yes', 'female'), 'total']
total_female_nonsmoker_cost = smoker_costs_summary.loc[('no', 'female'), 'total']
average_female_smoker_cost = smoker_costs_summary.loc[('yes', 'female'), 'average']
average_female_nonsmoker_cost = smoker_costs_summary.loc[('no', 'female'), 'average']

rounded_female_smoker_total_difference = round(total_female_nonsmoker_cost - total_female_smoker_cost, 2)
rounded_female_smoker_average_difference = round(average_female_smoker_cost - average_female_nonsmoker_cost, 2)

# Male vs Female Smokers
diff_male_female_smoker_total = round(total_male_smoker_cost - total_female_smoker_cost, 2)
diff_male_female_smoker_average = round(average_male_smoker_cost - average_female_smoker_cost, 2)

# Male vs Female Non-Smokers
diff_male_female_nonsmoker_total = round(total_female_nonsmoker_cost - total_male_nonsmoker_cost, 2)
diff_male_female_nonsmoker_average = round(average_female_nonsmoker_cost - average_male_nonsmoker_cost, 2)

# Print results
print(f"Total smoker costs: ${rounded_total_smoker_cost}")
print(f"Total non-smoker costs: ${rounded_total_nonsmoker_cost}")
print(f"Difference in total costs between smokers and non-smokers: ${rounded_smoker_total_difference}\n")

print(f"Average smoker cost: ${rounded_average_smoker_cost}")
print(f"Average non-smoker cost: ${rounded_average_nonsmoker_cost}")
print(f"Difference in average costs between smokers and non-smokers: ${rounded_smoker_average_difference}\n")

print(f"Male smoker vs male non-smoker total cost difference: ${rounded_male_smoker_total_difference}0")
print(f"Male smoker vs male non-smoker average cost difference: ${rounded_male_smoker_average_difference}\n")

print(f"Female smoker vs female non-smoker total cost difference: ${rounded_female_smoker_total_difference}")
print(f"Female smoker vs female non-smoker average cost difference: ${rounded_female_smoker_average_difference}0\n")

print(f"Male vs Female smoker total cost difference: ${diff_male_female_smoker_total}")
print(f"Male vs Female smoker average cost difference: ${diff_male_female_smoker_average}\n")

print(f"Male vs Female non-smoker total cost difference: ${diff_male_female_nonsmoker_total}")
print(f"Male vs Female non-smoker average cost difference: ${diff_male_female_nonsmoker_average}0\n")

             total   average
smoker                      
no      8974061.47   8434.27
yes     8781763.52  32050.23
Total smoker costs: $8781763.52
Total non-smoker costs: $8974061.47
Difference in total costs between smokers and non-smokers: $192297.95

Average smoker cost: $32050.23
Average non-smoker cost: $8434.27
Difference in average costs between smokers and non-smokers: $23615.96

Male smoker vs male non-smoker total cost difference: $1072594.10
Male smoker vs male non-smoker average cost difference: $24954.81

Female smoker vs female non-smoker total cost difference: $1264892.05
Female smoker vs female non-smoker average cost difference: $21916.70

Male vs Female smoker total cost difference: $1725594.38
Male vs Female smoker average cost difference: $2363.01

Male vs Female non-smoker total cost difference: $611891.77
Male vs Female non-smoker average cost difference: $675.10



In [11]:
DATA_PATH = 'insurance.csv'
RESULTS_PATH = 'children_costs_results.txt'


def load_and_validate_data(filepath: str) -> pd.DataFrame:
    """
    Loads the insurance dataset and validates required columns.
    Handles missing or malformed data.
    """
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        raise RuntimeError(f"Failed to load data: {e}")

    required_cols = {'children', 'charges'}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"Dataset must contain columns: {required_cols}")

    df = df.dropna(subset=['children', 'charges'])
    df = df[(pd.to_numeric(df['children'], errors='coerce').notnull()) & (pd.to_numeric(df['charges'], errors='coerce').notnull())]
    df['children'] = df['children'].astype(int)
    df['charges'] = df['charges'].astype(float)
    return df


def weighted_analysis(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes weighted totals and averages per number of children.
    Also computes differences and percentage changes between groups.
    Weights are the group sizes (counts).
    Returns a summary DataFrame.
    """
    grouped = round(df.groupby('children')['charges'].agg(['count', 'sum', 'mean']).reset_index(), 2)
    grouped.rename(columns={'count': 'num_records', 'sum': 'total_cost', 'mean': 'average_cost'}, inplace=True)
    # Calculate differences and percent changes
    grouped['diff_prev'] = grouped['average_cost'].diff().round(2)
    grouped['pct_of_change'] = grouped['average_cost'].pct_change().multiply(100).round(2)
    return grouped

def explain_results(summary: pd.DataFrame) -> str:
    """
    Generates an explanation of what the weighted averages and differences reveal about the data.
    """
    explanation = []
    explanation.append("Analysis Explanation:")
    explanation.append("- The table above shows the total and average insurance costs for each group based on the number of children.")
    explanation.append("- The 'diff_from_prev' column shows how much the average insurance cost changes as the number of children increases by one.")
    explanation.append("- The 'pct_change_from_prev' column shows the percentage change in average cost from the previous group.")
    explanation.append("\nInterpretation:")
    explanation.append("- If the differences are consistently positive and large, it suggests insurance costs rise with more children. If differences are small or fluctuate, the relationship may be weak or non-linear.")
    explanation.append("- Large jumps in certain groups may indicate thresholds where insurers increase rates more steeply.")
    explanation.append("- If some groups show a decrease, it could suggest other factors are at play, or possible discounts for larger families.")
    explanation.append("- This analysis helps identify trends and patterns that may not be obvious from overall averages alone.")
    return '\n'.join(explanation)


def save_results(results: Dict[str, Any], filename: str) -> None:
    """
    Saves the results dictionary and table to a .txt file in a readable format.
    """
    with open(filename, 'w') as f:
        f.write("Insurance Cost Analysis by Number of Children\n")
        f.write("="*50 + "\n\n")
        # Summary Table
        f.write("Weighted Totals, Averages, and Differences per Number of Children:\n")
        f.write(str(results['summary_table']) + "\n\n")
        # Explanation
        f.write(results['explanation'] + "\n")


def main():
    """
    Main function to run the weighted analysis, print, and save results.
    """
    try:
        df = load_and_validate_data(DATA_PATH)
        summary = weighted_analysis(df)
        explanation = explain_results(summary)

        # Prepare results dictionary
        results = {
            'summary_table': summary,
            'explanation': explanation
        }

        # Print results to console
        print("Insurance Cost Analysis by Number of Children")
        print("="*50)
        print("Weighted Totals, Averages, and Differences per Number of Children:")
        print(summary)
        print("\n" + explanation)

        # Save results to file
        save_results(results, RESULTS_PATH)
        print(f"\nResults saved to {RESULTS_PATH}")

    except Exception as e:
        print(f"Error: {e}")


def test_analysis_functions():
    """
    Simple test to verify weighted analysis functions on a small sample DataFrame.
    """
    test_data = pd.DataFrame({
        'children': [0, 1, 2, 2, 3],
        'charges': [1000, 2000, 3000, 4000, 5000]
    })
    summary = weighted_analysis(test_data)
    assert not summary.empty, "Summary table should not be empty."
    print("Test passed: Weighted analysis functions work as expected.")


if __name__ == "__main__":
    main()
    # Uncomment to run test
    # test_analysis_functions()

Insurance Cost Analysis by Number of Children
Weighted Totals, Averages, and Differences per Number of Children:
   children  num_records  total_cost  average_cost  diff_prev  pct_of_change
0         0          574  7098070.00      12365.98        NaN            NaN
1         1          324  4124899.67      12731.17     365.19           2.95
2         2          240  3617655.30      15073.56    2342.39          18.40
3         3          157  2410784.98      15355.32     281.76           1.87
4         4           25   346266.41      13850.66   -1504.66          -9.80
5         5           18   158148.63       8786.04   -5064.62         -36.57

Analysis Explanation:
- The table above shows the total and average insurance costs for each group based on the number of children.
- The 'diff_from_prev' column shows how much the average insurance cost changes as the number of children increases by one.
- The 'pct_change_from_prev' column shows the percentage change in average cost from the pr