<a href="https://colab.research.google.com/github/ihsanmujahid/Codecademy_IHSAN/blob/Medical-Insurance/Medical_Insurance_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import io

import pandas as pd

# Step 1: Data Loading and Exploration
from google.colab import files

# files.upload() opens the Colab upload widget and returns a dict that maps
# each uploaded file's name to its raw bytes.
uploaded = files.upload()

# The uploaded file is expected to be named "insurance.csv".
CSV_NAME = "insurance.csv"
if CSV_NAME in uploaded:
    # Parse the bytes that were just uploaded. When a file with the same name
    # already exists, Colab saves the new upload under a different name (e.g.
    # "insurance (3).csv"), so reading "insurance.csv" from disk would load an
    # older copy instead of this upload.
    df = pd.read_csv(io.BytesIO(uploaded[CSV_NAME]))
else:
    # Nothing was uploaded under the expected name; fall back to a copy that
    # may already be present in the working directory.
    df = pd.read_csv(CSV_NAME)
print(df.head())  # Displaying the first few rows of the dataset


# Function to calculate the average insurance cost for a given category
def calculate_average_cost(category, value, data=None):
    """Return the mean of the 'charges' column for rows where ``category == value``.

    Args:
        category: Name of the column to filter on (e.g. ``'smoker'``).
        value: Value the column must equal for a row to be included.
        data: Optional DataFrame to analyse. When omitted, the module-level
            ``df`` is used, which is the original behaviour.

    Returns:
        The mean of 'charges' over the matching rows, or NaN when no row
        matches.

    Raises:
        KeyError: If ``category`` or 'charges' is not a column of the frame.
    """
    frame = df if data is None else data
    matches = frame[category] == value
    return frame.loc[matches, 'charges'].mean()


# Step 2a: What is the biggest factor that can cause higher insurance cost?
# Only the numeric columns are compared here; categorical columns such as
# 'smoker', 'sex' and 'region' are not part of this correlation matrix.
correlation = df[['age', 'bmi', 'children', 'charges']].corr()
# Rank by absolute correlation so that a strong negative relationship is not
# passed over in favour of a weaker positive one.
biggest_factor = correlation['charges'].drop('charges').abs().idxmax()
print("The biggest factor that can cause higher insurance costs is:", biggest_factor)


# Step 2b: What is the average insurance cost for smokers?
average_smoker_insurance = calculate_average_cost('smoker', 'yes')
print("The average insurance cost for smokers is:", average_smoker_insurance)


# Step 2c: What age has the higher BMI?
# Mean BMI is computed per individual age value, then the age with the
# largest mean is selected.
average_bmi_by_age = df.groupby('age')['bmi'].mean()
age_with_higher_bmi = average_bmi_by_age.idxmax()
print("The age group with the highest BMI is:", age_with_higher_bmi)


# Additional Questions and Analysis

# Step 3d: Does the number of children have an impact on insurance charges?
def calculate_average_charges_by_children(data=None):
    """Return the mean 'charges' for every child count present in the data.

    The child counts are taken from the 'children' column itself rather than
    a fixed 0-5 range, so data with other counts is covered and counts that
    do not occur produce no NaN entries.

    Args:
        data: Optional DataFrame with 'children' and 'charges' columns. When
            omitted, the module-level ``df`` is used.

    Returns:
        A dict mapping each child count (in ascending order) to the mean of
        'charges' over the rows with that count.
    """
    frame = df if data is None else data
    child_counts = sorted(frame['children'].dropna().unique().tolist())
    return {
        count: frame.loc[frame['children'] == count, 'charges'].mean()
        for count in child_counts
    }


# Compute the per-child-count averages once, then print one line per entry.
average_charges_by_children = calculate_average_charges_by_children()
print("Average insurance charges by number of children:")
for num_children, average_charges in average_charges_by_children.items():
    report_line = f"Number of Children: {num_children}, Average Charges: {average_charges}"
    print(report_line)


# Step 3e: Are there significant differences in insurance charges based on gender?
def calculate_average_charges_by_gender():
    """Return a dict mapping 'male' and 'female' to their average charges.

    Each average comes from ``calculate_average_cost`` applied to the 'sex'
    column; the keys appear in the order 'male', 'female'.
    """
    return {
        sex_label: calculate_average_cost('sex', sex_label)
        for sex_label in ('male', 'female')
    }


# Report the average charges for each gender.
average_charges_by_gender = calculate_average_charges_by_gender()
print("Average insurance charges by gender:")
for gender, average_charges in average_charges_by_gender.items():
    print(f"Gender: {gender}, Average Charges: {average_charges}")


# Two-sample t-test on the charges of the two groups, with ttest_ind called
# using its default settings; a p-value below 0.05 is treated as significant.
from scipy.stats import ttest_ind

male_charges = df.loc[df['sex'] == 'male', 'charges']
female_charges = df.loc[df['sex'] == 'female', 'charges']

t_statistic, p_value = ttest_ind(male_charges, female_charges)

# Pick the message from the outcome of the significance check.
outcome_messages = {
    True: "There is a significant difference in insurance charges between genders.",
    False: "There is no significant difference in insurance charges between genders.",
}
print(outcome_messages[bool(p_value < 0.05)])


# Additional Questions and Analysis

# Step 4: Using Lists, Strings, and Dictionaries
# Average charges for every region that occurs in the data, in order of first
# appearance.
regions = df['region'].unique().tolist()
average_charges_by_region = {
    region: calculate_average_cost('region', region) for region in regions
}

print("Average insurance charges by region:")
for region, average_charges in average_charges_by_region.items():
    print(f"Region: {region}, Average Charges: {average_charges}")

# Build a sentence from strings
age_group = "18-24"
gender = "male"
# The sentence below names both a gender and an age group, so the average
# must be restricted to both. Reporting the overall average for the gender
# would not match what the sentence claims.
min_age, max_age = (int(bound) for bound in age_group.split("-"))
in_age_group = df['age'].between(min_age, max_age)  # inclusive on both ends
insurance_cost = df.loc[(df['sex'] == gender) & in_age_group, 'charges'].mean()
output_string = f"The average insurance cost for {gender} in the age group {age_group} is ${insurance_cost}"
print(output_string)

# Create a dictionary
patient_info = {
    "name": "John Doe",
    "age": 35,
    "gender": "male",
    "smoker": "no",
    "region": "southwest",
}
print("Patient Information:")
for key, value in patient_info.items():
    print(f"{key.capitalize()}: {value}")


Saving insurance.csv to insurance (3).csv
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
The biggest factor that can cause higher insurance costs is: age
The average insurance cost for smokers is: 32050.23183153284
The age group with the highest BMI is: 64
Average insurance charges by number of children:
Number of Children: 0, Average Charges: 12365.97560163589
Number of Children: 1, Average Charges: 12731.171831635802
Number of Children: 2, Average Charges: 15073.563733958332
Number of Children: 3, Average Charges: 15355.318366815285
Number of Children: 4, Average Charges: 13850.6563112
Number of Children: 5, Average Charges: 8786.035247222222
Average in