In [5]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the health-insurance records from the CSV file into a DataFrame
data = pd.read_csv("Health_insurance.csv")

# Preview the leading rows to confirm the file parsed as expected
print(data.head())

# Count missing entries per column (isna is the same check as isnull)
print(data.isna().sum())

# Histogram of rows per sex, with bars coloured by smoker status
figure = px.histogram(
    data,
    x="sex",
    color="smoker",
    title="Number of Smokers",
)
figure.show()

# Map categorical data to numerical data for 'sex' and 'smoker'.
# NOTE: Series.map turns any label that is not a key of the dict into NaN.
data["sex"] = data["sex"].map({"female": 0, "male": 1})
data["smoker"] = data["smoker"].map({"no": 0, "yes": 1})

# Count rows per region BEFORE one-hot encoding. drop_first=True below removes
# the first category's column, so counts rebuilt from the remaining region_*
# columns would silently leave that region out of the pie chart.
# sort_index() keeps the slices in a stable alphabetical order.
pie = data["region"].value_counts().sort_index()

# Convert 'region' column to one-hot encoded columns (first category dropped
# so the remaining indicators are not perfectly collinear)
data = pd.get_dummies(data, columns=["region"], drop_first=True)

# Display the first few rows after mapping
print(data.head())

# Names of the one-hot encoded region columns that remain after encoding
region_columns = [col for col in data.columns if col.startswith("region_")]

# Pie chart for regions - built from the pre-encoding counts so that every
# region, including the one dropped by drop_first=True, gets a slice
regions = pie.index
population = pie.values
fig = px.pie(values=population, names=regions, title="Distribution of Regions")
fig.show()

# Check the correlation matrix
print(data.corr())

# Prepare data for training (adjust to include new columns).
# Every column except the target 'charges' is used as a feature. The one-hot
# region_* columns are boolean, and a frame mixing bool with int/float columns
# converts to an object-dtype array via .values; requesting float explicitly
# gives a plain numeric matrix instead of relying on downstream coercion.
x = data.drop(columns=["charges"]).to_numpy(dtype=float)
y = data["charges"].to_numpy()

# Split the data into training and testing sets (80% / 20%, fixed seed so the
# split is reproducible between runs)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Build the Random Forest Regressor with a fixed seed and fit it on the
# training split in one step (fit returns the estimator itself)
forest = RandomForestRegressor(random_state=42).fit(xtrain, ytrain)

# Score the held-out rows
ypred = forest.predict(xtest)

# Wrap the predictions in a single-column DataFrame and preview it
predicted_data = pd.DataFrame(ypred, columns=["Predicted Premium Amount"])
print(predicted_data.head())

# Compare predictions with the true test targets
mse, r2 = mean_squared_error(ytest, ypred), r2_score(ytest, ypred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


   age  sex     bmi  children  smoker      charges  region_northwest  \
0   19    0  27.900         0       1  16884.92400             False   
1   18    1  33.770         1       0   1725.55230             False   
2   28    1  33.000         3       0   4449.46200             False   
3   33    1  22.705         0       0  21984.47061              True   
4   32    1  28.880         0       0   3866.85520              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  


                       age       sex       bmi  children    smoker   charges  \
age               1.000000 -0.020856  0.109272  0.042469 -0.025019  0.299008   
sex              -0.020856  1.000000  0.046371  0.017163  0.076185  0.057292   
bmi               0.109272  0.046371  1.000000  0.012759  0.003750  0.198341   
children          0.042469  0.017163  0.012759  1.000000  0.007673  0.067998   
smoker           -0.025019  0.076185  0.003750  0.007673  1.000000  0.787251   
charges           0.299008  0.057292  0.198341  0.067998  0.787251  1.000000   
region_northwest -0.000407 -0.011156 -0.135996  0.024806 -0.036945 -0.039905   
region_southeast -0.011642  0.017117  0.270025 -0.023066  0.068498  0.073982   
region_southwest  0.010016 -0.004184 -0.006205  0.021914 -0.036945 -0.043210   

                  region_northwest  region_southeast  region_southwest  
age                      -0.000407         -0.011642          0.010016  
sex                      -0.011156          0.017117 