In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder


In [2]:

# Load the insurance dataset. The path is relative, so 'insurance.csv' must sit
# in the notebook's working directory.
from IPython.display import display

data = pd.read_csv('insurance.csv')

# Show each summary on its own: `data.info()` prints directly and returns None,
# so packing the three calls into one tuple only adds a stray `None` to the output.
display(data.head())
data.info()
display(data.describe())

FileNotFoundError: [Errno 2] No such file or directory: 'insurance.csv'

In [None]:
# EDA: correlation structure of the numeric columns
numeric_corr = data.select_dtypes(include=["number"]).corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap for Numerical Variables")
plt.show()





In [None]:
# Box plot of the target column `charges`
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x="charges", ax=ax)
ax.set_title("Boxplot of Medical Charges")
plt.show()

In [None]:
# Pairwise relationships between the numeric columns, coloured by smoker status
numeric_columns = ["age", "bmi", "children", "charges"]
sns.pairplot(data, vars=numeric_columns, hue="smoker", palette="viridis")
plt.suptitle("Pairplot of Numerical Features with Charges", y=1.02)
plt.show()

In [None]:
# Encode the categorical columns as integers for analysis and modelling.
# Work on a copy so the raw `data` frame keeps its original string labels.
encoded_data = data.copy()
encoded_data["sex"] = encoded_data["sex"].map({"male": 0, "female": 1})
encoded_data["smoker"] = encoded_data["smoker"].map({"no": 0, "yes": 1})

# `.map` silently turns any value missing from the mapping into NaN; fail here
# with a clear message instead of much later inside a model or plot.
assert encoded_data[["sex", "smoker"]].notna().all().all(), "unexpected value in 'sex' or 'smoker'"

# Label-encode the 'region' column in place: each region becomes one integer code.
# NOTE: this is label encoding, not one-hot encoding, so the codes impose an
# arbitrary numeric order on a nominal feature. `label_encoder.classes_` lists
# the region behind each code.
label_encoder = LabelEncoder()
encoded_data['region'] = label_encoder.fit_transform(encoded_data['region'])

# Pair plot of the numeric features plus the encoded region, coloured by smoker (0 = no, 1 = yes)
sns.pairplot(encoded_data, vars=["age", "bmi", "children", "region", "charges"], hue="smoker", palette="viridis")
plt.suptitle("Pairplot of Numerical Features with Charges", y=1.02)
plt.show()


In [None]:
# Print the structural summary of the encoded frame
encoded_data.info()


In [None]:
# Inspect the encoded DataFrame as the cell's output
encoded_data

In [None]:
# Summary statistics for every column, plus the number of missing values per column
summary_stats = encoded_data.describe(include='all')
missing_counts = encoded_data.isnull().sum()
summary_stats, missing_counts

In [None]:
# Charges against each numeric feature, one panel per feature, coloured by smoker status
scatter_panels = [
    ("age", "Age vs Charges"),
    ("bmi", "BMI vs Charges"),
    ("children", "Children vs Charges"),
]

fig, axes = plt.subplots(1, 3, figsize=(12, 6))
for ax, (feature, panel_title) in zip(axes, scatter_panels):
    sns.scatterplot(data=encoded_data, x=feature, y="charges", hue="smoker", palette="viridis", ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

# Distribution of charges split by smoker status.
# Uses the un-encoded `data`, whose smoker labels match the palette's "yes"/"no" keys.
smoker_colors = {"yes": "green", "no": "blue"}
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x="charges", hue="smoker", kde=True, element="step", palette=smoker_colors, ax=ax)
ax.set_title("Distribution of Charges by Smoker Status")
plt.show()


In [None]:
# Model training: split the data, fit the models, and evaluate them


In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors (X) from the target (y = charges)
X = encoded_data.drop(columns=["charges"])
y = encoded_data["charges"]

# Two-stage split into roughly 70% training, 10% validation and 20% testing:
#   1. hold out 30% of the rows as a temporary set;
#   2. send 67% of that hold-out (~20% of all rows) to testing, which leaves
#      ~10% of all rows for validation.
# Both stages use a fixed random_state so the split is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.67, random_state=42)

# The shape of the splits
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fit the linear regression baseline on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict on every split so the metrics can be compared across them
y_train_pred, y_val_pred, y_test_predict = (
    model.predict(split) for split in (X_train, X_val, X_test)
)

In [None]:
# NOTE(review): the `pip install ace-tools` step was removed. Nothing in this
# notebook imports that package, so installing it only pulled unneeded
# third-party code into the kernel environment.


In [None]:
from IPython.display import display

# Calculating the metrics for the linear regression model on each split
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_predict)

train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_predict)

# RMSE is the square root of the MSE. Take the root for all three splits so the
# RMSE column is comparable across rows and is on the same scale as `charges`.
train_rmse = np.sqrt(train_mse)
val_rmse = np.sqrt(val_mse)
test_rmse = np.sqrt(test_mse)

train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_predict)

# Compiling results into a dictionary (one list entry per split, in the order of "Dataset")
results = {
    "Dataset": ["Training", "Validation", "Testing"],
    "MAE": [train_mae, val_mae, test_mae],
    "MSE": [train_mse, val_mse, test_mse],
    "RMSE": [train_rmse, val_rmse, test_rmse],
    "R2 Score": [train_r2, val_r2, test_r2],
}

# Render the metrics as a table
results_df = pd.DataFrame(results)
display(results_df)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split (fixed seed)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions for each split
y_train_rf_pred, y_val_rf_pred, y_test_rf_pred = (
    rf_model.predict(split) for split in (X_train, X_val, X_test)
)

# (actual, predicted) pairs in Training / Validation / Testing order
rf_pairs = [
    (y_train, y_train_rf_pred),
    (y_val, y_val_rf_pred),
    (y_test, y_test_rf_pred),
]

# Each metric is evaluated once per split, in the same order as `rf_pairs`
rf_train_mae, rf_val_mae, rf_test_mae = (
    mean_absolute_error(actual, predicted) for actual, predicted in rf_pairs
)
rf_train_mse, rf_val_mse, rf_test_mse = (
    mean_squared_error(actual, predicted) for actual, predicted in rf_pairs
)
rf_train_rmse, rf_val_rmse, rf_test_rmse = (
    np.sqrt(mse) for mse in (rf_train_mse, rf_val_mse, rf_test_mse)
)
rf_train_r2, rf_val_r2, rf_test_r2 = (
    r2_score(actual, predicted) for actual, predicted in rf_pairs
)

# Collect the metrics column by column
rf_results = {
    "Dataset": ["Training", "Validation", "Testing"],
    "MAE": [rf_train_mae, rf_val_mae, rf_test_mae],
    "MSE": [rf_train_mse, rf_val_mse, rf_test_mse],
    "RMSE": [rf_train_rmse, rf_val_rmse, rf_test_rmse],
    "R2 Score": [rf_train_r2, rf_val_r2, rf_test_r2],
}

# Tabulate and print the results
rf_results_df = pd.DataFrame(rf_results)
print(rf_results_df)

In [None]:
pip install shap 

In [None]:
# SHAP analysis of both fitted models, evaluated on the training split
import shap

# --- Linear Regression ---
linear_explainer = shap.Explainer(model, X_train)
linear_shap_values = linear_explainer(X_train)

print("SHAP Analysis for Linear Regression")
shap.summary_plot(linear_shap_values, X_train, plot_type="bar")

# --- Random Forest ---
rf_explainer = shap.Explainer(rf_model, X_train)
rf_shap_values = rf_explainer(X_train)

print("SHAP Analysis for Random Forest")
shap.summary_plot(rf_shap_values, X_train, plot_type="bar")


In [None]:
pip show shap

In [None]:
from sklearn.model_selection import train_test_split

# Target column and predictor matrix
y = encoded_data["charges"]
X = encoded_data.drop(columns=["charges"])

# Stage 1: 70% training, 30% held out
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Stage 2: split the hold-out into validation (~10% overall) and testing (~20% overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.67, random_state=42
)

# Shapes of every split, features first and then targets
tuple(part.shape for part in (X_train, X_val, X_test, y_train, y_val, y_test))


In [None]:
# SHAP through KernelExplainer, which only needs the model's predict function.
shap.initjs()  # set up SHAP's JavaScript visualisation support

# The first 100 training rows are used both as the explainer's data and as the
# rows to explain, to keep the computation small.
shap_sample = X_train[:100]
explainer = shap.KernelExplainer(model.predict, shap_sample)
shap_values = explainer.shap_values(shap_sample)

# Bar summary of the SHAP values
shap.summary_plot(shap_values, shap_sample, plot_type="bar")


In [None]:
# Plotting the coefficients directly without external tools.
# Build the table the plot reads from the fitted linear model, so this cell does
# not depend on a `feature_importance` variable that no other cell defines.
# `model` was fitted on X_train, so its coefficients line up with X_train's columns.
# NOTE: the features are not standardised, so coefficient sizes reflect each
# feature's units as well as its influence.
feature_importance = pd.DataFrame(
    {"Feature": X_train.columns, "Coefficient": model.coef_}
).sort_values("Coefficient", ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x="Coefficient", y="Feature", orient="h")
plt.title("Feature Importance Based on Linear Regression Coefficients")
plt.xlabel("Coefficient Value")
plt.ylabel("Feature")
plt.show()


In [None]:
# One hypothetical policy holder, expressed with the same integer encoding as the training features
input_data = pd.DataFrame([{
    "age": 50,
    "sex": 0,        # male
    "bmi": 25,       # assumed average BMI
    "children": 4,
    "smoker": 1,     # smoker (1 = yes)
    "region": 1,     # presumably northwest; confirm against label_encoder.classes_
}])

# Estimate the charges with the trained Random Forest model
predicted_price = rf_model.predict(input_data)
predicted_price[0]  # predicted charge for the single input row


In [None]:
# The same hypothetical policy holder, this time as a non-smoker
input_data = pd.DataFrame([{
    "age": 50,
    "sex": 0,        # male
    "bmi": 25,       # assumed average BMI
    "children": 4,
    "smoker": 0,     # non-smoker (0 = no)
    "region": 1,     # presumably northwest; confirm against label_encoder.classes_
}])

# Estimate the charges with the trained Random Forest model
predicted_price = rf_model.predict(input_data)
predicted_price[0]  # predicted charge for the single input row