In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

# Load the insurance dataset; the relative path is resolved against the
# notebook's current working directory.
insurance_csv = "../Data/insurance.csv"
insurance_df = pd.read_csv(insurance_csv)

# Show the first five rows as a quick sanity check of the load.
insurance_df.head(5)

### EDA

In [0]:
# Distribution of the `charges` column; the trailing semicolon hides the Axes repr.
sns.histplot(data=insurance_df, x="charges");

In [0]:
# Pairwise relationships between the numeric columns; corner=True draws only
# the lower triangle of the grid.
# The trailing semicolon suppresses the PairGrid repr, matching the histogram cell.
sns.pairplot(insurance_df, corner=True);

In [0]:
# Pairwise Pearson correlations, restricted to the numeric columns.
insurance_df.corr(method="pearson", numeric_only=True)

### Fitting the Model

In [0]:
# Response: the insurance charges we want to explain.
y = insurance_df["charges"]

# Design matrix: age plus an explicit intercept column
# (sm.OLS does not add an intercept on its own).
X = sm.add_constant(insurance_df["age"])

# Ordinary least squares fit of charges on age.
model = sm.OLS(endog=y, exog=X).fit()

model.summary()

### Making Predictions

In [0]:
# Ages of the hypothetical customers to score.
customer_ages = [18, 25, 35, 45, 55]

# Build the prediction design matrix with the same layout as the training X.
# has_constant="add" forces the intercept column to be added: the default
# ("skip") silently omits it when an existing column is already constant
# (e.g. a single customer, or all-identical ages), and model.predict() would
# then fail on a shape mismatch.
X_predict = sm.add_constant(
    pd.DataFrame({"age": customer_ages}),
    has_constant="add",
)

X_predict

In [0]:
# Score the hypothetical customers and name the resulting Series so that it
# shows up as a "predictions" column when merged into a DataFrame.
predictions = model.predict(X_predict).rename("predictions")

In [0]:
## Compare predictions to actual values

# Pair every requested age with its prediction so the comparison can be joined
# on the "age" key. Merging on the positional index only lines up when
# customer_ages is sorted, unique, and every age occurs in the data.
predictions_by_age = X_predict[["age"]].assign(predictions=predictions)

(insurance_df
 .query("age in @customer_ages")
 .groupby("age")
 .agg({"charges": "mean"})   # observed mean charge per age
 .reset_index()
 .merge(predictions_by_age, on="age")
 .assign(
     # Positive error means the model over-predicts the observed mean.
     error = lambda x: x["predictions"] - x["charges"],
     pct_difference = lambda x: (x["error"] / x["charges"]) * 100
 )
)

In [0]:
# Plot residuals against the in-sample predictions.
# Original note: the residuals show three tiers -- try to predict them with new features.

residual_ax = sns.scatterplot(x=model.predict(), y=model.resid)

# Zero reference line: points above it are under-predicted, points below over-predicted.
residual_ax.axhline(0, color="black", linestyle="--", linewidth=1)

# Label the axes so the figure stands alone; the trailing semicolon hides the repr.
residual_ax.set(
    xlabel="Predicted charges",
    ylabel="Residual (actual - predicted)",
    title="Residuals vs. predicted charges",
);