In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

# Load the insurance dataset from the project's Data directory (path is relative to the notebook).
insurance_df = pd.read_csv(filepath_or_buffer="../Data/insurance.csv")

In [0]:
# Preview the first five rows of the raw data.
insurance_df.head(n=5)

In [0]:
# Pairwise Pearson correlations, restricted to the numeric columns.
insurance_df.corr(method="pearson", numeric_only=True)

In [0]:
# Charges plotted against age.
sns.scatterplot(data=insurance_df, x="age", y="charges")

In [0]:
# Baseline OLS: raw charges regressed on three numeric predictors plus an intercept.
features = ["age", "bmi", "children"]

y = insurance_df["charges"]
X = sm.add_constant(insurance_df[features])  # prepend the constant (intercept) column

model = sm.OLS(endog=y, exog=X).fit()

model.summary()

In [0]:
# Residuals (vertical axis) against in-sample predictions (horizontal axis).
sns.scatterplot(y=model.resid, x=model.predict())

## QQ Plot

In [0]:
import scipy.stats as stats
import matplotlib.pyplot as plt

# Normal probability (Q-Q) plot of the residuals, drawn through pyplot.
stats.probplot(x=model.resid, dist="norm", plot=plt);

## Log Transform

In [0]:
# Same predictors as the baseline, but the target is the natural log of charges.
features = ["age", "bmi", "children"]

y = np.log(insurance_df["charges"])
X = sm.add_constant(insurance_df[features])  # prepend the constant (intercept) column

model = sm.OLS(endog=y, exog=X).fit()

model.summary()

In [0]:
# Residuals against in-sample predictions for the current model.
sns.scatterplot(y=model.resid, x=model.predict());

In [0]:
# Normal Q-Q plot of the current model's residuals.
stats.probplot(x=model.resid, dist="norm", plot=plt);

In [0]:
# Residual Plot Helper Functions

def residual_analysis_plots(model):
    """Draw a residual scatter and a normal Q-Q plot side by side.

    Parameters
    ----------
    model
        Fitted results object exposing ``predict()`` and ``resid``
        (the notebook passes fitted statsmodels OLS results).

    Returns
    -------
    None
        The figure is created as a side effect; nothing is returned.
    """
    from matplotlib import pyplot as plt
    from scipy import stats

    fitted = model.predict()
    resid = model.resid

    # Both panels have residual values on the vertical axis, so they share the y scale.
    _, (scatter_ax, qq_ax) = plt.subplots(
        nrows=1, ncols=2, sharey="all", figsize=(10, 6)
    )

    sns.scatterplot(x=fitted, y=resid, ax=scatter_ax)
    scatter_ax.set(title="Residual Plot", xlabel="Prediction", ylabel="Residuals")

    # probplot writes its own title, so ours is applied afterwards.
    stats.probplot(resid, dist="norm", plot=qq_ax)
    qq_ax.set_title("Normal Q-Q Plot")

In [0]:
# Diagnostic plots for the most recently fitted `model`.
residual_analysis_plots(model=model)

In [0]:
# Engineered columns: 0/1 indicator flags, a bmi x smoker interaction, and age squared.
# Callables are evaluated in order, so later ones can use columns created just above them.
insurance_df = insurance_df.assign(
    smoker_flag=lambda df: np.where(df["smoker"] == "yes", 1, 0),
    female_flag=lambda df: np.where(df["sex"] == "female", 1, 0),
    bmi_smoker_int=lambda df: df["bmi"] * df["smoker_flag"],
    # Optional interaction terms (disabled):
    # age_bmi_int=lambda df: df["bmi"] * df["age"],
    # smoker_sex_int=lambda df: df["smoker_flag"] * df["female_flag"],
    # age_smoker_int=lambda df: df["age"] * df["smoker_flag"],
    age_sq=lambda df: df["age"] ** 2,
)  # .query("charges < 20000")



In [0]:
# Separate linear fits of charges on bmi for each smoker_flag value.
sns.lmplot(data=insurance_df, x="bmi", y="charges", hue="smoker_flag")

In [0]:
# Introduces perfect multicollinearity: age2x is an exact multiple of age.

insurance_df["age2x"] = 2 * insurance_df["age"]

### Attempts to Fix Normality Issues: Feature Engineering & Target Transforms

In [0]:
# Candidate predictors; toggle the commented entries to try other specifications.
features = [
    "age",
    # "age2x",  # exact multiple of age (perfect multicollinearity demo)
    "age_sq",
    "bmi",
    "children",
    "smoker_flag",
    # "female_flag",
    # "bmi_smoker_int",
    # "age_bmi_int",
    # "smoker_sex_int",
]

# Target: raw charges by default; alternative transforms are kept for experimentation.
y = insurance_df["charges"]
# y = np.sqrt(insurance_df["charges"])
# y = np.log(insurance_df["charges"])
# y = np.log10(insurance_df["charges"])
# y = 1/(insurance_df["charges"])

X = sm.add_constant(insurance_df[features])  # prepend the constant (intercept) column

# Box-Cox alternative (disabled)

from scipy.stats import boxcox

# y, fitted_lambda = boxcox(insurance_df["charges"])

# sns.histplot(y).set(xlabel="box_cox_price", title="Box-Cox Transformation")

# sns.despine()

model = sm.OLS(endog=y, exog=X).fit()

model.summary()

### Multicollinearity & VIF

In [0]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# One variance inflation factor per design-matrix column (the constant included),
# labelled with the column names of X.
pd.Series(
    data=[vif(X.values, col_idx) for col_idx, _ in enumerate(X.columns)],
    index=X.columns,
)

### Target Transformation Distributions

In [0]:
# Distribution of a (transformed) target. The result is stored under its own name
# rather than `y`, so the `y` that `model` was fitted on is not overwritten and
# later cells that pair `y` with `model.predict()` stay on the same scale.
# y_transformed = insurance_df["charges"]
# y_transformed = np.sqrt(insurance_df["charges"])
y_transformed = np.log(insurance_df["charges"])
# y_transformed = np.log10(insurance_df["charges"])
# y_transformed = 1/(insurance_df["charges"])

sns.histplot(y_transformed);

In [0]:
# Diagnostic plots for the most recently fitted `model`.
residual_analysis_plots(model=model)

In [0]:
from sklearn.metrics import mean_absolute_error as mae

# In-sample mean absolute error on the scale the model was fitted on.
# The observed target is read from the fitted model (`model.model.endog`) instead of
# the notebook-level `y`: other cells can rebind `y` (e.g. to a transformed target
# for plotting), which would silently compare values on two different scales.
mae(model.model.endog, model.predict())
# For a model fitted on log(charges), back-transform both sides to report MAE in charges:
# mae(np.exp(model.model.endog), np.exp(model.predict()))

In [0]:
# Refit the simple three-predictor model on raw charges for the influence analysis below.
features = ["age", "bmi", "children"]

y = insurance_df["charges"]

X = sm.add_constant(insurance_df[features])  # prepend the constant (intercept) column

model = sm.OLS(endog=y, exog=X).fit()

model.summary()

In [0]:
# Diagnostic plots for the most recently fitted `model`.
residual_analysis_plots(model=model)

### Outliers & Influence

In [0]:
# Influence diagnostics for the fitted model, collected into a frame; preview five rows.
influence = model.get_influence()
inf_summary_df = influence.summary_frame()
inf_summary_df.head(n=5)

In [0]:
# Cook's distance values, largest first, rounded to two decimals for display.
inf_summary_df.loc[:, "cooks_d"].sort_values(ascending=False).round(decimals=2)