In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

# Load the insurance dataset from the sibling Data directory.
insurance_df = pd.read_csv(filepath_or_buffer="../Data/insurance.csv")

In [0]:
# Preview the first five rows of the loaded frame.
insurance_df.head(n=5)

In [0]:
### Introducing random (noise) columns will not decrease R-squared
# To see this, uncomment the block below, add the new columns to `features`, and refit.

# import numpy as np

# rng = np.random.default_rng(12345)

# insurance_df = insurance_df.assign(
#     rand_col = rng.random(len(insurance_df)),
#     rand_col2 = rng.random(len(insurance_df)),
#     rand_col3 = rng.random(len(insurance_df)),
# )

In [0]:
# Pairwise Pearson correlations across the numeric columns only.
insurance_df.corr(method="pearson", numeric_only=True)

In [0]:
# Baseline OLS: charges regressed on an intercept plus the listed predictors.
features = ["age", "bmi"]  # "children" is left out of this first model

y = insurance_df["charges"]
X = sm.add_constant(insurance_df.loc[:, features])

model = sm.OLS(endog=y, exog=X).fit()

model.summary()

### Metrics

In [0]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

# Predictions of the fitted model on its own training data; computed once and reused.
y_fitted = model.predict()

# RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the explicit square root
# is the form that works across versions.
print(f"MAE: {mae(y, y_fitted)}")
print(f"RMSE: {np.sqrt(mse(y, y_fitted))}")
print(f"R2: {r2(y, y_fitted)}")


In [0]:
# Residuals plotted against the model's predictions for the training rows.
fitted_values = model.predict()
sns.scatterplot(x=fitted_values, y=model.resid)

### Making Predictions

In [0]:
# Four hypothetical rows per predictor: zero, then the observed min, mean and max.
new_data = pd.DataFrame({
    column: [
        0,
        insurance_df[column].min(),
        insurance_df[column].mean(),
        insurance_df[column].max(),
    ]
    for column in ("age", "bmi")
})

new_data

In [0]:
# Design matrix for the hypothetical rows. has_constant="add" forces the intercept
# column to be added; the default ("skip") silently omits it whenever a predictor
# column is itself constant (e.g. a single-row frame), which would misalign the
# columns with the fitted coefficients.
# A separate name is used so the training design matrix `X` is not overwritten.
X_new = sm.add_constant(new_data, has_constant="add")

model.predict(X_new)

In [0]:
# Encode the two categorical columns as 0/1 indicator columns.
insurance_df = insurance_df.assign(
    smoker_flag=lambda frame: np.where(frame["smoker"] == "yes", 1, 0),
    female_flag=lambda frame: np.where(frame["sex"] == "female", 1, 0),
)

# Refit OLS with `children` and the smoker indicator added to the predictors.
y = insurance_df["charges"]
X = sm.add_constant(insurance_df.loc[:, ["age", "bmi", "children", "smoker_flag"]])

model = sm.OLS(endog=y, exog=X).fit()

model.summary()

In [0]:
# Residuals vs. predictions for the refitted model, with fixed y-axis limits.
residual_ax = sns.scatterplot(x=model.predict(), y=model.resid)
residual_ax.set(ylim=(-13000, 50000))