In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

# Read the insurance dataset (path is relative to this notebook) into a DataFrame.
insurance_df = pd.read_csv(filepath_or_buffer="../Data/insurance.csv")

In [0]:
# Show the first five rows as the cell's rendered output.
insurance_df.head(n=5)

### Data Splitting

In [0]:
from sklearn.model_selection import train_test_split

# Encode smoker status as a 0/1 indicator: 1 where `smoker` equals "yes", otherwise 0.
insurance_df["smoker_flag"] = np.where(
    insurance_df["smoker"] == "yes",
    1,
    0,
)

# Predictor columns used for every model fitted in this notebook.
features = ["age", "bmi", "children", "smoker_flag"]

# Target, then the design matrix (add_constant supplies the constant/intercept column).
y = insurance_df["charges"]
X = sm.add_constant(insurance_df[features])

# First split: hold out 20% of the rows as the test set.
# NOTE: X and y are rebound here and from now on refer to the remaining 80%.
X, X_test, y, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2023
)

# Second split: 25% of the remaining rows (20% of the full data) become the
# validation set; the rest is the training set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=2024
)

### Validation Scoring

In [0]:
# Fit ordinary least squares on the training split only.
model = sm.OLS(endog=y_train, exog=X_train).fit()

# Last expression in the cell, so the regression summary is rendered as output.
model.summary()

In [0]:
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_absolute_error as mae

# Predict once per split and reuse the predictions for both metrics.
train_preds = model.predict(X_train)
valid_preds = model.predict(X_valid)

# R2 and mean absolute error on the training split, then on the validation split.
print(f"Train R2: {r2(y_train, train_preds)}")
print(f"Train MAE: {mae(y_train, train_preds)}")
print(f"Validation R2: {r2(y_valid, valid_preds)}")
print(f"Validation MAE: {mae(y_valid, valid_preds)}")


### Final Fit (on all training data) & Test Score

In [0]:
# Refit on X and y, which at this point hold every non-test row
# (the training and validation splits combined).
model = sm.OLS(endog=y, exog=X).fit()

# In-sample predictions, computed once and shared by both metrics.
in_sample_preds = model.predict(X)
print(f"Train R2: {r2(y, in_sample_preds)}")
print(f"Train MAE: {mae(y, in_sample_preds)}")

In [0]:
# Score the current `model` on the held-out test split, predicting once.
test_preds = model.predict(X_test)
print(f"Test R2: {r2(y_test, test_preds)}")
print(f"Test MAE: {mae(y_test, test_preds)}")

### Cross Validation Loop

In [0]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score as r2
# `mae` is used in the loop below; import it here as well so this cell does not
# silently depend on an earlier cell's import.
from sklearn.metrics import mean_absolute_error as mae


# Five shuffled folds; the fixed random_state keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=2023)

# Create lists to store the validation scores for each fold
cv_lm_r2s = []
cv_lm_mae = []

# Loop through each fold in X and y (the non-test rows)
for train_ind, val_ind in kf.split(X, y):
    # Subset data based on CV folds. The training subset gets fold-scoped names
    # so the holdout split's X_train / y_train are not overwritten by the loop.
    X_fold_train, y_fold_train = X.iloc[train_ind], y.iloc[train_ind]
    X_val, y_val = X.iloc[val_ind], y.iloc[val_ind]
    # Fit on the fold's training data under its own name, so the notebook-level
    # `model` is not left pointing at the last fold's fit.
    fold_model = sm.OLS(y_fold_train, X_fold_train).fit()
    # Predict once per fold and reuse the predictions for both metrics
    fold_preds = fold_model.predict(X_val)
    # Append validation scores to the lists
    cv_lm_r2s.append(r2(y_val, fold_preds))
    cv_lm_mae.append(mae(y_val, fold_preds))

# Per-fold scores, then mean and (population, ddof=0) standard deviation across folds.
print("All Validation R2s: ", [round(x, 3) for x in cv_lm_r2s])
print(f"Cross Val R2s: {round(np.mean(cv_lm_r2s), 3)} +- {round(np.std(cv_lm_r2s), 3)}")

print("All Validation MAEs: ", [round(x, 3) for x in cv_lm_mae])
print(f"Cross Val MAEs: {round(np.mean(cv_lm_mae), 3)} +- {round(np.std(cv_lm_mae), 3)}")

### Final Fit & Score On Test

In [0]:
# Refit on X and y (every non-test row) so the final model is independent of
# whatever `model` was bound to by earlier cells.
model = sm.OLS(endog=y, exog=X).fit()

# R2 on the held-out test split; last expression, so it is shown as the cell output.
final_test_preds = model.predict(X_test)
r2(y_test, final_test_preds)