In [655]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from feature_engine import encoding as ce
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

In [656]:
# Read the insurance CSV (path is relative to the notebook's working directory)
# into the DataFrame used by the rest of the notebook.
file_pw = "../data/insurance.csv"
df = pd.read_csv(filepath_or_buffer=file_pw)

In [657]:
# Remove the features found insignificant during EDA.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
df = df.drop(columns=['sex', 'region'], errors='ignore')

# Pipeline Constructor

In [658]:
# Separate the regression target ('charges') from the predictor columns.
y = df.loc[:, 'charges']
X = df.drop(columns='charges')

In [659]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [660]:
# Build each stage separately, then chain them in the order they are applied.

# One-hot encode the categorical 'smoker' column.
# NOTE(review): with feature_engine's default settings this presumably keeps
# one dummy column per 'smoker' category - confirm whether dropping one is wanted.
smoker_encoder = ce.OneHotEncoder(variables=['smoker'])

# Degree-2 expansion restricted to interaction terms (no squared terms, no bias column).
interaction_terms = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Standardize the expanded feature matrix before regression.
feature_scaler = StandardScaler()

# Ordinary least-squares regression as the final estimator.
linear_regressor = LinearRegression()

model = Pipeline(
    steps=[
        ('one_hot_encoder', smoker_encoder),
        ('poly_features', interaction_terms),
        ('standardscaler', feature_scaler),
        ('lm', linear_regressor),
    ]
)

## Model Evaluation
- We are going to use R². It is a statistical measure of how much of the variability in the data is captured by the regression.
- Model performance is estimated with cross-validation, which repeatedly fits the model on one part of the data and scores it on the held-out remainder.

In [661]:
# Score the whole pipeline with 5-fold cross-validation on the full dataset;
# `scores` holds one value per fold.
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

In [662]:
# The fold scores are scaled by 100 for display, so the values are labelled as
# percentages; without the '%' sign the output reads like an R² of 83.35.
r2_mean_pct = scores.mean() * 100
r2_std_pct = scores.std() * 100
print("Coefficient of determination = %0.2f%% with a standard deviation of %0.2f%%" % (r2_mean_pct, r2_std_pct))

Coefficient of determination = 83.35 with a standard deviation of 3.26


## Model Fitting

In [663]:
# Fit every pipeline stage on the training split only; left as a bare
# expression so the notebook displays the fitted pipeline.
model.fit(X=X_train, y=y_train)