In the linear_model module of the sklearn library, I'm going to use L2 normalisation using the 'Ridge' model. This method L2 is pretty much simliar as L1.

In [2]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt

import numpy as np
import pandas as pd

# Data file root
INSURANCE_FILE_PATH = 'insurance.csv'

insurance_df = pd.read_csv(INSURANCE_FILE_PATH)  
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
insurance_df = pd.get_dummies(data=insurance_df, columns=['sex', 'smoker', 'region'])

In [6]:
X = insurance_df.drop(['charges'], axis=1)
X

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,1,0,0,1,0,0,0,1
1,18,33.770,1,0,1,1,0,0,0,1,0
2,28,33.000,3,0,1,1,0,0,0,1,0
3,33,22.705,0,0,1,1,0,0,1,0,0
4,32,28.880,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,0,1,1,0,0,1,0,0
1334,18,31.920,0,1,0,1,0,1,0,0,0
1335,18,36.850,0,1,0,1,0,0,0,1,0
1336,21,25.800,0,1,0,1,0,0,0,0,1


In [5]:
polynomial_transformer = PolynomialFeatures(4)  
polynomial_features = polynomial_transformer.fit_transform(X.values)  

In [9]:
features = polynomial_transformer.get_feature_names(X.columns)  

In [10]:
X = pd.DataFrame(polynomial_features, columns=features)  # 다항 입력 변수를 dataframe으로 만들어 준다
y = insurance_df[['charges']]  # 목표 변수 정의

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)


In [14]:
model = Ridge(alpha=0.01, max_iter=2000, normalize=True)
model.fit(X_train, y_train)

Ridge(alpha=0.01, max_iter=2000, normalize=True)

In [15]:
y_test_predict = model.predict(X_test)
y_train_predict = model.predict(X_train)

In [17]:
mse = mean_squared_error(y_train, y_train_predict)

print("Performance : training set")
print("-----------------------")
print(f'Error: {sqrt(mse)}')

mse = mean_squared_error(y_test, y_test_predict)
print("------------------------------")

print("Performance : testing set")
print("-----------------------")
print(f'Error: {sqrt(mse)}')

Performance : training set
-----------------------
Error: 4561.665097523854
------------------------------
Performance : testing set
-----------------------
Error: 4692.427560339786


Using the high regression model of the fourth term (quadratic), we can see that the performance is not that different between training and test sets.