# Polynomial Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('../../Datasets/insurance.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
# One hot encoding: turn string column into 3 different columns (countries), for 3 different categories
# Bindary vector: each country to a certain order of the columns

# Coding independent variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Create object of the column transformer class
# [0] is the index of column to apply OneHotEncoding
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,4,5])], remainder='passthrough')

# Apply the transform method to change the column X
X = np.array(ct.fit_transform(X))

print(X)

[[1.0 0.0 0.0 ... 19 27.9 0]
 [0.0 1.0 1.0 ... 18 33.77 1]
 [0.0 1.0 1.0 ... 28 33.0 3]
 ...
 [1.0 0.0 1.0 ... 18 36.85 0]
 [1.0 0.0 1.0 ... 21 25.8 0]
 [1.0 0.0 0.0 ... 61 29.07 0]]


## Splitting the dataset into the Training set and Test set

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Polynomial Regression model on the Training set

In [5]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression(normalize=True)
regressor.fit(X_poly, y_train)

LinearRegression(normalize=True)

## Predicting the Test set results

In [6]:
y_pred = regressor.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[12136.    9724.53]
 [10542.    8547.69]
 [48248.   45702.02]
 [12890.   12950.07]
 [ 9674.    9644.25]
 [ 3706.    4500.34]
 [ 3514.    2198.19]
 [15746.   11436.74]
 [11938.    7537.16]
 [ 8410.    5425.02]
 [ 8524.    6753.04]
 [14276.   10493.95]
 [ 8880.    7337.75]
 [ 5634.    4185.1 ]
 [24014.   18310.74]
 [11916.   10702.64]
 [12922.   12523.6 ]
 [ 4816.    3490.55]
 [ 9434.    6457.84]
 [30330.   33475.82]
 [25490.   23967.38]
 [16218.   12643.38]
 [12090.   23045.57]
 [28330.   23065.42]
 [ 5610.    1674.63]
 [ 6082.    4667.61]
 [ 3194.    3732.63]
 [ 8832.    7682.67]
 [ 5362.    3756.62]
 [11554.    8413.46]
 [12594.    8059.68]
 [51518.   48970.25]
 [12986.   12979.36]
 [10938.   20630.28]
 [16106.   14571.89]
 [ 6362.    4137.52]
 [11224.    8347.16]
 [37342.   51194.56]
 [37130.   40003.33]
 [ 1298.    1880.49]
 [ 5994.    5458.05]
 [ 6874.    2867.12]
 [26362.   20149.32]
 [46160.   47496.49]
 [35602.   36149.48]
 [ 5426.   26018.95]
 [11890.   19749.38]
 [ 9178.    6

## Evaluating the Model Performance

In [7]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8659089703391487

In [8]:
# Tuning the model

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score 
degrees = [2, 3, 4, 5, 6] # Change degree "hyperparameter" here
normalizes = [True, False] # Change normalize hyperparameter here
best_score = 0
best_degree = 0
for degree in degrees:
    for normalize in normalizes:
        poly_features = PolynomialFeatures(degree = degree)
        X_train_poly = poly_features.fit_transform(X_train)
        polynomial_regressor = LinearRegression(normalize=normalize)
        polynomial_regressor.fit(X_train_poly, y_train)
        scores = cross_val_score(polynomial_regressor, X_train_poly, y_train, cv=5) # Change k-fold cv value here
        if max(scores) > best_score:
            best_score = max(scores)
            best_degree = degree
            best_normalize = normalize
           

In [9]:
print(best_score)
print(best_normalize)
print(best_degree)

0.8560595979126553
True
2
