## QuantileRegressor   - regression problem (insurance charge prediction)

In [2]:
# read the dataset 
import pandas as pd
dataset = pd.read_csv("insurance_pre.csv")
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [3]:
# convert the categorical data into numerical data 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,1
1,18,33.77,1,1725.5523,1,0
2,28,33.0,3,4449.462,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.88,0,3866.8552,1,0


In [4]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
# split input and output 
independent = dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes' ]]
dependent = dataset[[ 'charges']]

In [6]:
dependent.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [7]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.9,0,0,1
1,18,33.77,1,1,0
2,28,33.0,3,1,0
3,33,22.705,0,1,0
4,32,28.88,0,1,0


In [8]:
# split train & test 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

In [9]:
x_train.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
621,37,34.1,4,1,1
194,18,34.43,0,1,0
240,23,36.67,2,0,1
1168,32,35.2,2,1,0
1192,58,32.395,1,0,0


In [10]:
y_train.head()

Unnamed: 0,charges
621,40182.246
194,1137.4697
240,38511.6283
1168,4670.64
1192,13019.16105


In [11]:
# Standardization 
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

print("x_train: ",x_train, "\n\n x_test: ",x_test)

x_train:  [[-0.14853305  0.54530479  2.41394802  1.01506676  1.97125461]
 [-1.49780784  0.59867181 -0.89219519  1.01506676 -0.50729114]
 [-1.14273553  0.96092064  0.76087642 -0.98515688  1.97125461]
 ...
 [ 0.06451033 -0.91339361 -0.89219519  1.01506676 -0.50729114]
 [-1.42679338  0.77656186 -0.89219519  1.01506676 -0.50729114]
 [-0.4325909  -1.97749955 -0.06565939 -0.98515688 -0.50729114]] 

 x_test:  [[ 0.91668389 -0.08539629 -0.06565939  1.01506676 -0.50729114]
 [ 0.56161157 -0.21962242 -0.06565939 -0.98515688 -0.50729114]
 [ 0.63262604  1.59081313  0.76087642  1.01506676  1.97125461]
 ...
 [ 1.2717562   1.54472343 -0.89219519  1.01506676 -0.50729114]
 [-0.64563429  1.34581001  1.58741222 -0.98515688  1.97125461]
 [ 0.49059711 -0.95948331  1.58741222  1.01506676 -0.50729114]]


In [30]:
y_train = y_train.values.ravel()

In [38]:
# model creation 
from sklearn.linear_model import QuantileRegressor
# regressor = QuantileRegressor(quantile=0.5, alpha=0.1)
# regressor.fit(x_train,y_train)

from sklearn.model_selection import GridSearchCV
param_grid = {
    "quantile":[0.5],
    "alpha": [0, 0.001, 0.01, 0.1, 1],
    "solver":["highs-ds", "highs-ipm", "highs", "revised simplex"]
    
}
regressor_model = QuantileRegressor()
grid_model = GridSearchCV(regressor_model, param_grid, refit=True, verbose=3,n_jobs=-1)
grid_model.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [23]:
# regressor_model.coef_

In [17]:
# regressor_model.intercept_

In [34]:
# model prediction
y_pred = grid_model.predict(x_test)
y_pred[1:10] #1st 10 rows

array([ 9211.51900463, 40840.3091914 , 12381.83366758,  9572.48749471,
        4851.15350326,  1101.3024168 , 11251.58877007,  7676.48871158,
        5523.05217364])

In [35]:
# evaluating the model 
from sklearn.metrics import r2_score
r2_score = r2_score(y_test,y_pred)
r2_score

0.7466715383346669

## check overt fit under fit  

In [36]:

train_score = grid_model.score(x_train,y_train) 
test_score = grid_model.score(x_test,y_test) 
print(
    "train_score: ",train_score ,"\n",
    "test_score: ",test_score ,"\n",
    "train and test difference: ", abs(train_score - test_score),"\n"
)
# 1. train and test difference:  0.007323343945995209  -- Model underfitting   ---> reduce alpha (Regularization constant L1 penalty)
# 2. train and test difference: 0.04391306519679372   -- poor performance
# 3. train and test difference: 0.09759570292960373  -- moderate performance

train_score:  0.6490758354050632 
 test_score:  0.7466715383346669 
 train and test difference:  0.09759570292960373 



In [37]:
# save the model 
# import pickle
# pickle.dump( obj, open(filename.sav,'wb'))