In [1]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
dataSet = pd.read_csv("insurance_pre.csv")

In [3]:
dataSet = pd.get_dummies(dataSet, drop_first = True, dtype = int)
dataSet

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
independent = dataSet[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
dependent = dataSet['charges']

In [5]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size = 1/3, random_state = 0)
X_train, X_test, y_train, y_test

(      age    bmi  children  sex_male  smoker_yes
 482    18  31.35         0         0           0
 338    50  32.30         1         1           1
 356    46  43.89         3         1           0
 869    25  24.30         3         0           0
 182    22  19.95         3         1           0
 ...   ...    ...       ...       ...         ...
 763    27  26.03         0         1           0
 835    42  35.97         2         1           0
 1216   40  25.08         0         1           0
 559    19  35.53         0         1           0
 684    33  18.50         1         0           0
 
 [892 rows x 5 columns],
       age     bmi  children  sex_male  smoker_yes
 578    52  30.200         1         1           0
 610    47  29.370         1         0           0
 569    48  40.565         2         1           1
 1034   61  38.380         0         1           0
 198    51  18.050         0         0           0
 ...   ...     ...       ...       ...         ...
 261    20  26.8

In [6]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train, X_test

(array([[-1.53963418,  0.11036616, -0.90788827, -0.98885138, -0.49929923],
        [ 0.74809711,  0.26412451, -0.0755796 ,  1.01127431,  2.00280702],
        [ 0.4621307 ,  2.13997636,  1.58903774,  1.01127431, -0.49929923],
        ...,
        [ 0.03318108, -0.90443894, -0.90788827,  1.01127431, -0.49929923],
        [-1.46814257,  0.7869029 , -0.90788827,  1.01127431, -0.49929923],
        [-0.46726014, -1.96941782, -0.0755796 , -0.98885138, -0.49929923]]),
 array([[ 0.89108031, -0.07576237, -0.0755796 ,  1.01127431, -0.49929923],
        [ 0.5336223 , -0.21009861, -0.0755796 , -0.98885138, -0.49929923],
        [ 0.6051139 ,  1.60182214,  0.75672907,  1.01127431,  2.00280702],
        ...,
        [-1.46814257,  0.65256665,  0.75672907, -0.98885138,  2.00280702],
        [-0.82471815,  0.37742013,  1.58903774,  1.01127431, -0.49929923],
        [ 0.89108031,  1.24817794,  0.75672907, -0.98885138, -0.49929923]]))

In [7]:
# Define parameter grid
param_grid = {
   'C': [10, 100, 1000, 2000, 3000],  # Regularization parameter
   #'epsilon': [0.01, 0.1, 0.2],  # Epsilon parameter
   'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Type of kernel
   #'degree': [2, 3, 4],  # Degree of the polynomial kernel (only for 'poly' kernel)
   'gamma': ['scale', 'auto'],  # Kernel coefficient (only for 'rbf', 'poly', 'sigmoid')   
   #'gamma': ['scale', 'auto', 0.1, 1, 10],  # Kernel coefficient (only for 'rbf', 'poly', 'sigmoid')   
   #'coef0': [0, 0.1, 0.5, 1],  # Independent term in kernel (only for 'poly' and 'sigmoid')
   #'shrinking': [True, False],  # Whether to use the shrinking heuristic
   #'tol': [1e-3, 1e-4],  # Tolerance for stopping criterion
   #'max_iter': [1000, 5000, -1]  # Maximum number of iterations (-1 means no limit)
}

In [8]:
# Set up GridSearchCV
gridSearchCV = GridSearchCV(estimator=SVR(), param_grid=param_grid, refit=True, cv=5, n_jobs=-1, verbose=3)

In [9]:
gridSearchCV.fit(X_train, y_train)
#gridSearchCV.fit(independent, dependent)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [10]:
#print best parameter after tuning
print(gridSearchCV.best_params_)

{'C': 3000, 'gamma': 'scale', 'kernel': 'poly'}


In [11]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age: 32
BMI: 43
Children: 2
Sex Male 0 or 1: 0
Smoker Yes 0 or 1: 1


In [12]:
Future_Prediction=gridSearchCV.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
#change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[3316415.72004342]
