# Support Vector Machine Algorithm -  insurance_charge_prediction
# Using Grid Search CV 
## MODEL CREATION PHASE

## read the dataset

In [1]:
import pandas as pd 
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [2]:
dataset = pd.get_dummies(dataset,drop_first=True,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
independent = dataset[['age', 'bmi', 'children',  'sex_male', 'smoker_yes' ]]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train and test

In [36]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

## model creation 

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
""" "kernel":["linear", "poly", "rbf", "sigmoid"],   """

# parameter grid for SVR model 
param_grid = {
                "kernel":["linear"],
                "C":[10,100,500,1000,2000,3000],
                "gamma": ["scale", "auto"]
             }
# model 
# model = SVR()
# model creation using grid search CV 
""" 
refit True - to select the best model 
"""
grid_model= GridSearchCV(SVR(),param_grid, refit=True, verbose=3, n_jobs=-1)
# grid_model.fit(independent, dependent)   # passing the input and output data

grid_model.fit(x_train, y_train)



Fitting 5 folds for each of 12 candidates, totalling 60 fits


  y = column_or_1d(y, warn=True)


## Grid Search CV results 

In [66]:
# result viewed in dictionary format
gridcv_result = grid_model.cv_results_
gridcv_result

{'mean_fit_time': array([0.34772229, 0.1795857 , 0.50296245, 0.33627744, 0.79130278,
        0.76552186, 1.2898294 , 1.23538642, 2.22269044, 2.18236637,
        3.95177469, 3.56553669]),
 'std_fit_time': array([0.10748336, 0.0516661 , 0.0879525 , 0.06086922, 0.03664506,
        0.020791  , 0.1406598 , 0.16006287, 0.37552414, 0.34245296,
        1.31648427, 1.18938406]),
 'mean_score_time': array([0.04892688, 0.02238727, 0.02562218, 0.02313538, 0.01853929,
        0.01892686, 0.01870227, 0.01897879, 0.02409325, 0.01871543,
        0.02145123, 0.02023387]),
 'std_score_time': array([0.01933014, 0.00326802, 0.00773425, 0.00497011, 0.00037934,
        0.00070072, 0.00042532, 0.00091818, 0.00916365, 0.0006074 ,
        0.00396384, 0.00361552]),
 'param_C': masked_array(data=[10, 10, 100, 100, 500, 500, 1000, 1000, 2000, 2000,
                    3000, 3000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        f

In [67]:
# result viewed in tabular format
gridcv_result_table = pd.DataFrame.from_dict(gridcv_result)
gridcv_result_table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.347722,0.107483,0.048927,0.01933,10,scale,linear,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",-0.100231,0.066477,-0.039783,-0.074946,-0.09013,-0.047723,0.060665,11
1,0.179586,0.051666,0.022387,0.003268,10,auto,linear,"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",-0.100231,0.066477,-0.039783,-0.074946,-0.09013,-0.047723,0.060665,11
2,0.502962,0.087952,0.025622,0.007734,100,scale,linear,"{'C': 100, 'gamma': 'scale', 'kernel': 'linear'}",0.486967,0.55178,0.527337,0.489634,0.460014,0.503147,0.03242,9
3,0.336277,0.060869,0.023135,0.00497,100,auto,linear,"{'C': 100, 'gamma': 'auto', 'kernel': 'linear'}",0.486967,0.55178,0.527337,0.489634,0.460014,0.503147,0.03242,9
4,0.791303,0.036645,0.018539,0.000379,500,scale,linear,"{'C': 500, 'gamma': 'scale', 'kernel': 'linear'}",0.579134,0.625155,0.615088,0.580795,0.547853,0.589605,0.027726,7
5,0.765522,0.020791,0.018927,0.000701,500,auto,linear,"{'C': 500, 'gamma': 'auto', 'kernel': 'linear'}",0.579134,0.625155,0.615088,0.580795,0.547853,0.589605,0.027726,7
6,1.289829,0.14066,0.018702,0.000425,1000,scale,linear,"{'C': 1000, 'gamma': 'scale', 'kernel': 'linear'}",0.590797,0.666805,0.636817,0.60769,0.571647,0.614751,0.033699,5
7,1.235386,0.160063,0.018979,0.000918,1000,auto,linear,"{'C': 1000, 'gamma': 'auto', 'kernel': 'linear'}",0.590797,0.666805,0.636817,0.60769,0.571647,0.614751,0.033699,5
8,2.22269,0.375524,0.024093,0.009164,2000,scale,linear,"{'C': 2000, 'gamma': 'scale', 'kernel': 'linear'}",0.59654,0.681624,0.704634,0.696767,0.656119,0.667137,0.038985,1
9,2.182366,0.342453,0.018715,0.000607,2000,auto,linear,"{'C': 2000, 'gamma': 'auto', 'kernel': 'linear'}",0.59654,0.681624,0.704634,0.696767,0.656119,0.667137,0.038985,1


In [68]:
# best parameters -(syntax: model.best_params_ )
grid_model.best_params_

{'C': 2000, 'gamma': 'scale', 'kernel': 'linear'}

In [69]:
#  best model 
grid_model.best_estimator_

## Model Accuracy 

In [74]:
""" 
MODEL ACCURACY Analysis:-
---- by passing 'independent and dependent' to the grid_model
parameter                                                      Accuracy 
{'C': 1000, 'gamma': 'scale', 'kernel': 'linear'} -      0.6937782148821102     ( comparatively best model )
{'C': 3000, 'gamma': 'auto', 'kernel': 'rbf'} -          0.015505864908097754
{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'} -        -0.10475314338568928

--- by passing 'train test split' to the grid_model 
parameter                                                      Accuracy 
{'C': 2000, 'gamma': 'scale', 'kernel': 'linear'} -      0.667136907660572     ( comparatively best model )
{'C': 3000, 'gamma': 'auto', 'kernel': 'rbf'} -          -0.021844607356429256
{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'} -        -0.10306385877865512
"""

# best accuracy score 
grid_model.best_score_

np.float64(0.667136907660572)

## save the model 

In [71]:
# import pickle 
# pickle.dump(regressor, open(" ","wb"))

In [72]:
future_prediction = grid_model.predict([[ 52, 30.200, 1, 1, 0 ]])
future_prediction



array([10220.32601213])