# Support Vector Regression (SVR)

In [92]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import numpy as np

In [93]:
# Importing the dataset
df = pd.read_csv('insurance_.csv')

In [94]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0.0,yes,southwest,16884.924
1,18,male,33.77,1.0,no,southeast,1725.5523
2,28,male,33.0,3.0,no,southeast,4449.462
3,33,male,22.705,0.0,no,northwest,21984.47061
4,32,male,28.88,0.0,no,northwest,3866.8552


In [95]:
df.isnull().sum()

age         0
sex         0
bmi         3
children    1
smoker      0
region      0
charges     4
dtype: int64

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1335 non-null   float64
 3   children  1337 non-null   float64
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1334 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 73.3+ KB


In [97]:
# Handling missing values
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = df.select_dtypes(include="object").columns

In [98]:
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode())

In [99]:
# Converting object columns to numerical using OneHotEncoder
encoder = OneHotEncoder()
encoded_cols = encoder.fit_transform(df[categorical_cols]).toarray()

In [100]:
feature_names = encoder.get_feature_names_out(categorical_cols)

In [101]:
df = pd.concat([df[numerical_cols], pd.DataFrame(encoded_cols, columns=feature_names)], axis=1)

In [102]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.20, random_state= 1)

In [103]:
x

array([[-1.01, -0.45, -0.91,  1.97],
       [ 0.99,  0.51, -0.08, -0.51],
       [ 0.99,  0.39,  1.58, -0.51],
       ...,
       [-1.01,  1.02, -0.91, -0.51],
       [-1.01, -0.8 , -0.91, -0.51],
       [-1.01, -0.26, -0.91,  1.97]])

In [104]:
y

array([[ 0.3 ],
       [-0.96],
       [-0.73],
       ...,
       [-0.96],
       [-0.93],
       [ 1.31]])

In [105]:
y = y.reshape(len(y),1)

In [106]:
y

array([[ 0.3 ],
       [-0.96],
       [-0.73],
       ...,
       [-0.96],
       [-0.93],
       [ 1.31]])

In [107]:
# Feature Scaling
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y)

In [108]:
print("sc_x.fit_transform(x):\n", sc_x.fit_transform(x))
print("\nsc_y.fit_transform(y):\n", sc_y.fit_transform(y))

sc_x.fit_transform(x):
 [[-1.01 -0.45 -0.91  1.97]
 [ 0.99  0.51 -0.08 -0.51]
 [ 0.99  0.39  1.58 -0.51]
 ...
 [-1.01  1.02 -0.91 -0.51]
 [-1.01 -0.8  -0.91 -0.51]
 [-1.01 -0.26 -0.91  1.97]]

sc_y.fit_transform(y):
 [[ 0.3 ]
 [-0.96]
 [-0.73]
 ...
 [-0.96]
 [-0.93]
 [ 1.31]]


In [109]:
# Training the SVR model on the whole dataset
regressor = SVR(kernel = 'rbf')
regressor.fit(x, y)

  y = column_or_1d(y, warn=True)


In [110]:
# Predicting a new result
# Create a 2D array with 4 features
input_data = np.array([[6.5, 3.2, 4.1, 1.3]])
# Predict using the regressor
predicted_value = regressor.predict(sc_x.transform(input_data))
# Inverse transform the predicted value
inverse_transformed_value = sc_y.inverse_transform(predicted_value.reshape(-1, 1))
print(inverse_transformed_value)

[[0.33]]


In [111]:
# Predicting the Test set results
y_pred = sc_y.inverse_transform(regressor.predict(sc_x.transform(x_test)).reshape(-1,1))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[-3.94e-01 -9.62e-01]
 [-5.38e-01 -1.60e-01]
 [-3.52e-01 -3.71e-01]
 [-3.04e-01 -2.40e-01]
 [-6.20e-01 -9.24e-01]
 [ 2.21e+00  2.10e+00]
 [-4.29e-01 -3.29e-01]
 [-5.66e-01 -1.34e-01]
 [-7.22e-01 -8.44e-01]
 [ 7.13e-01  5.17e-01]
 [-6.02e-01 -5.41e-02]
 [-2.83e-01 -1.44e-01]
 [-4.65e-01 -5.74e-01]
 [-5.09e-01 -5.15e-01]
 [-4.31e-01 -1.00e+00]
 [-4.81e-01 -3.57e-01]
 [-7.99e-01  7.19e-01]
 [-2.85e-01 -5.68e-01]
 [-4.60e-01  1.24e+00]
 [-5.29e-01  1.47e-02]
 [-4.74e-01 -2.94e-01]
 [ 2.39e+00  2.29e+00]
 [-5.73e-01 -4.35e-01]
 [-7.06e-01 -4.00e-01]
 [-5.53e-01 -9.16e-01]
 [-5.73e-01 -5.47e-01]
 [-4.66e-01 -3.87e-01]
 [-5.28e-01 -4.13e-01]
 [-3.57e-01 -6.54e-01]
 [-7.37e-01 -7.97e-01]
 [-4.60e-01 -1.33e-01]
 [-4.84e-01 -1.28e-01]
 [ 5.41e-01  9.58e-01]
 [ 1.68e+00  1.69e+00]
 [ 9.15e-01  9.01e-01]
 [-7.64e-01 -2.83e-01]
 [ 2.12e+00  1.95e+00]
 [ 4.75e-01  3.85e-01]
 [-5.18e-01 -1.18e-01]
 [ 1.63e+00  2.19e+00]
 [-3.57e-01 -7.38e-01]
 [-4.32e-01 -1.27e-01]
 [-2.84e-01  5.34e-01]
 [-4.50e-01

In [112]:
# Evaluating the Model Performance
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.7607551627297912