# Support Vector Regression

In [83]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [84]:
import seaborn as sns  # NOTE(review): already imported in the first cell; redundant here
# Load the 'tips' example dataset through seaborn and display the resulting DataFrame.
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [85]:
# Summarise column dtypes and non-null counts to check for missing values
# and to see which columns are categorical.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [86]:
# List the column names that the feature/target selection refers to.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [87]:
# Separate the independent features (X) from the dependent target (Y).
# The actual train/test split is performed in the following cell.
X = df.loc[:, ['tip', 'sex', 'smoker', 'day', 'time', 'size']]
Y = df.loc[:, 'total_bill']  # a Series, because it is the single dependent feature

In [88]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

## Feature encoding (Label and OHE Encoding)

### Why Train-Test Split is Important Before Encoding:
By splitting the dataset first (using `train_test_split`), we ensure that the test data remains **unseen** during feature engineering and training. Then, we apply encoding separately:

- **Training Set (`X_train`)**:
  - Fit and transform encoders (e.g., `LabelEncoder`, `OneHotEncoder`) on the training data. 
  - This ensures the model learns from patterns present in the training set only.

- **Test Set (`X_test`)**:
  - Use the already fitted encoders (from the training set) to transform the test data.
  - **Do not fit** encoders on the test data to avoid peeking into the test set.

---

### If you fit the encoders before splitting (e.g. during EDA), `train_test_split` **cannot undo the leakage that encoding has already introduced**. Here's why:

- When encoding before splitting, all rows (training and test) are processed together. 
- This makes test data patterns influence the encoding process, causing **data leakage**.


In [89]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [90]:
# One independent LabelEncoder per column to be encoded (sex, smoker, time),
# so each keeps its own fitted mapping for reuse on the test split.
le1, le2, le3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

In [91]:
# 'sex', 'smoker' and 'time' are binary features, so a label encoder (one 0/1
# column per feature) is sufficient. Each encoder is fitted on the training
# split only.
# .assign() builds a new DataFrame instead of writing columns into the frame
# returned by train_test_split; that in-place write can raise pandas'
# SettingWithCopyWarning, which the warnings filter at the top of the notebook
# would otherwise hide.
X_train = X_train.assign(
    sex=le1.fit_transform(X_train['sex']),
    smoker=le2.fit_transform(X_train['smoker']),
    time=le3.fit_transform(X_train['time']),
)

In [92]:
# Check the encoded training features: sex, smoker and time should now be integer codes.
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
234,3.0,1,1,Sat,0,2
227,3.0,1,0,Sat,0,4
180,3.68,1,1,Sun,0,4
5,4.71,1,0,Sun,0,4
56,3.0,1,1,Sat,0,4


In [93]:
# On the test split we call transform() only (never fit_transform()), reusing
# the mappings learned from the training data to prevent data leakage.
# .assign() builds a new DataFrame instead of writing columns into the frame
# returned by train_test_split; that in-place write can raise pandas'
# SettingWithCopyWarning, which the warnings filter at the top of the notebook
# would otherwise hide.
X_test = X_test.assign(
    sex=le1.transform(X_test['sex']),
    smoker=le2.transform(X_test['smoker']),
    time=le3.transform(X_test['time']),
)

In [94]:
# Check that the test features were encoded with the same integer codes as the training set.
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


### OHE for day column

In [95]:
# 'day' has more than two categories, so it is one-hot encoded rather than label encoded.
df['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [96]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

### Why `drop='first'`?

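- One-hot encoding a feature with *k* categories produces *k* columns that always sum to 1, so any one of them can be derived from the others. This redundancy (the "dummy variable trap") makes the columns perfectly collinear.
- By **dropping the first category** (using `drop='first'`), we eliminate this redundancy.
- For a `color` feature with the values `Red`, `Green` and `Blue`, dropping one category (say `color_Red`) means we only keep two columns: `color_Green` and `color_Blue`. (scikit-learn's `drop='first'` picks the first category in sorted order, so in practice it would drop `color_Blue`; the idea is the same.)
- Now, if `color_Green = 1`, the model **knows** that `color_Red = 0` and `color_Blue = 0`, and vice versa; if both remaining columns are 0, the row must be `Red`.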

In simpler terms:
- Dropping the first category allows the model to learn the information from the other columns while avoiding perfect correlation between the columns, which helps the model perform better.

### Example:

Suppose you have a column `color` with values: `['Red', 'Green', 'Blue']`.

With one-hot encoding, this becomes:

| Red | Green | Blue |
|-----|-------|------|
| 1   | 0     | 0    |
| 0   | 1     | 0    |
| 0   | 0     | 1    |

Now, if you drop the first column (say, `Red`):

| Green | Blue |
|-------|------|
| 0     | 0    |
| 1     | 0    |
| 0     | 1    |

Here, you still capture all the information (a row of all zeros can only mean `Red`), but without the multicollinearity issue.

### Summary:
- Dropping the first category is done to avoid redundancy and improve the model's performance by removing the risk of multicollinearity.
- This is a standard practice, especially when dealing with categorical features with more than two categories.

In [97]:
# One-hot encode the 'day' column and pass every other column through unchanged.
# The column is selected by name rather than by position ([3]) so the encoder
# keeps targeting 'day' even if the feature columns are reordered or extended.
# Name-based selection requires DataFrame input, which is what this notebook
# passes to fit_transform()/transform().
# drop='first' removes one indicator column per feature to avoid a redundant
# (perfectly collinear) dummy column.
ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), ['day'])
    ],
    remainder='passthrough'  # Keep other columns as they are
)

In [98]:
import sys
import numpy as np
# Lift NumPy's print-truncation threshold so arrays are printed in full.
# NOTE(review): this is a global setting and affects every later array display.
np.set_printoptions(threshold=sys.maxsize)
# Apply fit_transform to X_train: the transformer is fitted on the training data only.
# NOTE(review): X_train is rebound from the encoded DataFrame to the transformer's
# output, so re-running this cell without first re-running the split/encoding
# cells feeds already-transformed data back into `ct`.
X_train = ct.fit_transform(X_train)

In [99]:
# Preview the first five transformed training rows (one-hot columns come first).
X_train[:5]

array([[1.  , 0.  , 0.  , 3.  , 1.  , 1.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 3.  , 1.  , 0.  , 0.  , 4.  ],
       [0.  , 1.  , 0.  , 3.68, 1.  , 1.  , 0.  , 4.  ],
       [0.  , 1.  , 0.  , 4.71, 1.  , 0.  , 0.  , 4.  ],
       [1.  , 0.  , 0.  , 3.  , 1.  , 1.  , 0.  , 4.  ]])

In [100]:
# Reuse the transformer fitted on the training data; only transform() is called
# here, so nothing is learned from the test split.
X_test = ct.transform(X_test)

In [101]:
# Preview the first five transformed test rows; the column layout should match X_train.
X_test[:5]

array([[1.  , 0.  , 0.  , 3.18, 1.  , 0.  , 0.  , 2.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 0.  , 0.  , 2.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 0.  , 0.  , 4.  ],
       [1.  , 0.  , 0.  , 5.16, 1.  , 1.  , 0.  , 4.  ],
       [0.  , 0.  , 1.  , 2.  , 0.  , 1.  , 1.  , 2.  ]])

### Algorithm - SVR

In [107]:
from sklearn.svm import SVR

# Fit a support vector regressor with scikit-learn's default hyperparameters
# on the training data, then predict the target for the held-out rows.
# NOTE(review): the features are not scaled before fitting; SVR is generally
# sensitive to feature scale, so a scaler may improve results - TODO confirm.
svr = SVR().fit(X_train, Y_train)
y_pred = svr.predict(X_test)

In [109]:
from sklearn.metrics import r2_score, mean_absolute_error

# Report test-set performance: R^2 (higher is better) and mean absolute error
# in the target's units (lower is better).
for metric_name, metric_fn in (('R2 Score', r2_score), ('MAE', mean_absolute_error)):
    print(metric_name, metric_fn(Y_test, y_pred))

R2 Score 0.496241694353699
MAE 4.175959078886017


# Perform Hyperparameter Tuning using GridSearchCV and Check the Metrics