# Feature Engineering in Linear Model
Data : tips from seaborn
Target : tip
Preprocessing :
   1. One hot encoding : sex, smoker, time
   2. Binary encoding : day
   3. Robust scaler : total_bill
   4. No treatment : size       
   
Random state 10, data splitting 70 : 30, model Ridge default

> ## Library

In [1]:
!pip install category_encoders



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, OneHotEncoder
import category_encoders as ce
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

> ## Data

In [3]:
tips = sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


> ## One Hot Encoding

In [4]:
onehot = OneHotEncoder()

transformer = ColumnTransformer([
    ('onehot', onehot, ['sex', 'smoker', 'time'])
])

tips_encoded = pd.DataFrame(transformer.fit_transform(tips))
tips_encoded.columns = transformer.get_feature_names()
tips_encoded 

Unnamed: 0,onehot__x0_Female,onehot__x0_Male,onehot__x1_No,onehot__x1_Yes,onehot__x2_Dinner,onehot__x2_Lunch
0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,1.0,0.0
241,0.0,1.0,0.0,1.0,1.0,0.0
242,0.0,1.0,1.0,0.0,1.0,0.0


> ## Binary Encoder

In [5]:
binary_encoder = ce.BinaryEncoder()

transformer = ColumnTransformer([
    ('binary encoding', binary_encoder, ['day'])
])

tips_binary = pd.DataFrame(transformer.fit_transform(tips))
tips_binary['day'] = tips['day']
tips_binary.drop_duplicates()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,0,1,2,day
0,0,0,1,Sun
19,0,1,0,Sat
77,0,1,1,Thur
90,1,0,0,Fri


> ## Preprocessing Scheme

In [23]:
onehot = OneHotEncoder(drop = 'first') # Linear Model (k-1): Regression and Logistic Regression
# onehot = OneHotEncoder() # k-dummy : tree, random forest, boosting

transformer = ColumnTransformer([
    ('onehot', onehot, ['sex', 'smoker', 'time']),
    ('binary encoding', binary_encoder, ['day']),
    ('robust scaler', RobustScaler(), ['total_bill'])
], remainder = 'passthrough')

transformer.fit_transform(tips)

  elif pd.api.types.is_categorical(cols):


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -7.46753247e-02,  1.01000000e+00,  2.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -6.91558442e-01,  1.66000000e+00,  3.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         2.98237477e-01,  3.50000000e+00,  3.00000000e+00],
       ...,
       [ 1.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         4.52226345e-01,  2.00000000e+00,  2.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         2.31910946e-03,  1.75000000e+00,  2.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         9.13729128e-02,  3.00000000e+00,  2.00000000e+00]])

> ## Data Splitting

In [24]:
var = ['sex', 'smoker', 'time', 'day', 'total_bill', 'size']

x = tips[var]
y = tips['tip']

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 10)

> ## Preprocess Fitting

In [26]:
x_train_preprocess = transformer.fit_transform(x_train)
x_test_preprocess = transformer.fit_transform(x_test)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [27]:
x_train_preprocess = pd.DataFrame(x_train_preprocess)
x_test_preprocess = pd.DataFrame(x_test_preprocess)
x_train_preprocess

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.316170,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0
...,...,...,...,...,...,...,...,...
178,1.0,0.0,0.0,0.0,0.0,1.0,0.007227,3.0
179,1.0,0.0,0.0,0.0,1.0,0.0,0.367660,2.0
180,1.0,0.0,0.0,0.0,0.0,1.0,-0.382114,2.0
181,0.0,0.0,1.0,0.0,1.0,1.0,1.110208,6.0


In [28]:
transformer.transformers_[0][1].get_feature_names()

array(['x0_Male', 'x1_Yes', 'x2_Lunch'], dtype=object)

In [29]:
transformer.transformers_[1][1].get_feature_names()

['day_0', 'day_1', 'day_2']

In [30]:
feature_names = list(transformer.transformers_[0][1].get_feature_names()) + list(transformer.transformers_[1][1].get_feature_names()) + ['total, bill', 'size']
x_train_preprocess.columns = feature_names
x_test_preprocess.columns = feature_names
x_train_preprocess

Unnamed: 0,x0_Male,x1_Yes,x2_Lunch,day_0,day_1,day_2,"total, bill",size
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.316170,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0
...,...,...,...,...,...,...,...,...
178,1.0,0.0,0.0,0.0,0.0,1.0,0.007227,3.0
179,1.0,0.0,0.0,0.0,1.0,0.0,0.367660,2.0
180,1.0,0.0,0.0,0.0,0.0,1.0,-0.382114,2.0
181,0.0,0.0,1.0,0.0,1.0,1.0,1.110208,6.0


**k-category = k - 1**

> ## Modeling

In [31]:
model = Ridge()
model.fit(x_train_preprocess, y_train)

Ridge()

In [32]:
y_pred = model.predict(x_test_preprocess)
mean_squared_error(y_test, y_pred)

1.0846156956704367

> ## Coef

In [33]:
feature_names

['x0_Male',
 'x1_Yes',
 'x2_Lunch',
 'day_0',
 'day_1',
 'day_2',
 'total, bill',
 'size']

In [34]:
model.coef_

array([-0.21994306, -0.07107905,  0.11693755,  0.03169306, -0.00246008,
       -0.0613861 ,  1.08614386,  0.1728903 ])

In [35]:
pd.DataFrame({
    'var':feature_names,
    'coef':model.coef_
})

Unnamed: 0,var,coef
0,x0_Male,-0.219943
1,x1_Yes,-0.071079
2,x2_Lunch,0.116938
3,day_0,0.031693
4,day_1,-0.00246
5,day_2,-0.061386
6,"total, bill",1.086144
7,size,0.17289
