In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing   
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer # melakukan transformasi (fit transform = transformer)
import category_encoders as ce

# Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Utilities
import warnings
warnings.filterwarnings("ignore")
from sklearn.utils.testing import ignore_warnings

# Study Case 1
aplikasikan feature engineering yang sudah dipelajari ke dalam model
* Part 1 = Ridge 
>* one hot encoding = sex, smoker, time
>* binary encoding  = day
>* robust scaler    = total_bill
>* no treatment     = size

In [2]:
# Load seaborn's "tips" example dataset and display it.
tips = sns.load_dataset("tips")
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


## Data Preprocessing

In [3]:
# Column-wise preprocessing. Each step must be a (name, transformer, columns) tuple.
transformer = ColumnTransformer(
    transformers=[
        # One-hot encode the categorical columns, dropping the first level of each.
        ('encoder', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
        # Binary-encode the day of the week.
        ('binary', ce.BinaryEncoder(), ['day']),
        # Scale the bill with RobustScaler.
        ('robust', RobustScaler(), ['total_bill']),
    ],
    # Columns not listed above are passed through unchanged.
    remainder='passthrough',
)

## Data Splitting

In [4]:
# Split the frame into the regression target (tip) and the predictors (everything else).
y = tips['tip']
X = tips.drop(columns='tip')

In [5]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

## Data Transform

In [6]:
# Fit the preprocessing on the training split only, then apply the already-fitted
# transformer to the test split so no test-set statistics are used for fitting.
X_train_preprocessed = transformer.fit_transform(X_train)
X_test_preprocessed = transformer.transform(X_test)

Dikarenakan data sudah melalui preprocessing, kolom-kolomnya perlu diberi nama kembali.

In [7]:
# Wrap both transformed outputs in DataFrames so their columns can be named afterwards.
X_train_preprocessed, X_test_preprocessed = (
    pd.DataFrame(X_train_preprocessed),
    pd.DataFrame(X_test_preprocessed),
)

In [8]:
# Fitted one-hot encoder, looked up by its step name rather than by position.
transformer.named_transformers_['encoder']

OneHotEncoder(drop='first')

In [9]:
# Column names generated by the fitted one-hot encoder.
transformer.named_transformers_['encoder'].get_feature_names()

array(['x0_Male', 'x1_Yes', 'x2_Lunch'], dtype=object)

In [10]:
# Fitted binary encoder, looked up by its step name rather than by position.
transformer.named_transformers_['binary']

BinaryEncoder()

In [11]:
# Column names generated by the fitted binary encoder.
transformer.named_transformers_['binary'].get_feature_names()

['day_0', 'day_1', 'day_2']

In [12]:
# Column names in the order the transformer steps are declared:
# one-hot columns, binary-encoded columns, the scaled total_bill, then the
# passthrough remainder (size).
features = [
    *transformer.named_transformers_['encoder'].get_feature_names(),
    *transformer.named_transformers_['binary'].get_feature_names(),
    'total_bill',
    'size',
]

In [13]:
# Display the list of column names.
features

['x0_Male',
 'x1_Yes',
 'x2_Lunch',
 'day_0',
 'day_1',
 'day_2',
 'total_bill',
 'size']

In [14]:
# Label the columns of both preprocessed frames with the names in the `features` list,
# then display the training frame.
X_train_preprocessed.columns = X_test_preprocessed.columns = features
X_train_preprocessed

Unnamed: 0,x0_Male,x1_Yes,x2_Lunch,day_0,day_1,day_2,total_bill,size
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.316170,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0
...,...,...,...,...,...,...,...,...
178,1.0,0.0,0.0,0.0,0.0,1.0,0.007227,3.0
179,1.0,0.0,0.0,0.0,1.0,0.0,0.367660,2.0
180,1.0,0.0,0.0,0.0,0.0,1.0,-0.382114,2.0
181,0.0,0.0,1.0,0.0,1.0,1.0,1.110208,6.0


## Model Fitting and Evaluation

In [15]:
# Fit a Ridge regression (default hyperparameters) on the preprocessed training data.
# fit() returns the estimator itself, so it can be assigned directly and then displayed.
model = Ridge().fit(X_train_preprocessed, y_train)
model

Ridge()

In [16]:
# Predict on the held-out test set and report the mean squared error.
y_pred = model.predict(X_test_preprocessed)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('mse :', test_mse)

mse : 1.0577456219830779


In [17]:
# Pair every feature name with its fitted Ridge coefficient.
coef_table = pd.DataFrame(dict(features=features, coef=model.coef_))

In [18]:
# Display the coefficient table.
coef_table

Unnamed: 0,features,coef
0,x0_Male,-0.219943
1,x1_Yes,-0.071079
2,x2_Lunch,0.116938
3,day_0,0.031693
4,day_1,-0.00246
5,day_2,-0.061386
6,total_bill,1.086144
7,size,0.17289


# Conclusion
* Pria memberikan tips 0.21 lebih rendah dari wanita
* Smoker memberikan tips lebih rendah 0.07 dibandingkan non smoker
* Pelanggan yang makan siang memberikan tips lebih banyak 0.11 daripada Pelanggan yang makan malam
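* total_bill sudah diskalakan dengan RobustScaler, sehingga koefisiennya bukan per 1 dollar: setiap kenaikan total bill sebesar 1 satuan hasil scaling (yaitu 1 IQR total_bill pada data train), tips yang diberikan naik sekitar 1.09 dollar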
* size setiap naik 1,  tips naik 0.17 dollar