In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Load seaborn's "tips" example dataset and display it for a first look.
df = sns.load_dataset(name="tips")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


data : tips

target : tip

preprocess:
1. one hot encoding : sex, smoker, time
2. binary encoding : day
3. robust scaler : total_bill
4. no treatment : size
Settings: random state 10; train/test split 70:30; model: Ridge with default parameters.


In [3]:
# Dummy-encode the categorical columns. drop_first=True drops the first level
# of each column, so a two-level column becomes a single indicator.
# NOTE(review): the preprocessing notes ask for binary encoding of `day`, but
# it is one-hot encoded here together with the other columns — confirm intent.
enc_df = pd.get_dummies(
    data=df[["sex", "smoker", "time", "day"]],
    drop_first=True,
)
enc_df

Unnamed: 0,sex_Female,smoker_No,time_Dinner,day_Fri,day_Sat,day_Sun
0,True,True,True,False,False,True
1,False,True,True,False,False,True
2,False,True,True,False,False,True
3,False,True,True,False,False,True
4,True,True,True,False,False,True
...,...,...,...,...,...,...
239,False,True,True,False,True,False
240,True,False,True,False,True,False
241,False,False,True,False,True,False
242,False,True,True,False,True,False


In [4]:
# Place the numeric columns and the dummy columns side by side; concat aligns
# the two frames on their row index.
clean_df = pd.concat(
    objs=[df[["total_bill", "tip", "size"]], enc_df],
    axis="columns",
)
clean_df

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,time_Dinner,day_Fri,day_Sat,day_Sun
0,16.99,1.01,2,True,True,True,False,False,True
1,10.34,1.66,3,False,True,True,False,False,True
2,21.01,3.50,3,False,True,True,False,False,True
3,23.68,3.31,2,False,True,True,False,False,True
4,24.59,3.61,4,True,True,True,False,False,True
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,False,True,True,False,True,False
240,27.18,2.00,2,True,False,True,False,True,False
241,22.67,2.00,2,False,False,True,False,True,False
242,17.82,1.75,2,False,True,True,False,True,False


In [5]:
# The target is the tip amount; every remaining column is a feature.
y = clean_df["tip"]
X = clean_df.drop(columns=["tip"])

In [6]:
# Hold out 30% of the rows for testing. The seed is fixed at 10, as the task
# notes require, so the split (and every number computed from it) is
# reproducible after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [7]:
# Fit the scaler on the training rows only, so no statistics from the test
# rows enter the preprocessing step.
rb = RobustScaler()
rb.fit(X=X_train[["total_bill"]])

In [8]:
# Scale total_bill with the scaler that was fitted on the training rows.
#
# The scaled values are always recomputed from the untouched feature frame X
# (rows looked up by index label; X's index is unique), so re-running this
# cell does not scale an already-scaled column a second time.
# .assign() rebinds each name to a new frame instead of writing into the
# train_test_split output in place, which can raise SettingWithCopyWarning.
X_train = X_train.assign(
    total_bill=rb.transform(X.loc[X_train.index, ["total_bill"]]).ravel()
)
X_test = X_test.assign(
    total_bill=rb.transform(X.loc[X_test.index, ["total_bill"]]).ravel()
)

In [9]:
# Inspect the training features; total_bill now holds the scaler's output.
X_train

Unnamed: 0,total_bill,size,sex_Female,smoker_No,time_Dinner,day_Fri,day_Sat,day_Sun
69,-0.254450,2,False,False,True,False,True,False
79,-0.054054,2,False,True,False,False,False,False
8,-0.251813,2,False,True,True,False,False,True
105,-0.223687,2,False,False,True,False,True,False
230,0.536585,4,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...
145,-0.839815,2,True,True,False,False,False,False
14,-0.270270,2,True,True,True,False,False,True
206,0.763349,3,False,False,True,False,True,False
7,0.788838,4,False,True,True,False,False,True


In [10]:
# Summary statistics of the training features. Only the numeric columns
# (total_bill, size) appear; the boolean dummy columns are not summarized.
X_train.describe()

Unnamed: 0,total_bill,size
count,170.0,170.0
mean,0.172694,2.529412
std,0.783991,0.88496
min,-1.303889,1.0
25%,-0.404526,2.0
50%,0.0,2.0
75%,0.595474,3.0
max,2.674138,6.0


In [11]:
# Same summary for the test features. They were scaled with the scaler fitted
# on the training rows, so the total_bill median need not be exactly 0 here.
X_test.describe()

Unnamed: 0,total_bill,size
count,74.0,74.0
mean,0.148384,2.662162
std,0.783996,1.088886
min,-1.068337,1.0
25%,-0.382773,2.0
50%,-0.065041,2.0
75%,0.353329,3.0
max,2.892112,6.0


In [12]:
# Ridge regression with scikit-learn's default hyperparameters (no arguments passed).
ridge = Ridge() 

In [13]:
# Train on the training split, then predict tips for the held-out rows.
# fit() returns the fitted estimator, so the two calls can be chained.
y_pred = ridge.fit(X_train, y_train).predict(X_test)

In [14]:
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the value is the same either way, but the conventional order keeps this
# cell correct if the metric is later replaced by an asymmetric one.
mean_squared_error(y_test, y_pred)

1.233079189092444