In [1]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
# Load dataset
# Fetch the "tips" example dataset through seaborn's dataset loader and keep it as `data`;
# later cells use `tip` as the response and the remaining columns as predictors.
data = sns.load_dataset("tips")
# Preview the first rows as a sanity check on the columns.
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Partition dataset into training, validation, and test sets using the holdout method.
# Step 1: hold out 10% of all rows as the test set; the remaining 90% is X_train1 / Y_train1.
X_train1, X_test, Y_train1, Y_test = train_test_split(
    data[[
        'total_bill', 'size', 'sex', 'smoker', 'day', 'time',
    ]],
    data['tip'],
    test_size=0.1,
    random_state=1,
)
# Step 2: split that remainder again, reserving 20% of it for validation.
# The same fixed seed is used for both splits so the partition is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train1,
    Y_train1,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Explore training set
# DataFrame.info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line after the report.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175 entries, 42 to 169
Data columns (total 6 columns):
total_bill    175 non-null float64
size          175 non-null int64
sex           175 non-null category
smoker        175 non-null category
day           175 non-null category
time          175 non-null category
dtypes: category(4), float64(1), int64(1)
memory usage: 5.3 KB
None


In [5]:
# # Explore validation set
# print(X_val.info())

In [6]:
# # Explore test set
# print(X_test.info())

In [7]:
# One-hot encode the categorical predictors of every partition, dropping the first level
# of each factor (drop_first=True) so the dummy columns are not collinear with the intercept.
# NOTE(review): each partition is encoded independently. The info() output above shows
# sex/smoker/day/time as `category` dtype, which presumably makes get_dummies emit a column
# for every declared category even when a partition lacks a level -- confirm the encoded
# frames keep identical columns if those dtypes ever change.
X_train_dummy1, X_train_dummy, X_val_dummy, X_test_dummy = (
    pd.get_dummies(partition, drop_first=True)
    for partition in (X_train1, X_train, X_val, X_test)
)

In [8]:
# Evaluate an ordinary least-squares model that uses all predictors.
# It is fit on the full non-test portion (X_train_dummy1 / Y_train1) and then scored on
# both that portion and the held-out test set.
model = linear_model.LinearRegression()
model.fit(X_train_dummy1, Y_train1)

# R squared on the data the model was fit on
score_train = model.score(X_train_dummy1, Y_train1)
# R squared on the held-out test set
score_test = model.score(X_test_dummy, Y_test)

print([score_train, score_test])

[0.45710450571317474, 0.48046458527769653]


In [9]:
# Evaluate a Ridge regression (alpha = 1) with a single holdout comparison:
# fit on the full non-test portion, then report the fitted parameters and R squared.
model = linear_model.Ridge(alpha=1)
model.fit(X_train_dummy1, Y_train1)

# Fitted coefficients (one per encoded predictor column) and intercept
print(model.coef_)
print(model.intercept_)

# R squared on the data the model was fit on
score_train = model.score(X_train_dummy1, Y_train1)
# R squared on the held-out test set
score_test = model.score(X_test_dummy, Y_test)

print([score_train, score_test])

[ 0.08597291  0.20341347  0.07787275  0.15903013  0.138513    0.11432336
  0.12431923 -0.12839129]
0.6074635821923469
[0.4569212652667636, 0.4831945651610725]


In [10]:
# Evaluate the Ridge model (alpha = 1) again, this time with 5-fold cross-validation
# on the non-test portion instead of a single training-set score.
model = linear_model.Ridge(alpha=1)

# One score per fold, followed by their mean and standard deviation
scores = cross_val_score(model, X_train_dummy1, Y_train1, cv=5)
print(scores)
print([scores.mean(), scores.std()])

# Fit on the whole non-test portion and score once on the held-out test set
model.fit(X=X_train_dummy1, y=Y_train1)
scores_test = model.score(X=X_test_dummy, y=Y_test)
print(scores_test)

[0.31954276 0.34568096 0.17088526 0.56029296 0.53194506]
[0.3856694003812627, 0.14421788433670443]
0.4831945651610725
