In [1]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
# Read seaborn's example "tips" dataset into a DataFrame and preview its first five rows
data = sns.load_dataset("tips")
data.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Holdout partition in two steps: first reserve 20% of the rows as the test set,
# then carve 20% of the remaining rows out as the validation set.
# random_state is fixed so the same partition is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[['total_bill', 'size', 'sex', 'smoker', 'day', 'time']],
    data['tip'],
    test_size = 0.2,
    random_state = 1,
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size = 0.2,
    random_state = 1,
)

In [4]:
# Explore training set
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" to the output.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 112 to 84
Data columns (total 6 columns):
total_bill    156 non-null float64
size          156 non-null int64
sex           156 non-null category
smoker        156 non-null category
day           156 non-null category
time          156 non-null category
dtypes: category(4), float64(1), int64(1)
memory usage: 4.7 KB
None


In [5]:
# Explore validation set
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" to the output.
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39 entries, 64 to 79
Data columns (total 6 columns):
total_bill    39 non-null float64
size          39 non-null int64
sex           39 non-null category
smoker        39 non-null category
day           39 non-null category
time          39 non-null category
dtypes: category(4), float64(1), int64(1)
memory usage: 1.5 KB
None


In [6]:
# Explore test set
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" to the output.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49 entries, 67 to 29
Data columns (total 6 columns):
total_bill    49 non-null float64
size          49 non-null int64
sex           49 non-null category
smoker        49 non-null category
day           49 non-null category
time          49 non-null category
dtypes: category(4), float64(1), int64(1)
memory usage: 1.8 KB
None


In [7]:
# Evaluate model with two predictors without test data
# Only the training and validation splits are used in this cell: the test split
# is deliberately not scored here, so it stays unseen while models are compared.
model = linear_model.LinearRegression().fit(X = X_train[['total_bill', 'size']], y = Y_train)
score_train = model.score(X = X_train[['total_bill', 'size']], y = Y_train) # R squared (training)
score_val = model.score(X = X_val[['total_bill', 'size']], y = Y_val) # R squared (validation)
print([score_train, score_val])

[0.4766128300729817, 0.2680741934173284]


In [8]:
# Evaluate model with all predictors without test data
# One-hot encode the categorical predictors; drop_first = True omits the first
# level of each variable. Only the training and validation splits are encoded
# and scored in this cell, so the test split stays unseen while models are compared.
X_train_dummy = pd.get_dummies(X_train, drop_first = True)
X_val_dummy = pd.get_dummies(X_val, drop_first = True)
model = linear_model.LinearRegression().fit(X = X_train_dummy, y = Y_train)
score_train = model.score(X = X_train_dummy, y = Y_train) # R squared (training)
score_val = model.score(X = X_val_dummy, y = Y_val) # R squared (validation)
print([score_train, score_val])

[0.4984906168226673, 0.1500730876358044]


In [9]:
# The model with two predictors has the higher validation-set R2, so it is the better of the two models

In [10]:
# Two-predictor linear regression: fit on the training split, then report
# R squared on the training, validation, and test splits (in that order)
model = linear_model.LinearRegression()
model.fit(X = X_train[['total_bill', 'size']], y = Y_train)
score_train, score_val, score_test = (
    model.score(X = features[['total_bill', 'size']], y = target)
    for features, target in ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
)
print([score_train, score_val, score_test])

[0.4766128300729817, 0.2680741934173284, 0.5025467454308667]


In [11]:
# Linear regression on every predictor: one-hot encode each split
# (first level of each categorical dropped), fit on the training split, then
# report R squared on the training, validation, and test splits (in that order)
X_train_dummy, X_val_dummy, X_test_dummy = (
    pd.get_dummies(split, drop_first = True)
    for split in (X_train, X_val, X_test)
)
model = linear_model.LinearRegression()
model.fit(X = X_train_dummy, y = Y_train)
score_train, score_val, score_test = (
    model.score(X = features, y = target)
    for features, target in ((X_train_dummy, Y_train), (X_val_dummy, Y_val), (X_test_dummy, Y_test))
)
print([score_train, score_val, score_test])

[0.4984906168226673, 0.1500730876358044, 0.45740339690612053]


In [12]:
# It's expected that the test-set R2 of the model with two predictors would be higher, as its validation-set R2 is higher

In [13]:
# Ridge regression (alpha = 1) on the one-hot encoded predictors:
# show the fitted coefficients and intercept, then R squared on the
# training, validation, and test splits (in that order)
model = linear_model.Ridge(alpha = 1)
model.fit(X = X_train_dummy, y = Y_train)
print(model.coef_)
print(model.intercept_)
score_train, score_val, score_test = (
    model.score(X = features, y = target)
    for features, target in ((X_train_dummy, Y_train), (X_val_dummy, Y_val), (X_test_dummy, Y_test))
)
print([score_train, score_val, score_test])

[ 0.08463047  0.22815647  0.16745141  0.33087101  0.25928626  0.16275647
 -0.09618461 -0.0213254 ]
0.35924435448807257
[0.4979572371027402, 0.15475384204753373, 0.46801325469326427]


In [14]:
# LASSO regression (alpha = 1) on the one-hot encoded predictors:
# show the fitted coefficients and intercept, then R squared on the
# training, validation, and test splits (in that order)
model = linear_model.Lasso(alpha = 1)
model.fit(X = X_train_dummy, y = Y_train)
print(model.coef_)
print(model.intercept_)
score_train, score_val, score_test = (
    model.score(X = features, y = target)
    for features, target in ((X_train_dummy, Y_train), (X_val_dummy, Y_val), (X_test_dummy, Y_test))
)
print([score_train, score_val, score_test])

[ 0.0833546  0.         0.         0.        -0.         0.
 -0.         0.       ]
1.2672429703113126
[0.44690685270800645, 0.24517250610740762, 0.48653991019691134]


In [15]:
# Elastic net (alpha = 1, l1_ratio = 0.5) on the one-hot encoded predictors:
# show the fitted coefficients and intercept, then R squared on the
# training, validation, and test splits (in that order)
model = linear_model.ElasticNet(alpha = 1, l1_ratio = 0.5)
model.fit(X = X_train_dummy, y = Y_train)
print(model.coef_)
print(model.intercept_)
score_train, score_val, score_test = (
    model.score(X = features, y = target)
    for features, target in ((X_train_dummy, Y_train), (X_val_dummy, Y_val), (X_test_dummy, Y_test))
)
print([score_train, score_val, score_test])

[ 0.08868425  0.          0.          0.         -0.          0.
 -0.          0.        ]
1.1600009252621706
[0.4517422247647733, 0.24602004020141358, 0.5058380902878925]


In [16]:
# Question 01: Using the holdout method, which model has the best performance?

In [17]:
# Build a 5-fold splitter (rows kept in their original order, no shuffling)
# and print the [training, test] row counts of each fold
folds = KFold(n_splits = 5, shuffle = False)
for fold_train, fold_test in folds.split(data):
    print([len(fold_train), len(fold_test)])

[195, 49]
[195, 49]
[195, 49]
[195, 49]
[196, 48]


In [18]:
# Cross-validated linear regression on total_bill and size:
# print the per-fold scores, then their mean and standard deviation
model = linear_model.LinearRegression()
scores = cross_val_score(
    estimator = model,
    X = data[['total_bill', 'size']],
    y = data['tip'],
    cv = folds,
)
print(scores)
print([scores.mean(), scores.std()])

[0.53404571 0.61085627 0.51123501 0.25355216 0.41897975]
[0.465733781303872, 0.12247807667862953]


In [19]:
# Cross-validated linear regression on every predictor: one-hot encode the
# full predictor table once (first level of each categorical dropped), then
# print the per-fold scores followed by their mean and standard deviation
data_dummy = pd.get_dummies(
    data[['total_bill', 'size', 'sex', 'smoker', 'day', 'time']],
    drop_first = True,
)
model = linear_model.LinearRegression()
scores = cross_val_score(
    estimator = model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
print(scores)
print([scores.mean(), scores.std()])

[0.52819606 0.59245311 0.40773384 0.20031853 0.37843167]
[0.42142664059440527, 0.13533264127084232]


In [20]:
# Cross-validated ridge regression (alpha = 1) on the one-hot encoded predictors:
# print the per-fold scores, then their mean and standard deviation
model = linear_model.Ridge(alpha = 1)
scores = cross_val_score(
    estimator = model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
print(scores)
print([scores.mean(), scores.std()])

[0.52856008 0.59381419 0.4110844  0.20285942 0.3799312 ]
[0.4232498560234852, 0.13474337569568162]


In [21]:
# Cross-validated LASSO regression (alpha = 1) on the one-hot encoded predictors:
# print the per-fold scores, then their mean and standard deviation
model = linear_model.Lasso(alpha = 1)
scores = cross_val_score(
    estimator = model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
print(scores)
print([scores.mean(), scores.std()])

[0.51231329 0.59998849 0.47054693 0.27016128 0.39496651]
[0.4495953011995966, 0.11150919601791943]


In [22]:
# Cross-validated elastic net (alpha = 1, l1_ratio = 0.5) on the one-hot encoded
# predictors: print the per-fold scores, then their mean and standard deviation
model = linear_model.ElasticNet(alpha = 1, l1_ratio = 0.5)
scores = cross_val_score(
    estimator = model,
    X = data_dummy,
    y = data['tip'],
    cv = folds,
)
print(scores)
print([scores.mean(), scores.std()])

[0.53054017 0.61071783 0.47411113 0.25806837 0.39915932]
[0.4545193629248307, 0.12018210339870257]


In [None]:
# Question 02: Using the cross-validation method, which model has the best performance?

In [None]:
from sklearn.model_selection import cross_val_score