# Model Selection: Leave-One-Out

#### Jessica Ji

In [3]:
import numpy as np
import scipy
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict, LeaveOneOut
from sklearn.metrics import r2_score

## The Data: Bike Sharing

In [10]:
bike = pd.read_csv(('data/Bike-Sharing-Dataset/day.csv'))

# reformat the date column to integers representing the day of the year, 001-366
bike['dteday'] = pd.to_datetime(bike['dteday']).dt.strftime('%j')

# # get rid of the index column
bike = bike.drop("instant", axis=1)

# the features used to predict riders
X = bike.drop(columns=['casual', 'registered', 'cnt'])

# the number of riders
y = bike['cnt']

# number of rows
n = len(X)

## 1. Leave-One-Out (LOO)

In [5]:
loo = LeaveOneOut()

### LOO: Linear

In [15]:
# Store the y-test and y-predicted datasets for each split
y_test_sets = []
y_pred_sets = []

# each iteration will have one test index and n-1 train indices
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # construct and fit model for each training set
    model = LinearRegression()
    model.fit(X_train, y_train) 
    
    # make prediction
    y_pred = model.predict(X_test)
        
    # add to lists of test y and predicted y    
    y_test_sets += list(y_test)
    y_pred_sets += list(y_pred)
        
# calculate r-squared error metric
r2 = r2_score(y_test_sets, y_pred_sets)
print("Linear LOO r-squared: " + str(r2))

Linear LOO r-squared: 0.7895324245718587


### LOO: Ridge

In [16]:
# Store the y-test and y-predicted datasets for each split
y_test_sets = []
y_pred_sets = []

# each iteration will have one test index and n-1 train indices
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # construct and fit model for each training set
    model = Ridge()
    model.fit(X_train, y_train) 
    
    # make prediction
    y_pred = model.predict(X_test)
        
    # add to lists of test y and predicted y    
    y_test_sets += list(y_test)
    y_pred_sets += list(y_pred)
        
# calculate r-squared error metric
r2 = r2_score(y_test_sets, y_pred_sets)
print("Ridge LOO r-squared: " + str(r2))

Ridge LOO r-squared: 0.7932464176720754


### LOO: LASSO

In [17]:
# Store the y-test and y-predicted datasets for each split
y_test_sets = []
y_pred_sets = []

# each iteration will have one test index and n-1 train indices
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # construct and fit model for each training set
    model = Lasso()
    model.fit(X_train, y_train) 
    
    # make prediction
    y_pred = model.predict(X_test)
        
    # add to lists of test y and predicted y    
    y_test_sets += list(y_test)
    y_pred_sets += list(y_pred)
        
# calculate r-squared error metric
r2 = r2_score(y_test_sets, y_pred_sets)
print("Lasso LOO r-squared: " + str(r2))

Lasso LOO r-squared: 0.7917513496227154


## 2. Cross-Validation

In [19]:
# set the random seed

np.random.seed(10)

# split the data
# train_test_split returns 4 values: X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.80, test_size=0.20)

# split the data
# Returns 4 values: X_train, X_validate, y_train, y_validate

X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train,
                                                    train_size=0.75, test_size=0.25)

In [20]:
# Linear Regression

## Create lin_reg method and fit model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Ridge

## Create ridge_reg method and fit model
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)

# LASSO

## Create lasso_reg and fit
lasso_reg = Lasso(max_iter=10000)  
lasso_model = lasso_reg.fit(X_train, y_train)

In [21]:
lin_predicted = cross_val_predict(lin_reg, X_train, y_train, cv=3)
lin_r2 = r2_score(lin_predicted, y_train)
print("Linear CV r-squared: " + str(lin_r2))

Linear CV r-squared: 0.7414435356524711


In [22]:
ridge_predicted = cross_val_predict(ridge_reg, X_train, y_train, cv=3)
ridge_r2 = r2_score(ridge_predicted, y_train)
print("Ridge CV r-squared: " + str(ridge_r2))

Ridge CV r-squared: 0.7206179781527943


In [23]:
lasso_predicted = cross_val_predict(lasso_reg, X_train, y_train, cv=3)
lasso_r2 = r2_score(lasso_predicted, y_train)
print("LASSO CV r-squared: " + str(lasso_r2))

LASSO CV r-squared: 0.7376698155672519
