In [2]:
# import Libraries
import sys
from sklearn import linear_model
from matplotlib import pyplot as plt
from math import sqrt
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, LinearRegression
import numpy as np
import pandas as pd

In [3]:
# import/read the dodgers bobblehead data into a dataframe
# separate the dependent and indepedent variables as well
dodgers_train = pd.read_csv("dodgers_training.csv")
dodgers_train_X = dodgers_train[['month','day_of_week','temp','skies','bobblehead']]
dodgers_train_Y = dodgers_train[['attend']]
print(('dimTrain(X) = ', dodgers_train_X.shape),('dimTrain(Y) = ', dodgers_train_Y.shape))


# repeat the following for the test data
dodgers_test = pd.read_csv("dodgers_testing.csv")
dodgers_test_X = dodgers_test[['month','day_of_week','temp','skies','bobblehead']]
dodgers_test_Y = dodgers_test[['attend']]
print(('dimTest(X) = ', dodgers_test_X.shape),('dimTest(Y) = ', dodgers_test_Y.shape))

('dimTrain(X) = ', (56, 5)) ('dimTrain(Y) = ', (56, 1))
('dimTest(X) = ', (25, 5)) ('dimTest(Y) = ', (25, 1))


In [4]:
# ensure that both training and testing data have the same number of columns
# which will be necessary for using LOOCV (leave one out cross validation)
dodgers_train_X['training_data']=1
dodgers_test_X['training_data']=0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [5]:
# make sure previous code worked
dodgers_train_X.head(), dodgers_test_X.head()

(  month day_of_week  temp   skies bobblehead  training_data
 0   AUG      Monday    80  Clear          NO              1
 1   AUG    Saturday    70  Clear          NO              1
 2   AUG     Tuesday    80  Clear         YES              1
 3   APR      Monday    60  Cloudy         NO              1
 4   AUG   Wednesday    84  Clear          NO              1,
   month day_of_week  temp   skies bobblehead  training_data
 0   AUG   Wednesday    75  Clear          NO              0
 1   JUN    Saturday    68  Clear          NO              0
 2   AUG      Monday    79  Clear          NO              0
 3   JUN     Tuesday    66  Cloudy        YES              0
 4   JUL      Sunday    75  Clear         YES              0)

In [6]:
# combine train and test data now to double check they have the 
# same number of columns
dodgers_merged = pd.concat([dodgers_train_X, dodgers_test_X])
dodgers_merged

Unnamed: 0,month,day_of_week,temp,skies,bobblehead,training_data
0,AUG,Monday,80,Clear,NO,1
1,AUG,Saturday,70,Clear,NO,1
2,AUG,Tuesday,80,Clear,YES,1
3,APR,Monday,60,Cloudy,NO,1
4,AUG,Wednesday,84,Clear,NO,1
...,...,...,...,...,...,...
20,JUL,Wednesday,70,Clear,NO,0
21,JUN,Friday,72,Clear,NO,0
22,SEP,Monday,84,Cloudy,NO,0
23,JUL,Tuesday,70,Cloudy,NO,0


In [7]:
# perform OneHotEncoding on  the merged training and test data to convert 
# the catagorical data into a binary column for each category
# although temp is not catagorical data, it will still be included here for 
# organizational purposes
dodgers_merged_X_OHE = pd.get_dummies(dodgers_merged[['month','day_of_week','temp','skies','bobblehead', 'training_data']])
dodgers_merged_X_OHE.head()

Unnamed: 0,temp,training_data,month_APR,month_AUG,month_JUL,month_JUN,month_MAY,month_OCT,month_SEP,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,skies_Clear,skies_Cloudy,bobblehead_NO,bobblehead_YES
0,80,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
1,70,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0
2,80,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
3,60,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4,84,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0


In [8]:
# now split the merged OHE dataframe back 
# into their respective training and test dataframes
dodgers_train_X_OHE = dodgers_merged_X_OHE[dodgers_merged_X_OHE['training_data']==1]
dodgers_test_X_OHE = dodgers_merged_X_OHE[dodgers_merged_X_OHE['training_data']==0]

In [9]:
# drop the 'training_data' columns  from
# both training and testing dataframes as they are no longer needed
dodgers_train_X_OHE.drop(columns=['training_data'],inplace=True)
dodgers_test_X_OHE.drop(columns=['training_data'],inplace=True)

dodgers_train_X_OHE.head(),dodgers_test_X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


(   temp  month_APR  month_AUG  month_JUL  month_JUN  month_MAY  month_OCT  \
 0    80          0          1          0          0          0          0   
 1    70          0          1          0          0          0          0   
 2    80          0          1          0          0          0          0   
 3    60          1          0          0          0          0          0   
 4    84          0          1          0          0          0          0   
 
    month_SEP  day_of_week_Friday  day_of_week_Monday  day_of_week_Saturday  \
 0          0                   0                   1                     0   
 1          0                   0                   0                     1   
 2          0                   0                   0                     0   
 3          0                   0                   1                     0   
 4          0                   0                   0                     0   
 
    day_of_week_Sunday  day_of_week_Thursday  day_of_w

# 1. List the means and standard deviations of the encoded attributes before scaling.

In [10]:
# list the means and standard deviations before scaling
print('original means = ', np.mean(dodgers_train_X_OHE, axis=0))
print('original stds = ', np.std(dodgers_train_X_OHE, axis=0))

original means =  temp                     73.642857
month_APR                 0.160714
month_AUG                 0.214286
month_JUL                 0.089286
month_JUN                 0.089286
month_MAY                 0.232143
month_OCT                 0.017857
month_SEP                 0.196429
day_of_week_Friday        0.178571
day_of_week_Monday        0.142857
day_of_week_Saturday      0.160714
day_of_week_Sunday        0.196429
day_of_week_Thursday      0.089286
day_of_week_Tuesday       0.089286
day_of_week_Wednesday     0.142857
skies_Clear               0.803571
skies_Cloudy              0.196429
bobblehead_NO             0.910714
bobblehead_YES            0.089286
dtype: float64
original stds =  temp                     8.868944
month_APR                0.367267
month_AUG                0.410326
month_JUL                0.285156
month_JUN                0.285156
month_MAY                0.422200
month_OCT                0.132432
month_SEP                0.397296
day_of_week_F

In [11]:
# standardize and scale the training data
scaler = StandardScaler()
dodgers_train_X_scaled = scaler.fit_transform(dodgers_train_X_OHE)
dodgers_train_X_scaled

array([[ 0.71678692, -0.43759497,  1.91485422, ..., -0.49441323,
         0.31311215, -0.31311215],
       [-0.41074307, -0.43759497,  1.91485422, ..., -0.49441323,
         0.31311215, -0.31311215],
       [ 0.71678692, -0.43759497,  1.91485422, ..., -0.49441323,
        -3.19374388,  3.19374388],
       ...,
       [ 2.40808189, -0.43759497, -0.52223297, ..., -0.49441323,
         0.31311215, -0.31311215],
       [-0.74900206,  2.2852182 , -0.52223297, ..., -0.49441323,
         0.31311215, -0.31311215],
       [-0.29799007, -0.43759497, -0.52223297, ..., -0.49441323,
         0.31311215, -0.31311215]])

In [12]:
# list the means and standard deviations after standardizing and scaling
print('scaled means = ', np.mean(dodgers_train_X_scaled, axis=0))
print('scaled stds = ', np.std(dodgers_train_X_scaled, axis=0))

scaled means =  [ 4.75809868e-16 -6.34413157e-17  6.34413157e-17  1.58603289e-17
 -6.34413157e-17 -1.58603289e-17  1.58603289e-17  6.34413157e-17
 -3.17206578e-17 -2.37904934e-17 -7.93016446e-17  3.96508223e-17
 -7.93016446e-18  7.93016446e-18  6.34413157e-17 -6.34413157e-17
  3.96508223e-17  8.32667268e-17  1.58603289e-17]
scaled stds =  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [13]:
# standardize and scale the testing data as well
scaler = StandardScaler()
dodgers_test_X_scaled = scaler.fit_transform(dodgers_test_X_OHE)

# 2. What is the best L2 regularization coefficient? Provide the corresponding linear coefficients.
### (The Q5 prompt is listed above each block of code where the answers to each part can be found)
#### Q5. For the models corresponding to the best L2 and L1 regularization coefficients list the following root-mean-square-error (RMSE): (1) RMSE on the training set, (2) expected prediction RMSE obtained during the cross-validation, (3) RMSE on the testing set. For the linear model without regularization list the following RMSE: (1) RMSE on the training set, and (2) RMSE on the testing set.

In [14]:
# train 100 L2-regularized linear models using LOOCV 
# note: the 100 regularized coef are evenly spaced between 0.1 and 1000
alpha_values = np.linspace(0.1,1000,100)

l2_cv = RidgeCV(cv=None, store_cv_values=True, alphas=alpha_values)
l2_cv.fit(dodgers_train_X_scaled, dodgers_train_Y)
l2_train_prediction = l2_cv.predict(dodgers_train_X_scaled)
l2_test_prediction = l2_cv.predict(dodgers_test_X_scaled)
print('alpha = ', l2_cv.alpha_)
print('coef = ', l2_cv.coef_)
print('R2 = ', l2_cv.score(dodgers_test_X_scaled,dodgers_test_Y))

# take the square root of the mean squared error value of 
# the testing and training models in order to get the RMSE
# (Root Mean Squared Error)
RMSE_l2_dodgers_train = sqrt(mean_squared_error(l2_train_prediction, dodgers_train_Y))
RMSE_l2_dodgers_test = sqrt(mean_squared_error(l2_test_prediction, dodgers_test_Y))
print('')
print('Q5.')
print('RMSE: l2 Train = ',RMSE_l2_dodgers_train)
print('RMSE: l2 Test = ',RMSE_l2_dodgers_test)

alpha =  50.6
coef =  [[  515.85380007  -273.56102806   457.54654532  -216.88248689
   1102.57100111  -605.71237164   -75.78927596  -186.41964828
     67.69322043  -835.67182144   490.25315842   238.14165655
   -341.53918562  1070.28517076  -617.19884398   271.19412682
   -271.19412682 -1145.58152754  1145.58152754]]
R2 =  0.10450004637534294

Q5.
RMSE: l2 Train =  5648.864216385291
RMSE: l2 Test =  7797.679361979969


### Q5. Part 2

In [20]:
# print the shape of the variable: alpha_values
print(l2_cv.cv_values_.shape)

# print the mean of the alpha values as well as the Standard Errors
print('means = ', l2_cv.cv_values_.mean(axis=0))
print('')
print('SEs = ', l2_cv.cv_values_.std(axis=0)/np.sqrt(l2_cv.cv_values_.shape[0]))

(56, 1, 100)
means =  [[59421605.36360309 50057999.89681771 47141951.65783965 45941075.18931566
  45480760.60034025 45396075.06151689 45518075.38288169 45758564.93222325
  46067745.84673562 46415936.10002929 46784758.61315224 47162536.10362399
  47541728.41189262 47917433.56636014 48286474.91597056 48646826.44408111
  48997240.90851851 49337003.6610212  49665766.53720655 49983433.9898814
  50290084.01651955 50585912.67451512 50871194.83438752 51146256.26039305
  51411453.68354893 51667160.56899668 51913756.97363044 52151622.36147111
  52381130.56907384 52602646.33974872 52816523.0050727  53023101.00591195
  53222707.0269184  53415653.5776953  53602238.89706362 53782747.08862705
  53957448.4193129  54126599.73000534 54290444.92040291 54449215.47997818
  54603131.04424022 54752399.96101584 54897219.85562302 55037778.18695045
  55174252.78882567 55306812.3928389  55435617.13013334 55560819.01067696
  55682562.37927826 55800984.34816158 55916215.20631903 56028378.80614388
  56137592.928053

# 3. What is the best L1 regularization coefficient? Provide the corresponding linear coefficients.
# 4. What are the predictive attributes selected as a result of L1 regularization? 
### (listed underneath the outputs for Q3.)
#### Q5. For the models corresponding to the best L2 and L1 regularization coefficients list the following root-mean-square-error (RMSE): (1) RMSE on the training set, (2) expected prediction RMSE obtained during the cross-validation, (3) RMSE on the testing set. For the linear model without regularization list the following RMSE: (1) RMSE on the training set, and (2) RMSE on the testing set.

In [16]:
# perform the same operation in the block above, 
# but this time using L1 regularization 
l1_cv = LassoCV(cv=10, alphas=alpha_values)
l1_cv.fit(dodgers_train_X_scaled, dodgers_train_Y)
l1_train_prediction = l1_cv.predict(dodgers_train_X_scaled)
l1_test_prediction = l1_cv.predict(dodgers_test_X_scaled)
print('alpha = ', l1_cv.alpha_)
print('coef = ', l1_cv.coef_)
print('R2 = ', l1_cv.score(dodgers_test_X_scaled,dodgers_test_Y))

# take the square root of the mean squared error value of 
# the testing and training models in order to get the RMSE
# (Root Mean Squared Error)
RMSE_l1_dodgers_train = sqrt(mean_squared_error(l1_train_prediction, dodgers_train_Y))
RMSE_l1_dodgers_test = sqrt(mean_squared_error(l1_test_prediction, dodgers_test_Y))
print('')
print('Q5.')
print('RMSE: l1 Train = ',RMSE_l1_dodgers_train)
print('RMSE: l1 Test = ',RMSE_l1_dodgers_test)
print('')
print('Q4.')
print('Based on the coefficients given, the predictive features selected by l1 regularization are the following:')
print('Temp, month_AUG, month_JUL, month_MAY, day_of_week_Monday, day_of_week_Saturday, day_of_week_Thursday,')
print('day_of_week_Tuesday, day_of_week_Wednesday, bobblehead_NO, bobblehead_YES')

  y = column_or_1d(y, warn=True)
  tol, rng, random, positive)
  tol, rng, random, positive)


alpha =  555.6
coef =  [ 8.02939381e+02 -0.00000000e+00  6.97694599e+02 -0.00000000e+00
  1.85527604e+03 -9.17594703e+01 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -1.09069580e+03  1.48624769e+02  0.00000000e+00
 -3.67748055e+02  1.35270109e+03 -8.12915997e+02  0.00000000e+00
 -0.00000000e+00 -2.58129051e+03  8.44530795e-13]
R2 =  0.0730854604186496

Q5.
RMSE: l1 Train =  5517.737021138628
RMSE: l1 Test =  7933.273677823828

Q4.
Based on the coefficients given, the predictive features selected by l1 regularization are the following:
Temp, month_AUG, month_JUL, month_MAY, day_of_week_Monday, day_of_week_Saturday, day_of_week_Thursday,
day_of_week_Tuesday, day_of_week_Wednesday, bobblehead_NO, bobblehead_YES


  tol, rng, random, positive)
  tol, rng, random, positive)


# Q6. What observations can you make based on the above RMSE? 
### (explanation below the RMSE values for linear regression)


In [17]:
# perform a similar operation to the two methods above
# but this time using a linear model without regression
lin_reg = LinearRegression(fit_intercept=True).fit(dodgers_train_X_scaled, dodgers_train_Y)
lin_pred_train = lin_reg.predict(dodgers_train_X_scaled)
lin_pred_test = lin_reg.predict(dodgers_test_X_scaled)

# take the square root of the mean squared error value of 
# the testing and training models in order to get the RMSE
# (Root Mean Squared Error)
RMSE_lin_reg_train_prediction = sqrt(mean_squared_error(lin_pred_train, dodgers_train_Y))
RMSE_lin_reg_test_prediction = sqrt(mean_squared_error(lin_pred_test, dodgers_test_Y))
print('RMSE: linear regression wihtout regularization, Train = ',RMSE_lin_reg_train_prediction)
print('RMSE: linear regression wihtout regularization, Test = ',RMSE_lin_reg_test_prediction)
print('')
print('Based on the RMSE given by all three models on both the test and training data (6 total RMSE values) it can be determined that') 
print('for l2 and l1 models, the training data may be too flexible and therefore slightly overfitted based on the difference in the RMSE')
print('values, but overall the l2 model is superior becuase the difference between the training and testing RMSE is lower than the difference')
print('between the training and testing RMSE calculated from the l1 model (RMSE Difference [l1: 2415.53 > l2: 2148.82])')

RMSE: linear regression wihtout regularization, Train =  5265.924411340254
RMSE: linear regression wihtout regularization, Test =  2.3048531557943837e+17

Based on the RMSE given by all three models on both the test and training data (6 total RMSE values) it can be determined that
for l2 and l1 models, the training data may be too flexible and therefore slightly overfitted based on the difference in the RMSE
values, but overall the l2 model is superior becuase the difference between the training and testing RMSE is lower than the difference
between the training and testing RMSE calculated from the l1 model (RMSE Difference [l1: 2415.53 > l2: 2148.82])


In [18]:
# Predict the attendance on a clear Monday in June when the expected temperature
# is 72 for all three models with and without bobbleheads.

# created an array that replicates exactly the conditions that 
# are asked to be predicted from its binary values that were created
# through OneHotEncoding
Bobbleheads = np.array([[72,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1]])
No_Bobbleheads = np.array([[72,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0]])
# note, the double brackets ensures that the array is 2D instead
# of 1D in order for the predictions to 
Bobbleheads_scaled = scaler.fit_transform(Bobbleheads)
No_Bobbleheads_scaled = scaler.fit_transform(No_Bobbleheads)

## Q7. Predict the attendance on a clear Monday in June when the 
## expected temperature is 72 for all three models with and without bobbleheads. 
## Does bobblehead promotion have an impact on the attendance?
### (explanation is below attendance predictions)

In [19]:
# run the scenario listed above through all of the previously 
# created models with and without Bobbleheads to compare
# all of the outputs which will be the predicted attendance


Bobbleheads_l2 = l2_cv.predict(Bobbleheads_scaled)
No_Bobbleheads_l2 = l2_cv.predict(No_Bobbleheads_scaled)

Bobbleheads_l1 = l1_cv.predict(Bobbleheads_scaled)
No_Bobbleheads_l1 = l1_cv.predict(No_Bobbleheads_scaled)

Bobbleheads_lin_reg = lin_reg.predict(Bobbleheads_scaled)
No_Bobbleheads_lin_reg = lin_reg.predict(No_Bobbleheads_scaled)


print('Predictions WITH Bobblehead promotion')
print('Attendance prediction based on l2 model with Bobbleheads given to fans:',Bobbleheads_l2)
print('Attendance prediction based on l1 model with Bobbleheads given to fans::',Bobbleheads_l1)
print('Attendance prediction based on linear regression with Bobbleheads given to fans::',Bobbleheads_lin_reg)
print('')
print('Predictions WIHTOUT Bobblehead promotion')
print('Attendance prediction based on l2 model with NO Bobbleheads given to fans:',No_Bobbleheads_l2)
print('Attendance prediction based on l1 model with NO Bobbleheads given to fans::',No_Bobbleheads_l1)
print('Attendance prediction based on linear regression with NO Bobbleheads given to fans::',No_Bobbleheads_lin_reg)
print('')
print('Q7. Based on the output from all six predictions stemming from the three models, it can be determined')
print('that whether or not they decide to hold the bobblehead promotion, it will not have an impact on attendance.')
print('This was an intersting result considering the models, specifically the l1 model, included bobbleheads as ')
print('an important attribute in its prediction!')

Predictions WITH Bobblehead promotion
Attendance prediction based on l2 model with Bobbleheads given to fans: [[39664.80357143]]
Attendance prediction based on l1 model with Bobbleheads given to fans:: [39664.80357143]
Attendance prediction based on linear regression with Bobbleheads given to fans:: [[39681.82779728]]

Predictions WIHTOUT Bobblehead promotion
Attendance prediction based on l2 model with NO Bobbleheads given to fans: [[39664.80357143]]
Attendance prediction based on l1 model with NO Bobbleheads given to fans:: [39664.80357143]
Attendance prediction based on linear regression with NO Bobbleheads given to fans:: [[39681.82779728]]

Q7. Based on the output from all six predictions stemming from the three models, it can be determined
that whether or not they decide to hold the bobblehead promotion, it will not have an impact on attendance.
This was an intersting result considering the models, specifically the l1 model, included bobbleheads as 
an important attribute in its 