# Cross-Validation

In [3]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split


## Recap

In [4]:
data = load_diabetes(return_X_y=False, as_frame=True, scaled=False)

In [5]:
X = data.data.values
y = data.target.values

In [6]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=True)
Xp = poly.fit_transform(X)

In [7]:
Xtr, Xte, ytr, yte = train_test_split(Xp,y, test_size=0.2)

In [8]:
lr = LinearRegression()
lr.fit(Xtr, ytr)
print ("test score", lr.score(Xte, yte))
print ("train score", lr.score(Xtr, ytr))

test score -0.4749104363367125
train score 0.7390873157479252


In [9]:
lasso = Lasso(alpha=0.005)
lasso.fit(Xtr, ytr)
lasso.score(Xte, yte)

  model = cd_fast.enet_coordinate_descent(


0.4004049390894858

In [10]:
ridge = Ridge(alpha=1.)
ridge.fit(Xtr, ytr)
ridge.score(Xte, yte)

-0.37693153358228626

## Cross Validation 

In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import numpy as np

In [12]:
ridge= Ridge(alpha=0.5)
lasso= Lasso(alpha=0.5)

In [13]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lasso, Xtr,ytr, cv=5,scoring='neg_mean_squared_error')
print(f'cv score with lasso: {scores}')
scores = cross_val_score(ridge, Xtr,ytr, cv=5,scoring='neg_mean_squared_error')
print(f'cv score with ridge: {scores}')

cv score with lasso: [-2845.36749636 -4348.42203323 -3275.75556595 -3219.11760173
 -4341.33177082]
cv score with ridge: [ -9297.64197141 -31029.01583544 -21474.04272353 -10362.60645306
  -8717.33969828]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Try Lasso regression, explicitly _without_ normalization first.
- What do you observe? 
- What are your data ranges?

Try to implement the Lasso with normalization with an sklearn-pipeline.

In [14]:
# without normalization

In [15]:
'''lasso_history=[]
ridge_history=[]
for i in range(10):
    lasso_scores = cross_val_score(Lasso(1+i/10,max_iter=1000), Xtr,ytr, cv=5,scoring='neg_mean_squared_error')
    lasso_history.append(lasso_scores)
    ridge_scores = cross_val_score(Ridge(1+i/10,max_iter=1000), Xtr,ytr, cv=5,scoring='neg_mean_squared_error')
    ridge_history.append(ridge_scores)
    '''

"lasso_history=[]\nridge_history=[]\nfor i in range(10):\n    lasso_scores = cross_val_score(Lasso(1+i/10,max_iter=1000), Xtr,ytr, cv=5,scoring='neg_mean_squared_error')\n    lasso_history.append(lasso_scores)\n    ridge_scores = cross_val_score(Ridge(1+i/10,max_iter=1000), Xtr,ytr, cv=5,scoring='neg_mean_squared_error')\n    ridge_history.append(ridge_scores)\n    "

In [16]:
# with normalization 
Xtr=StandardScaler().fit_transform(Xtr)
Xte=StandardScaler().fit_transform(Xte)

In [17]:
model_1=Lasso(0.25).fit(Xtr,ytr)
model_2=Ridge(0.25).fit(Xtr,ytr)

  model = cd_fast.enet_coordinate_descent(


In [18]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_1, X,y, cv=5,scoring='neg_mean_squared_error')
print(f'cv score with lasso: {scores}')
scores = cross_val_score(model_2, X,y, cv=5,scoring='neg_mean_squared_error')
print(f'cv score with ridge: {scores}')

cv score with lasso: [-2816.88184973 -3035.66226349 -3205.28464847 -2997.49053227
 -2929.3856928 ]
cv score with ridge: [-2783.95794077 -3029.36496978 -3232.41007435 -3006.65011161
 -2913.11719587]


In [19]:
# pipelines
from sklearn.pipeline import Pipeline

pipeline= Pipeline([
    ('scaler',  ()),          
    ('classifier', LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f'score:{score}')


score:0.4526027629719196


#### Main learning
CV makes sense to avoid picking by chance a validation dataset that has a very high or very low error. 
By averaging over many ways of picking the validation dataset we get a more honest estimate of the generalization error.

#### Typical errors while doing CV
- performing steps on the _whole_ dataset and _then_ doing CV
- always peform all modeling operations within the CV split (ignoring the validation dataset of each split)



## References

- Trevor Hastie, Robert Tibshirani, Jerome Friedman, The Elements of Statistical Learning, 2nd ed., Springer New York, NY (2009)
- Andrew Ng, CS229 Lecture Notes, updated by Tenguye Ma, Standord, Spring 2022
https://cs229.stanford.edu/lectures-spring2022/main_notes.pdf

- https://scikit-learn.org/stable/modules/cross_validation.html#k-fold