# Cross Validation

In [16]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

## cross_val_score

In [2]:
# Baseline for comparison: one hold-out split instead of cross-validation.
X, y = make_blobs(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# fit() returns the estimator, so construction and training are chained.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the fitted model on the held-out part only.
test_score = logreg.score(X_test, y_test)
print(f"Test set score: {test_score:.2f}")

Test set score: 0.88


교차검증을 쓸 때는 train_test_split()이 필요 없음.

In [4]:
# The full data set goes straight into cross_val_score; no manual split is made.
X, y = make_blobs(random_state=0)
logreg = LogisticRegression(max_iter=1000)

scores = cross_val_score(estimator=logreg, X=X, y=y)
mean_score = scores.mean()

print("Cross-validation score:", scores)
print(f"Average cross-validation score: {mean_score:.2f}")

Cross-validation score: [0.9  0.9  0.9  0.8  0.95]
Average cross-validation score: 0.89


cv: default = 5\
세트의 개수를 10개로 늘려보자.

In [5]:
# Ask for 10 folds explicitly through the cv argument.
fold_count = 10
scores = cross_val_score(logreg, X, y, cv=fold_count)

print("Cross-validation score:", scores)
print(f"Average cross-validation score: {scores.mean():.2f}")

Cross-validation score: [0.8 1.  0.9 0.9 0.9 0.8 0.9 0.9 0.9 1. ]
Average cross-validation score: 0.90


In [6]:
# IPython help lookup: prints the signature and docstring of cross_val_score.
cross_val_score?

Signature:
cross_val_score(
    estimator,
    X,
    y=None,
    *,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=None,
    verbose=0,
    fit_params=None,
    pre_dispatch='2*n_jobs',
    error_score=nan,
)
Docstring:
Evaluate a score by cross-validation.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
est

## cross_validate

In [8]:
# Switch to the iris data set; `iris` and `logreg` are reused by later cells.
iris = load_iris()
iris_X, iris_y = iris.data, iris.target
logreg = LogisticRegression(max_iter=1000)

scores = cross_val_score(estimator=logreg, X=iris_X, y=iris_y)
print("Cross-validation score:", scores)

Cross-validation score: [0.96666667 1.         0.93333333 0.96666667 1.        ]


In [14]:
# cross_validate yields per-fold timings and test/train scores, which map
# directly onto DataFrame columns (one row per fold).
res = cross_validate(
    estimator=logreg,
    X=iris.data,
    y=iris.target,
    return_train_score=True,
)
res_df = pd.DataFrame(res)
display(res_df)

# Column-wise mean over the folds.
column_means = res_df.mean()
print("Average Time and Score:", column_means, sep='\n')

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.015575,0.0,0.966667,0.966667
1,0.018311,0.001009,1.0,0.966667
2,0.010017,0.000992,0.933333,0.983333
3,0.012088,0.0,0.966667,0.983333
4,0.008058,0.000999,1.0,0.975


Average Time and Score:
fit_time       0.012810
score_time     0.000600
test_score     0.973333
train_score    0.975000
dtype: float64


## cross_val_predict

In [17]:
# One predicted label per sample; kept as the last expression so the array is displayed.
cross_val_predict(estimator=logreg, X=iris.data, y=iris.target)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

***

데이터가 클래스 순서대로 정렬되어 있다면 어떨까? 섞지 않고 순서대로 fold를 자르면 문제가 생김. 어떤 fold에는 0만, 어떤 fold에는 1만, 어떤 fold에는 2만 들어갈 수 있어서 점수가 크게 떨어짐(아래 예시에서는 모든 fold의 점수가 0).

그래서 적절히 섞어주는 작업이 필요함. 사실 섞어준다기보단 stratify(비율 맞춰서 할당) 해주는 작업임. 이럴 때 stratified k-fold가 필요. 이건 분류 작업에서 많이 사용함.

In [19]:
# The iris labels are stored sorted by class (all 0s, then all 1s, then all 2s).
iris = load_iris()
print("Iris label:", iris.target, sep='\n')

# Unshuffled 3-fold split on sorted labels: each test fold contains a single
# class that the training folds never saw, so every fold scores 0.
kfold = KFold(n_splits=3, shuffle=False)
print("Cross Validation Score:", cross_val_score(logreg, iris.data, iris.target, cv=kfold), sep='\n')

Iris lable:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Cross Validation Sore:
[0. 0. 0.]


In [20]:
# Stratified splitting: the 3 folds are built with the class labels taken into account.
kfold = StratifiedKFold(n_splits=3)
stratified_scores = cross_val_score(logreg, iris.data, iris.target, cv=kfold)

print("Cross-validation score:", stratified_scores, sep='\n')

Cross-validation score:
[0.98 0.96 0.98]


## K-Fold Adjustment

### Cross Validation Splitter

In [21]:
# Plain KFold with 5 splits, passed to cross_val_score as an explicit splitter.
kfold = KFold(n_splits=5)
print("Cross Validation Score:", cross_val_score(logreg, iris.data, iris.target, cv=kfold), sep='\n')

Cross Validation Sore:
[1.         1.         0.86666667 0.93333333 0.83333333]


In [22]:
# Plain KFold with 3 splits on the class-sorted iris labels: every fold scores 0.
kfold = KFold(n_splits=3)
print("Cross Validation Score:", cross_val_score(logreg, iris.data, iris.target, cv=kfold), sep='\n')

Cross Validation Sore:
[0. 0. 0.]


In [23]:
# Shuffle before splitting so each fold mixes the classes; random_state pins the shuffle.
# NOTE(review): the original author explicitly marked StratifiedKFold(n_splits=3) as the
# wrong choice here -- presumably because this cell is about shuffling a plain KFold.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

print("Cross Validation Score:", cross_val_score(logreg, iris.data, iris.target, cv=kfold), sep='\n')

Cross Validation Sore:
[0.98 0.96 0.96]


## LOOCV
Leave-One-Out Cross-Validation\
데이터 하나만 test set으로 두고 나머지 전부를 train set으로 쓰는 과정을 데이터 개수만큼 반복하는 방법(즉, split 수 = 샘플 수). 작은 데이터셋에서는 효과적이지만, 데이터가 많으면 시간이 많이 걸림.

In [24]:
# Leave-one-out: `scores` is an array holding one score per split.
loocv = LeaveOneOut()
scores = cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=loocv)

split_count = len(scores)
print("Number of cross validation splits: ", split_count)
print(f"Average score: {scores.mean():.2f}")

# Printed for comparison with the split count above.
sample_count = len(iris.data)
print(sample_count)

Number of cross validation splits:  150
Average score: 0.97
150


## Shuffle-split Cross-Validation
매 회차마다 데이터를 무작위로 섞어 일부는 train set, 일부는 test set으로 뽑는 방식. 모든 데이터를 반드시 쓰는 것은 아님: train_size와 test_size의 합이 1보다 작으면 나머지는 그 회차에서 사용되지 않고, 다음 회차에서는 또 다른 데이터가 제외됨.

In [25]:
# 10 independent random 50/50 train/test splits.
# random_state is pinned so the splits (and therefore the scores) are
# reproducible across kernel restarts; without it every run prints different numbers.
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10, random_state=0)

scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print("Cross-validation score:\n", scores)

Cross-validation score:
 [0.96       0.96       0.96       0.98666667 0.96       0.96
 0.96       0.94666667 0.97333333 0.93333333]


## Cross-Validation w/ Groups
각 샘플에 그룹을 지정하고, 같은 그룹의 샘플이 train set과 test set에 동시에 들어가지 않도록 그룹 단위로 나눔. (그룹별 샘플 수가 모두 같을 필요는 없음)

In [26]:
X, y = make_blobs(n_samples=12, random_state=0)

# One group id per sample; the four groups differ in size (3, 4, 2 and 3 samples).
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

group_kfold = GroupKFold(n_splits=3)
scores = cross_val_score(logreg, X, y, groups=groups, cv=group_kfold)
print("Cross-validation score:\n", scores)

Cross-validation score:
 [0.75       0.6        0.66666667]


## Cross Validation w/ Evaluation Metrics

In [33]:
df = pd.read_excel("data/crowdfunding_ex.xlsx")

# Select the feature columns and rename in one non-mutating chain.
# The original renamed a column slice of `df` with inplace=True, which triggers
# pandas' SettingWithCopyWarning and makes the cell depend on mutation order.
# NOTE(review): "Fiends" looks like a misspelling of "Friends"; it has to match the
# spreadsheet header, so confirm against the file before changing it.
x_df = df[[
    "Backers", "CountryCode", "VideoCount", "ImageCount", "TagCode", "Goal", "Period", "SNS",
    "Fiends", "#ofCreation", "보상수",
]].rename(columns={'보상수': '#ofRewards'})

# Boolean target: True where the 'Funded' value exceeds the 'Goal' value.
y = df['Funded'] > df['Goal']

lr = LogisticRegression(max_iter=1000)

# Estimator's default scorer first, for comparison with explicit accuracy.
print("Default Cross-validation score:", cross_val_score(lr, x_df, y, cv=10))

# Accuracy is computed once and reused (the original computed it twice and
# discarded the first result).
scores = cross_val_score(lr, x_df, y, scoring="accuracy", cv=10)
print("Accuracy score:", scores)

# Remaining metrics: (printed label, scikit-learn scorer name), same 10 folds.
for metric_label, metric_name in [
    ("Roc_Auc", "roc_auc"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("f1", "f1"),
]:
    print(f"{metric_label} score:", cross_val_score(lr, x_df, y, scoring=metric_name, cv=10))

Default Cross-validation score: [0.90909091 0.79220779 0.93506494 0.84415584 0.88311688 0.90909091
 0.85714286 0.84210526 0.89473684 0.84210526]
Accuracy score: [0.90909091 0.79220779 0.93506494 0.84415584 0.88311688 0.90909091
 0.85714286 0.84210526 0.89473684 0.84210526]
Roc_Auc score: [0.90221088 0.83248299 0.97193878 0.91570248 0.9214876  0.9661157
 0.94628099 0.90649351 0.94199134 0.87705628]
Precision score: [0.9375     0.72727273 1.         0.91666667 0.88235294 1.
 0.92307692 0.69565217 0.88235294 0.76470588]
Recall score: [0.71428571 0.38095238 0.76190476 0.5        0.68181818 0.68181818
 0.54545455 0.76190476 0.71428571 0.61904762]
f1 score: [0.81081081 0.5        0.86486486 0.64705882 0.76923077 0.81081081
 0.68571429 0.72727273 0.78947368 0.68421053]


In [34]:
# Evaluate several metrics in a single cross_validate call; each scorer name
# shows up as a test_<name> column in the resulting frame.
scoring_names = [
    'accuracy', 'roc_auc', 'recall_macro', "recall_weighted",
    "precision_macro", "precision_weighted", "precision", "recall", "f1",
]
res = cross_validate(lr, x_df, y, scoring=scoring_names, cv=10)
pd.DataFrame(res)

Unnamed: 0,fit_time,score_time,test_accuracy,test_roc_auc,test_recall_macro,test_recall_weighted,test_precision_macro,test_precision_weighted,test_precision,test_recall,test_f1
0,0.030595,0.016257,0.909091,0.902211,0.848214,0.909091,0.91957,0.91142,0.9375,0.714286,0.810811
1,0.153083,0.027652,0.792208,0.832483,0.66369,0.792208,0.765152,0.782369,0.727273,0.380952,0.5
2,0.077499,0.029273,0.935065,0.971939,0.880952,0.935065,0.959016,0.940387,1.0,0.761905,0.864865
3,0.028739,0.016016,0.844156,0.915702,0.740909,0.844156,0.873718,0.855311,0.916667,0.5,0.647059
4,0.020002,0.010505,0.883117,0.921488,0.822727,0.883117,0.882843,0.883053,0.882353,0.681818,0.769231
5,0.017073,0.009981,0.909091,0.966116,0.840909,0.909091,0.943548,0.919355,1.0,0.681818,0.810811
6,0.023023,0.008995,0.857143,0.946281,0.763636,0.857143,0.883413,0.866415,0.923077,0.545455,0.685714
7,0.008988,0.008994,0.842105,0.906494,0.817316,0.842105,0.800656,0.847632,0.695652,0.761905,0.727273
8,0.01501,0.010532,0.894737,0.941991,0.838961,0.894737,0.890329,0.893897,0.882353,0.714286,0.789474
9,0.01849,0.010975,0.842105,0.877056,0.77316,0.842105,0.814556,0.836858,0.764706,0.619048,0.684211


In [35]:
# Same scorer list, now on the three-class blobs data. The 'roc_auc',
# 'precision', 'recall' and 'f1' scorers fail on a multiclass target
# ("multiclass format is not supported"), so their columns come back as NaN;
# the macro/weighted variants and accuracy still produce values.
X, y = make_blobs(random_state=0)
logreg = LogisticRegression(max_iter=1000)

scoring_names = [
    'accuracy', 'roc_auc', 'recall_macro', "recall_weighted",
    "precision_macro", "precision_weighted", "precision", "recall", "f1",
]
res = cross_validate(logreg, X, y, scoring=scoring_names)
pd.DataFrame(res)

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\ProgramData\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 367, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\ProgramData\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\ProgramData\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1954, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
  File "c:\ProgramData\anaconda3\lib

Unnamed: 0,fit_time,score_time,test_accuracy,test_roc_auc,test_recall_macro,test_recall_weighted,test_precision_macro,test_precision_weighted,test_precision,test_recall,test_f1
0,0.005586,0.012564,0.9,,0.904762,0.9,0.916667,0.925,,,
1,0.005015,0.007995,0.9,,0.904762,0.9,0.916667,0.925,,,
2,0.005505,0.00701,0.9,,0.904762,0.9,0.916667,0.925,,,
3,0.004992,0.006998,0.8,,0.801587,0.8,0.796429,0.800536,,,
4,0.004002,0.006998,0.95,,0.944444,0.95,0.958333,0.95625,,,


In [36]:
# Inspect the labels: they take three distinct values (0, 1, 2), i.e. a multiclass target.
y

array([1, 0, 1, 0, 0, 0, 2, 2, 1, 0, 0, 0, 1, 0, 2, 1, 2, 0, 2, 2, 2, 2,
       2, 0, 1, 1, 1, 1, 2, 2, 0, 1, 1, 0, 2, 2, 0, 1, 1, 2, 2, 1, 1, 0,
       0, 0, 1, 1, 2, 2, 2, 1, 0, 1, 2, 2, 1, 1, 0, 1, 1, 2, 2, 2, 2, 1,
       0, 2, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 2, 2, 2, 2, 0, 0, 2, 2])