### Breast Cancer Dataset

In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

ds = load_breast_cancer()

X = pd.DataFrame(ds.data)
y = ds.target

print('features shape:', X.shape) 
print('target shape:', y.shape )

features shape: (569, 30)
target shape: (569,)


### Bug Desciption
* I created a transformer StrandVectorizer() and a classifier StrandNonBinaryClassifier() 
* Both seem to work well and both function with the scikit tools like cross_validate() and GridSearchCV(). 
* However, I am getting inconsistent cross validation results when I place them both into a pipeline and run through cross_validate()

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
import time

def stratified_cross_validate(model, X, y, cv, n_jobs=-1):
    start = time.time()
    cv_results = cross_validate(model, X, y, cv=cv, scoring="accuracy", n_jobs = n_jobs)
    elapsed_time = (time.time() - start) 
    print ('Fold Scores:')
    print(' ')
    print(cv_results['test_score'])
    print(' ')
    print('Mean Accuracy: ', cv_results['test_score'].mean())
    print('Mean Fit Time: ', cv_results['fit_time'].mean())
    print('Mean Score Time: ', cv_results['score_time'].mean())
    print('CV Time: ', elapsed_time)
    
    return

### 10 fold cross_validate() scores remain consitent when I adjust paramters and repeately perform cross validation in a loop

In [9]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

truncate_val = 2

for tv in range(10):
    print('--------------------------------------------------------------')
    print('truncate_to: ',tv)
    print('--------------------------------------------------------------')
    
    for i in range(10):
        sv = StrandVectorizer(truncate_to=tv)
        X_sv = sv.fit_transform(X)

        model = StrandNonBinaryClassifier(use_acf=False,use_cat_idf=False)
        stratified_cross_validate(model,X_sv,y,cv, -1)
        print('___________________________________________________________')

--------------------------------------------------------------
truncate_to:  0
--------------------------------------------------------------
Fold Scores:
 
[0.38596491 0.38596491 0.36842105 0.36842105 0.36842105 0.36842105
 0.36842105 0.36842105 0.36842105 0.375     ]
 
Mean Accuracy:  0.3725877192982455
Mean Fit Time:  0.037500643730163576
Mean Score Time:  0.007815003395080566
CV Time:  0.07813000679016113
___________________________________________________________
Fold Scores:
 
[0.38596491 0.38596491 0.36842105 0.36842105 0.36842105 0.36842105
 0.36842105 0.36842105 0.36842105 0.375     ]
 
Mean Accuracy:  0.3725877192982455
Mean Fit Time:  0.035346603393554686
Mean Score Time:  0.004841232299804687
CV Time:  0.0923318862915039
___________________________________________________________
Fold Scores:
 
[0.38596491 0.38596491 0.36842105 0.36842105 0.36842105 0.36842105
 0.36842105 0.36842105 0.36842105 0.375     ]
 
Mean Accuracy:  0.3725877192982455
Mean Fit Time:  0.03748197555541

Fold Scores:
 
[0.96491228 0.96491228 0.98245614 0.87719298 0.87719298 0.9122807
 0.92982456 0.84210526 0.98245614 0.96428571]
 
Mean Accuracy:  0.9297619047619048
Mean Fit Time:  0.04050431251525879
Mean Score Time:  0.005402970314025879
CV Time:  0.0899958610534668
___________________________________________________________
Fold Scores:
 
[0.96491228 0.96491228 0.98245614 0.87719298 0.87719298 0.9122807
 0.92982456 0.84210526 0.98245614 0.96428571]
 
Mean Accuracy:  0.9297619047619048
Mean Fit Time:  0.03393852710723877
Mean Score Time:  0.011318397521972657
CV Time:  0.08330845832824707
___________________________________________________________
Fold Scores:
 
[0.96491228 0.96491228 0.98245614 0.87719298 0.87719298 0.9122807
 0.92982456 0.84210526 0.98245614 0.96428571]
 
Mean Accuracy:  0.9297619047619048
Mean Fit Time:  0.037423133850097656
Mean Score Time:  0.006162905693054199
CV Time:  0.08674979209899902
___________________________________________________________
Fold Scores:


Fold Scores:
 
[0.80701754 0.8245614  0.71929825 0.77192982 0.71929825 0.73684211
 0.8245614  0.75438596 0.75438596 0.80357143]
 
Mean Accuracy:  0.7715852130325815
Mean Fit Time:  0.04243748188018799
Mean Score Time:  0.0034007787704467773
CV Time:  0.0956888198852539
___________________________________________________________
--------------------------------------------------------------
truncate_to:  5
--------------------------------------------------------------
Fold Scores:
 
[0.75438596 0.78947368 0.77192982 0.70175439 0.68421053 0.66666667
 0.75438596 0.71929825 0.75438596 0.71428571]
 
Mean Accuracy:  0.731077694235589
Mean Fit Time:  0.04279663562774658
Mean Score Time:  0.003500485420227051
CV Time:  0.09698987007141113
___________________________________________________________
Fold Scores:
 
[0.75438596 0.78947368 0.77192982 0.70175439 0.68421053 0.66666667
 0.75438596 0.71929825 0.75438596 0.71428571]
 
Mean Accuracy:  0.731077694235589
Mean Fit Time:  0.03867723941802979

Fold Scores:
 
[0.75438596 0.84210526 0.75438596 0.71929825 0.66666667 0.63157895
 0.73684211 0.70175439 0.73684211 0.71428571]
 
Mean Accuracy:  0.7258145363408521
Mean Fit Time:  0.03437771797180176
Mean Score Time:  0.0031248092651367187
CV Time:  0.07810664176940918
___________________________________________________________
Fold Scores:
 
[0.75438596 0.84210526 0.75438596 0.71929825 0.66666667 0.63157895
 0.73684211 0.70175439 0.73684211 0.71428571]
 
Mean Accuracy:  0.7258145363408521
Mean Fit Time:  0.04248547554016113
Mean Score Time:  0.0005735874176025391
CV Time:  0.1018381118774414
___________________________________________________________
Fold Scores:
 
[0.75438596 0.84210526 0.75438596 0.71929825 0.66666667 0.63157895
 0.73684211 0.70175439 0.73684211 0.71428571]
 
Mean Accuracy:  0.7258145363408521
Mean Fit Time:  0.03788788318634033
Mean Score Time:  0.0053436517715454105
CV Time:  0.0846869945526123
___________________________________________________________
Fold Scor

 
[0.75438596 0.8245614  0.77192982 0.70175439 0.68421053 0.59649123
 0.73684211 0.70175439 0.75438596 0.73214286]
 
Mean Accuracy:  0.7258458646616541
Mean Fit Time:  0.046161770820617676
Mean Score Time:  0.0032005071640014648
CV Time:  0.08885550498962402
___________________________________________________________
Fold Scores:
 
[0.75438596 0.8245614  0.77192982 0.70175439 0.68421053 0.59649123
 0.73684211 0.70175439 0.75438596 0.73214286]
 
Mean Accuracy:  0.7258458646616541
Mean Fit Time:  0.036836886405944826
Mean Score Time:  0.003969812393188476
CV Time:  0.08888459205627441
___________________________________________________________


### cross_validate() scores **ARE NOT** consistent when used with a pipeline

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from StrandVectorizer import StrandVectorizer
from StrandNonBinaryClassifier import StrandNonBinaryClassifier

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

pipe = Pipeline([
    ('SV', StrandVectorizer()),
    ('Strand', StrandNonBinaryClassifier())
    
])

stratified_cross_validate(pipe,X,y,cv,1)

Fold Scores:
 
[0.9122807  0.84210526 0.77192982 0.80701754 0.80701754 0.80701754
 0.89473684 0.84210526 0.8245614  0.83928571]
 
Mean Accuracy:  0.8348057644110275
Mean Fit Time:  0.11355674266815186
Mean Score Time:  0.03130602836608887
CV Time:  1.4858856201171875


In [15]:
stratified_cross_validate(pipe,X,y,cv,-1)

Fold Scores:
 
[0.87719298 0.80701754 0.78947368 0.77192982 0.84210526 0.75438596
 0.89473684 0.85964912 0.85964912 0.85714286]
 
Mean Accuracy:  0.8313283208020051
Mean Fit Time:  0.25518944263458254
Mean Score Time:  0.04558382034301758
CV Time:  2.5811712741851807


### I suspect the problem is with the classifier, since this does not happen when I replace the classifier with MultinomialNB() 

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from StrandVectorizer import StrandVectorizer
from sklearn.naive_bayes import MultinomialNB

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

pipe = Pipeline([
    #('SV', StrandVectorizer(binary=True, norm=None, use_idf=False, smooth_idf=False)),
    ('SV', StrandVectorizer()),
    #('Strand', StrandNonBinaryClassifier())
    ('Strand', MultinomialNB())
    
])

stratified_cross_validate(pipe,X,y,cv,-1)

Fold Scores:
 
[0.75438596 0.66666667 0.66666667 0.66666667 0.68421053 0.64912281
 0.73684211 0.71929825 0.63157895 0.69642857]
 
Mean Accuracy:  0.68718671679198
Mean Fit Time:  0.20259494781494142
Mean Score Time:  0.0390531063079834
CV Time:  2.3612422943115234


In [59]:
stratified_cross_validate(pipe,X,y,cv,-1)

Fold Scores:
 
[0.75438596 0.66666667 0.66666667 0.66666667 0.68421053 0.64912281
 0.73684211 0.71929825 0.63157895 0.69642857]
 
Mean Accuracy:  0.68718671679198
Mean Fit Time:  0.15931181907653807
Mean Score Time:  0.03749966621398926
CV Time:  0.3480038642883301


### I even tried to copy the behavior of the pipeline and fire fit_transform(), fit(), transform(), predict() for further testing.  
* In this case, the the accuracy scores remain consistent.  
* The only place they break is in the Pipeline()

In [37]:
from sklearn.metrics import accuracy_score

def test_cv():
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv_splits = cv.split(X, y)

    scores = []
    for i in range(10):
        train, test = next(cv_splits)
        X_train = pd.DataFrame(ds.data[train])
        y_train = y[train]
        X_test = pd.DataFrame(ds.data[test])
        y_test = y[test]

        sv = StrandVectorizer()
        strand = StrandNonBinaryClassifier()

        Xsv_train = sv.fit_transform(X_train) 
        strand.fit(Xsv_train, y_train)

        Xsv_test = sv.transform(X_test)
        y_predict = strand.predict(Xsv_test)

        acc = accuracy_score(y_test, y_predict)
        scores.append(acc)
        print(acc)    

    print('Mean Accuracy: ', np.mean(scores))

In [41]:
for i in range(10):
    test_cv()

0.9122807017543859
0.8245614035087719
0.8070175438596491
0.7894736842105263
0.8070175438596491
0.7894736842105263
0.9122807017543859
0.8421052631578947
0.8596491228070176
0.8392857142857143
Mean Accuracy:  0.838314536340852
0.9122807017543859
0.8245614035087719
0.8070175438596491
0.7894736842105263
0.8070175438596491
0.7894736842105263
0.9122807017543859
0.8421052631578947
0.8596491228070176
0.8392857142857143
Mean Accuracy:  0.838314536340852
0.9122807017543859
0.8245614035087719
0.8070175438596491
0.7894736842105263
0.8070175438596491
0.7894736842105263
0.9122807017543859
0.8421052631578947
0.8596491228070176
0.8392857142857143
Mean Accuracy:  0.838314536340852
0.9122807017543859
0.8245614035087719
0.8070175438596491
0.7894736842105263
0.8070175438596491
0.7894736842105263
0.9122807017543859
0.8421052631578947
0.8596491228070176
0.8392857142857143
Mean Accuracy:  0.838314536340852
0.9122807017543859
0.8245614035087719
0.8070175438596491
0.7894736842105263
0.8070175438596491
0.7894736

### Classifier scores also appear consistent during repeated runs of GridSearchCV()

In [14]:
from StrandVectorizer import StrandVectorizer
sv = StrandVectorizer(truncate_to=2)
X_sv = sv.fit_transform(X)
X_sv

<569x1953 sparse matrix of type '<class 'numpy.float64'>'
	with 17070 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.model_selection import GridSearchCV
from StrandNonBinaryClassifier import StrandNonBinaryClassifier
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

model = StrandNonBinaryClassifier()

param_grid = [
    {
        'use_cat_idf': [True, False],
        'use_acf': [True, False],
        'train_in_binary': [True, False]
    }] 

for i in range(10):
    # Perform a grid search truncating all numeric features at the same precision 
    grid_search = GridSearchCV(model, n_jobs=-1, param_grid=param_grid, cv=cv, scoring='accuracy')
    gs = grid_search.fit(X_sv, y)


    print('Top Grid Search Results\n')
    data = {'params': gs.cv_results_['params'], 'mean_test_score': gs.cv_results_['mean_test_score'] }
    df = pd.DataFrame(data)
    print(df.sort_values(['mean_test_score'],ascending=False))
    print('-----------------------------------------------------------------')

Top Grid Search Results

                                              params  mean_test_score
2  {'train_in_binary': True, 'use_acf': False, 'u...         0.929762
3  {'train_in_binary': True, 'use_acf': False, 'u...         0.929762
6  {'train_in_binary': False, 'use_acf': False, '...         0.929762
7  {'train_in_binary': False, 'use_acf': False, '...         0.929762
0  {'train_in_binary': True, 'use_acf': True, 'us...         0.929731
4  {'train_in_binary': False, 'use_acf': True, 'u...         0.908741
1  {'train_in_binary': True, 'use_acf': True, 'us...         0.840194
5  {'train_in_binary': False, 'use_acf': True, 'u...         0.506203
-----------------------------------------------------------------
Top Grid Search Results

                                              params  mean_test_score
2  {'train_in_binary': True, 'use_acf': False, 'u...         0.929762
3  {'train_in_binary': True, 'use_acf': False, 'u...         0.929762
6  {'train_in_binary': False, 'use_acf': Fal