In [1]:
import numpy as np

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

from scipy.stats import zscore
from scipy.stats import norm
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Experiment configuration.
train_num = 500  # number of rows in the synthetic training set
test_num = 50    # number of rows in the synthetic test set
dim = 20         # number of feature columns in both sets

# Every later cell draws from NumPy's global random generator; fixing the
# seed here makes a fresh "Restart & Run All" produce the same numbers.
seed = 0
np.random.seed(seed)

In [3]:
# Synthetic training set: standard-normal features and labels drawn
# uniformly from {1, 2}, independently of the features.
train_X = np.random.standard_normal((train_num, dim))
train_y = np.random.randint(low=1, high=3, size=train_num)
(train_X.shape, train_y.shape)

((500, 20), (500,))

In [4]:
# Synthetic test set: standard-normal features and labels drawn
# uniformly from {1, 2}, independently of the features.
test_X = np.random.standard_normal((test_num, dim))
test_y = np.random.randint(low=1, high=3, size=test_num)
(test_X.shape, test_y.shape)

((50, 20), (50,))

In [5]:
def mk_pipe(gamma='auto'):
    """Build a new, unfitted single-step pipeline around an RBF-kernel SVC.

    Parameters
    ----------
    gamma
        Forwarded unchanged as the ``gamma`` argument of ``SVC``
        (defaults to ``'auto'``).

    Returns
    -------
    Pipeline
        A pipeline whose only step, named ``'clf'``, is the classifier.
    """
    classifier = SVC(kernel='rbf', gamma=gamma)
    steps = [('clf', classifier)]
    return Pipeline(steps)

In [6]:
# Test-set accuracy of 1000 classifiers, each fitted on its own random
# subsample of the training data.
scores = []
for _ in range(1000):
    # Each training row is kept independently with probability 1/5.
    select = np.random.randint(low=1, high=6, size=train_num) == 1
    X, y = train_X[select], train_y[select]

    # fit() returns the fitted pipeline itself, so the calls can be chained.
    pipe = mk_pipe().fit(X, y)
    pred_y = pipe.predict(test_X)
    acc = metrics.accuracy_score(test_y, pred_y)

    scores.append(acc)

# Best accuracy reached by any of the resampled models.
max(scores)

0.66

In [8]:
# Markdown table: for j randomly chosen scores, report the best one together
# with its z-score within that sample, its one-sided normal p-value, and the
# FDR-corrected p-value.
print('| num | acc | z-value | p-value | fdr-p-value |')
print('| ---- | ---- | ---- | ---- | ---- |')
for j in [5, 10, 20, 50, 100, 200, 500, 1000]:
    # Draw j distinct scores; ascending order puts the best one at index -1.
    s = sorted(np.random.choice(scores, size=j, replace=False))
    z = zscore(s)
    # Upper-tail probability under a standard normal. The survival function
    # is evaluated on the whole array at once and avoids the cancellation
    # error of computing 1 - cdf for large z.
    p = norm.sf(z)
    # fdrcorrection returns (rejected flags, corrected p-values).
    pc = fdrcorrection(p)
    print(f'| {j: 5d} | {s[-1]: .4f} | {z[-1]: .4f} | {p[-1]: .4f} | {pc[1][-1]: .4f} |')

| num | acc | z-value | p-value | fdr-p-value |
| ---- | ---- | ---- | ---- | ---- |
|     5 |  0.5400 |  1.1282 |  0.1296 |  0.5854 |
|    10 |  0.6000 |  1.7096 |  0.0437 |  0.4367 |
|    20 |  0.6000 |  2.1842 |  0.0145 |  0.1447 |
|    50 |  0.6400 |  1.9727 |  0.0243 |  0.6066 |
|   100 |  0.6600 |  2.6816 |  0.0037 |  0.3664 |
|   200 |  0.6200 |  2.2338 |  0.0127 |  0.4437 |
|   500 |  0.6600 |  2.8135 |  0.0025 |  0.5993 |
|  1000 |  0.6600 |  2.8369 |  0.0023 |  0.5554 |
