<h2>PerQoDA</h2>
<h3>Dataset Quality Assessment with Permutation Testing</h3>


Load the required libraries. See the README for instructions on installing weles; the other packages can be installed with pip.

In [2]:
import weles as ws
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm

Prepare the dataset (`X1` - data, `y1` - labels). Define `X1` and `y1` before running the cell below; see the `Examples` folder for examples.

In [340]:
# Map a dataset name to its (data, labels) pair; this mapping is what gets
# passed to the evaluator as `datasets`.
datasets = dict(all=(X1, y1))

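Select the classifiers and the metric. The names used below must be imported before running this cell: the classifiers are expected to come from scikit-learn (`sklearn.neighbors`, `sklearn.ensemble`) and `sensitivity_score` from imbalanced-learn (`imblearn.metrics`). You may need to install these packages with pip.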

In [375]:
# Classifiers to evaluate, keyed by the short name used to identify each one.
clfs = dict(
    KNN=KNeighborsClassifier(),
    RF=RandomForestClassifier(),
    AB=AdaBoostClassifier(),
)

# Metric(s) used to score every classifier, keyed by metric name.
metrics = dict(recall=sensitivity_score)

Run Weles on the original (unpermuted) labels and save the true results

In [None]:
# protocol2=(shuffle, folds, random_state)
protocol = (False, 2, None)

# Build the evaluator for the original data, then run every classifier on it.
evaluator = ws.evaluation.Evaluator(datasets=datasets, protocol2=protocol)
ev = evaluator.process(clfs=clfs, verbose=1)

# Score the processed results with the selected metric(s).
scores = ev.score(metrics=metrics)

Run PerQoDA

In [None]:
nperm = 100  # number of permutations
perc = [50, 25, 10, 5, 1]  # percentage of labels permuted within each class
a = np.shape(ev.scores.mean(axis=2)[:, :, 0])  # shape of the true results

# perm[i, j, k]   -> score of classifier k for permutation i at percentage j
# perc_true[i, j] -> fraction of labels left unchanged by that permutation
perm = np.zeros((nperm, len(perc), a[1]))
perc_true = np.zeros((nperm, len(perc)))

# Work on a plain array so that all indexing below is positional.
y_true = np.asarray(y1)

# Sample positions of every class; these never change, so compute them once.
indices = [np.where(y_true == k)[0] for k in np.unique(y_true)]

# Number of labels to draw from each class, for every percentage level.
counts_per_perc = []
for p in perc:
    counts = [round(p * len(index) / 100) for index in indices]
    # Shuffling can only change a label if the drawn samples span at least two
    # classes; otherwise the retry loop below would never terminate.
    if sum(count > 0 for count in counts) < 2:
        raise ValueError(
            f"Permuting {p}% of the labels selects samples from fewer than "
            "two classes, so no label can change; use a larger percentage "
            "or more data."
        )
    counts_per_perc.append(counts)

for i in range(nperm):
    for j, counts in enumerate(counts_per_perc):

        print(i, j)

        # Redraw until the permutation changes at least one label.
        while True:
            # Draw the requested number of positions from each class, then
            # shuffle the pooled positions so labels move between classes.
            indP = np.random.permutation(
                np.concatenate(
                    [
                        np.random.permutation(ind)[:nperc]
                        for ind, nperc in zip(indices, counts)
                    ]
                )
            )

            ind = np.sort(indP)

            # Place the labels found at the shuffled positions back into the
            # same set of positions taken in sorted order.
            y1P = np.copy(y_true)
            y1P[ind] = y_true[indP]

            if not (y_true == y1P).all():
                break

        datasetsP = {
            "all": (X1, y1P)
        }

        evP = ws.evaluation.Evaluator(datasets=datasetsP, protocol2=(False, 2, None)).process(clfs=clfs, verbose=0)

        # Called so that evP.scores is populated (same usage as for the true
        # results); the return value is not bound, so the `scores` of the
        # true run stay available.
        evP.score(metrics=metrics)

        perm[i, j, :] = evP.scores.mean(axis=2)[:, :, 0]

        # Fraction of labels that still equal the original ones.
        perc_true[i, j] = np.sum(y1P == y_true) / len(y_true)

Visualize ML results and print the p-value table

In [None]:
# Classifier names are taken from `clfs` so that the plot and the table follow
# the configuration cell (assumes the classifier axis of `ev.scores` keeps the
# insertion order of `clfs` - the same assumption the hard-coded names made).
clf_names = list(clfs)

# True (unpermuted) results, computed once instead of on every iteration.
true_scores = ev.scores.mean(axis=2)[:, :, 0]

pvalues = np.zeros((a[1], len(perc)))
colors = cm.rainbow(np.linspace(0, 1, a[1]))

# plot true values as diamonds; the labels feed the legend directly
for i, c in zip(range(a[1]), colors):
    plt.scatter(1.1 + i * 0.01, true_scores[:, i], s=100, color=c, marker='d', label=clf_names[i])

plt.legend()

# plot lines for true values
for i, c in zip(range(a[1]), colors):
    plt.plot([0, 1.1 + i * 0.01], [true_scores[:, i], true_scores[:, i]], c=c, linestyle='dashed', alpha=0.5)

# plot permutations that score below the true value (hollow markers)
for j in range(len(perc)):
    for i, c in zip(range(a[1]), colors):
        ind = np.where(perm[:, j, i] < true_scores[:, i])
        plt.scatter((perc_true[ind, j]), perm[ind, j, i], color="none", edgecolor=c, alpha=0.3)

# plot permutations that reach or exceed the true value (filled markers, drawn
# in a second pass so they end up on top) and derive the p-values from them
for j in range(len(perc)):
    for i, c in zip(range(a[1]), colors):
        ind = np.where(perm[:, j, i] >= true_scores[:, i])
        plt.scatter((perc_true[ind, j]), perm[ind, j, i], color=c, edgecolor="black", alpha=1)
        # (count of permutations at least as good as the true result + 1) / (nperm + 1)
        pvalues[i, j] = (len(ind[0]) + 1) / (nperm + 1)

plt.ylabel('Performance (Recall)', size=12)
plt.xlabel('Permutation Fraction True', size=12)

# dashed red line at the lowest score over all permutations
plt.plot([0, 1.1], [perm.min(), perm.min()], color='red', linestyle='dashed', alpha=0.5)

plt.axis([-0.05, 1.2, 0, 1.1])

plt.show()

# One row per classifier, one column per percentage level.
pv = pd.DataFrame(data=pvalues, index=clf_names, columns=[f"{p}%" for p in perc])

def significant(v):
    """Return the bold-red CSS for p-values above 0.01, otherwise None."""
    return "font-weight: bold; color: red" if v > 0.01 else None

# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever this
# pandas version provides.
styler = pv.style
highlight = getattr(styler, "map", None) or styler.applymap
highlight(significant)

Calculate the slope

In [None]:
# Classifier names come from `clfs` (assumes the classifier axis of `perm`
# keeps the insertion order of `clfs`).
names = list(clfs)
slopes = []

# Mean fraction of unchanged labels per percentage level; identical for every
# classifier, so it is computed once.
cor = np.mean(perc_true, axis=0)

for i, c in zip(range(a[1]), colors):
    # Mean score of classifier i per percentage level.
    per = np.mean(perm[:, :, i], axis=0)
    plt.scatter(cor, per, color=c, alpha=1)

    # Least-squares line through the per-level means.
    slope, intercept = np.polyfit(cor, per, 1)
    # The label is attached to the fitted line so that the legend does not
    # depend on the order in which the artists were created.
    plt.plot(cor, slope * cor + intercept, color=c, linewidth=0.8, label=names[i])
    print(names[i], '=', slope)
    slopes.append(slope)

plt.legend()
plt.ylabel('Performance (mean)', size=12)
plt.xlabel('Permutation Fraction True', size=12)

maxind = np.argmax(slopes)

print('Slope:', np.max(slopes), '-', names[maxind])