## Load data and predictions

In [4]:
import json

# JSON text is UTF-8 by specification; name the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open('data/data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Concatenate every protein's strings into one pseudo-protein so that metrics
# computed on it are per-residue, i.e. longer proteins contribute more.
# Kept as a one-element list so it can be passed to the same functions as `data`.
merged_data = [{
    key: ''.join(p[key] for p in data)
    for key in ('labels', 'jpred4', 'preds')
}]

## Performance statistics

In [14]:
import numpy as np


def accuracy(protein_dict):
    """Fraction of positions at which each predictor agrees with the labels.

    Args:
        protein_dict: mapping with the keys 'labels', 'preds' and 'jpred4',
            each an indexable sequence compared position by position.

    Returns:
        A pair ``(preds_accuracy, jpred4_accuracy)``. When 'labels' is empty
        the pair ``(1, 1)`` is returned.
    """
    truth = protein_dict['labels']
    model_out = protein_dict['preds']
    jpred_out = protein_dict['jpred4']

    total = len(truth)
    if total == 0:
        return 1, 1

    model_hits = 0
    jpred_hits = 0
    for pos, expected in enumerate(truth):
        # A bool adds as 0 or 1, so this counts the matching positions.
        model_hits += model_out[pos] == expected
        jpred_hits += jpred_out[pos] == expected

    return model_hits / total, jpred_hits / total


def sensitivity(protein_dict, cls):
    """Share of the positions labelled ``cls`` that each predictor also calls ``cls``.

    Args:
        protein_dict: mapping with the keys 'labels', 'preds' and 'jpred4'.
        cls: the class symbol to evaluate.

    Returns:
        A pair ``(preds_sensitivity, jpred4_sensitivity)``. When ``cls`` does
        not occur in 'labels' the pair ``(1, 1)`` is returned, so such a
        protein counts as a perfect score when results are averaged.
    """
    truth = protein_dict['labels']
    model_out = protein_dict['preds']
    jpred_out = protein_dict['jpred4']

    model_hits = 0
    jpred_hits = 0
    for pos, expected in enumerate(truth):
        if expected == cls:
            # Only positions whose true class is `cls` are inspected.
            model_hits += model_out[pos] == expected
            jpred_hits += jpred_out[pos] == expected

    positives = truth.count(cls)
    if positives == 0:
        return 1, 1
    return model_hits / positives, jpred_hits / positives


def specificity(protein_dict, cls):
    """Share of the positions a predictor calls ``cls`` whose label is also ``cls``.

    The ratio computed is TP / (TP + FP): the denominator is the number of
    positions *predicted* as ``cls``, not the number of true negatives.
    NOTE(review): this is the quantity usually named precision; confirm that
    the name "specificity" is the intended convention here.

    Args:
        protein_dict: mapping with the keys 'labels', 'preds' and 'jpred4'.
        cls: the class symbol to evaluate.

    Returns:
        A pair ``(preds_score, jpred4_score)``. A predictor that never
        outputs ``cls`` (over the labelled positions) gets a score of 1.
    """
    truth = protein_dict['labels']

    def _score(predicted):
        # Count the positions called `cls`, and how many of those are right.
        called = 0
        correct = 0
        for pos, expected in enumerate(truth):
            if predicted[pos] != cls:
                continue
            called += 1
            if expected == cls:
                correct += 1
        return 1 if called == 0 else correct / called

    model_out = protein_dict['preds']
    jpred_out = protein_dict['jpred4']
    return _score(model_out), _score(jpred_out)


def average_accuracy(data):
    """Mean of the per-protein accuracies over ``data``.

    Args:
        data: iterable of protein dicts accepted by ``accuracy``.

    Returns:
        A pair ``(mean_preds_accuracy, mean_jpred4_accuracy)``; every protein
        contributes equally regardless of its length.
    """
    # One pass over `data`, keeping the (preds, jpred4) pair of each protein.
    per_protein = [accuracy(protein_dict) for protein_dict in data]
    preds_mean = np.mean([pair[0] for pair in per_protein])
    jpred4_mean = np.mean([pair[1] for pair in per_protein])
    return preds_mean, jpred4_mean


def average_metrics(data, cls=None):
    """Average per-protein metrics over ``data``.

    Args:
        data: iterable of protein dicts with 'labels', 'preds' and 'jpred4'.
        cls: class symbol to evaluate; when ``None`` the result of
            ``average_accuracy(data)`` is returned instead.

    Returns:
        With ``cls`` given, a 4-tuple ``(preds_sensitivity,
        jpred4_sensitivity, preds_specificity, jpred4_specificity)`` of
        unweighted means over the proteins. With ``cls=None``, the 2-tuple
        produced by ``average_accuracy``.
    """
    if cls is None:
        return average_accuracy(data)

    # One row per protein: (preds_sens, jpred4_sens, preds_spec, jpred4_spec).
    rows = []
    for protein_dict in data:
        sens_pair = sensitivity(protein_dict, cls)
        spec_pair = specificity(protein_dict, cls)
        rows.append(tuple(sens_pair) + tuple(spec_pair))

    return tuple(np.mean([row[col] for row in rows]) for col in range(4))

In [17]:
# Statistics weighted by protein length: merged_data holds all residues pooled
# into a single entry, so longer proteins contribute more positions.
preds_acc, jpred4_acc = average_accuracy(merged_data)
preds_sens_a, jpred4_sens_a, preds_spec_a, jpred4_spec_a = average_metrics(merged_data, 'h')
preds_sens_b, jpred4_sens_b, preds_spec_b, jpred4_spec_b = average_metrics(merged_data, 'e')
preds_sens_c, jpred4_sens_c, preds_spec_c, jpred4_spec_c = average_metrics(merged_data, '_')

print("===== Statistics weighted by protein length =====")
for caption, score in [
    ("Model accuracy", preds_acc),
    ("Model helix sensitivity", preds_sens_a),
    ("Model sheet sensitivity", preds_sens_b),
    ("Model coil sensitivity", preds_sens_c),
    ("Model helix specificity", preds_spec_a),
    ("Model sheet specificity", preds_spec_b),
    ("Model coil specificity", preds_spec_c),
    ("JPred4 accuracy", jpred4_acc),
    ("JPred4 helix sensitivity", jpred4_sens_a),
    ("JPred4 sheet sensitivity", jpred4_sens_b),
    ("JPred4 coil sensitivity", jpred4_sens_c),
    ("JPred4 helix specificity", jpred4_spec_a),
    ("JPred4 sheet specificity", jpred4_spec_b),
    ("JPred4 coil specificity", jpred4_spec_c),
]:
    print(f"{caption}: {score:.4f}")


# Statistics not weighted by protein length: each protein is scored on its own
# and the per-protein scores are averaged with equal weight.
preds_acc, jpred4_acc = average_accuracy(data)
preds_sens_a, jpred4_sens_a, preds_spec_a, jpred4_spec_a = average_metrics(data, 'h')
preds_sens_b, jpred4_sens_b, preds_spec_b, jpred4_spec_b = average_metrics(data, 'e')
preds_sens_c, jpred4_sens_c, preds_spec_c, jpred4_spec_c = average_metrics(data, '_')

print("===== Statistics not weighted by protein length =====")
for caption, score in [
    ("Model accuracy", preds_acc),
    ("Model helix sensitivity", preds_sens_a),
    ("Model sheet sensitivity", preds_sens_b),
    ("Model coil sensitivity", preds_sens_c),
    ("Model helix specificity", preds_spec_a),
    ("Model sheet specificity", preds_spec_b),
    ("Model coil specificity", preds_spec_c),
    ("JPred4 accuracy", jpred4_acc),
    ("JPred4 helix sensitivity", jpred4_sens_a),
    ("JPred4 sheet sensitivity", jpred4_sens_b),
    ("JPred4 coil sensitivity", jpred4_sens_c),
    ("JPred4 helix specificity", jpred4_spec_a),
    ("JPred4 sheet specificity", jpred4_spec_b),
    ("JPred4 coil specificity", jpred4_spec_c),
]:
    print(f"{caption}: {score:.4f}")

===== Statistics weighted by protein length =====
Model accuracy: 0.5592
Model helix sensitivity: 0.4519
Model sheet sensitivity: 0.3910
Model coil sensitivity: 0.6715
Model helix specificity: 0.4548
Model sheet specificity: 0.3941
Model coil specificity: 0.6676
JPred4 accuracy: 0.7974
JPred4 helix sensitivity: 0.8242
JPred4 sheet sensitivity: 0.7188
JPred4 coil sensitivity: 0.8143
JPred4 helix specificity: 0.8311
JPred4 sheet specificity: 0.6895
JPred4 coil specificity: 0.8242
===== Statistics not weighted by protein length =====
Model accuracy: 0.5695
Model helix sensitivity: 0.5040
Model sheet sensitivity: 0.5254
Model coil sensitivity: 0.6857
Model helix specificity: 0.3997
Model sheet specificity: 0.3345
Model coil specificity: 0.6721
JPred4 accuracy: 0.8036
JPred4 helix sensitivity: 0.7621
JPred4 sheet sensitivity: 0.7759
JPred4 coil sensitivity: 0.8208
JPred4 helix specificity: 0.8110
JPred4 sheet specificity: 0.6854
JPred4 coil specificity: 0.8376
