In [None]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", 
    model="Rostlab/prot_electra_generator_bfd",
    tokenizer="Rostlab/prot_electra_generator_bfd"
)

In [None]:
mask_filler("MAT[MASK]ALM")

[{'score': 0.34794163703918457,
  'token': 23,
  'token_str': 'C',
  'sequence': 'C'},
 {'score': 0.16164058446884155,
  'token': 10,
  'token_str': 'S',
  'sequence': 'S'},
 {'score': 0.06785508245229721, 'token': 5, 'token_str': 'L', 'sequence': 'L'},
 {'score': 0.06539598107337952,
  'token': 13,
  'token_str': 'R',
  'sequence': 'R'},
 {'score': 0.052951961755752563,
  'token': 7,
  'token_str': 'G',
  'sequence': 'G'}]

# Variant Mutation Processing

In [None]:
import pandas as pd
import math
from statistics import mean

In [None]:
def isNaN(num):
    return num != num

In [None]:
mutations = ['D614G', 'D950N', 'E484K', 'H655Y', 'H655Y', 'K417N', 'N501Y', 'P681H', 'Q493R']

# Common Variant Mutations Masking Testing

In [None]:
mutations_seqs = {}
[mutations_seqs.setdefault(name, []) for name in mutations]

mutations_seqs_results = {}
[mutations_seqs_results.setdefault(name, []) for name in mutations]

mutations_seqs_original = {}
[mutations_seqs_original.setdefault(name, []) for name in mutations]

[[], [], [], [], [], [], [], [], []]

In [None]:
line_count = 0

with open('validation-set.txt', 'r') as file:
    for line in file:
        line_count += 1
        print(line_count)
        for mutation in mutations:
            original = mutation[0]
            change = mutation[-1]
            position = int("".join(mutation[1:-1]))-1

            if position < len(line) and line[position] == original:
                mutations_seqs[mutation].append(line)
                sequence = line[:position] + '[MASK]' + line[position+1:]
                curr_results = mask_filler(sequence)

                curr_sequence_pred = 0
                curr_sequence_nochange = 0

                for i in range(0, len(curr_results)):
                    if curr_results[i]['token_str'] == change:
                        curr_sequence_pred = curr_results[i]['score']
                    elif curr_results[i]['token_str'] == original:
                        curr_sequence_nochange = curr_results[i]['score']
                    if curr_sequence_pred > 0 and curr_sequence_nochange > 0:
                        break
                    
                mutations_seqs_results[mutation].append(curr_sequence_pred)
                mutations_seqs_original[mutation].append(curr_sequence_nochange)

In [None]:
mutations_seqs_results_mean = {}
[mutations_seqs_results_mean.setdefault(name, 0) for name in mutations]

mutations_seqs_original_mean = {}
[mutations_seqs_original_mean.setdefault(name, 0) for name in mutations]

for mutation in mutations_seqs_original:
    curr_list = mutations_seqs_original[mutation]
    mutations_seqs_original_mean[mutation] = round(mean(curr_list), 2)

for mutation in mutations_seqs_results:
    curr_list = mutations_seqs_results[mutation]
    mutations_seqs_results_mean[mutation] = round(mean(curr_list), 2)

In [None]:
mutations_seqs_results_mean

{'D614G': 0.09,
 'D950N': 0,
 'E484K': 0,
 'H655Y': 0,
 'K417N': 0,
 'N501Y': 0,
 'P681H': 0,
 'Q493R': 0.08}

In [None]:
mutations_seqs_original_mean

{'D614G': 0,
 'D950N': 0,
 'E484K': 0,
 'H655Y': 0,
 'K417N': 0,
 'N501Y': 0,
 'P681H': 0,
 'Q493R': 0}

In [None]:
for mutation in mutations_seqs_results:
    print(len(mutations_seqs_results[mutation]))

68
2063
1799
3776
2012
1878
1708
2171


In [None]:
for mutation in mutations_seqs_results:
    print(mutation + ' : ' + str(max(mutations_seqs_results[mutation])))

D614G : 0.09357799589633942
D950N : 0
E484K : 0
H655Y : 0
K417N : 0
N501Y : 0
P681H : 0
Q493R : 0.07783561199903488
