# Create data

Import libraries

In [1]:
from typing import List, Tuple
from Bio import SeqIO
import random

# One-letter codes of the 20 standard amino acids
aa_list = list('ARNDCQEGHILKMFPSTWYV')

Loading the protein names and sequences:

In [2]:
path_to_file = '../data/wt.fasta'

# dataset is a list of (record name, sequence string) pairs, one per FASTA record.
# The context manager guarantees the file handle is closed once parsing is done
# (a bare open() passed straight to SeqIO.parse would stay open).
with open(path_to_file, mode='r') as fasta_handle:
    dataset = [
        (seq_record.name, str(seq_record.seq))
        for seq_record in SeqIO.parse(fasta_handle, 'fasta')
    ]

Generate random single-residue mutations:

In [9]:
def sample_mutations(dataset:list, num_samples:int, alphabet=None) -> set:
    """
    Creating random single-point mutations

    Repeatedly picks a random (name, sequence) pair, a random position in that
    sequence and a random replacement letter, until `num_samples` distinct
    mutants have been collected. A draw whose replacement equals the residue
    already at that position is rejected, so every returned sequence differs
    from its wild type in exactly one position.

    Args:
        dataset: iterable of (name, sequence) pairs. Empty sequences are
            skipped because they have no position to mutate.
        num_samples: number of distinct mutants to produce.
        alphabet: letters to substitute with; defaults to the notebook-level
            `aa_list`. Duplicate letters are ignored.

    Returns:
        A set of (label, mutated_sequence) tuples, where label is
        "<name>|<original residue><0-based position><new residue>".

    Raises:
        AssertionError: if the dataset cannot yield `num_samples` distinct
            mutations.

    Note:
        The capacity check assumes the (name, sequence) pairs are unique;
        duplicated pairs make the printed maximum an overestimate.
    """
    # de-duplicate the alphabet while keeping its order (stable under a fixed seed)
    letters = tuple(dict.fromkeys(aa_list if alphabet is None else alphabet))
    letter_set = set(letters)

    # sequences that actually have a position to mutate
    candidates = [(name, seq) for name, seq in dataset if seq]

    # exact number of distinct true substitutions: every position can take any
    # alphabet letter except the one it already holds
    max_mutations = sum(
        len(seq) * len(letters) - sum(1 for residue in seq if residue in letter_set)
        for name, seq in candidates
    )

    print(f"Max number of different mutations: {max_mutations}")
    assert max_mutations >= num_samples, "Can't create this many mutations for this dataset!"

    # set for mutated sequences; the set silently drops repeated draws
    res = set()
    while len(res) < num_samples:
        name, seq = random.choice(candidates)
        pos = random.randint(0, len(seq)-1)
        new_aa = random.choice(letters)
        if new_aa == seq[pos]:
            # identical to the wild-type residue: not a mutation, draw again
            continue
        mut_seq = seq[:pos] + new_aa + seq[pos+1:]
        res.add((f"{name}|{seq[pos]}{pos}{new_aa}", mut_seq))
    return res

Save the results to a FASTA file:

In [8]:
def dataset_to_fasta(ds:list, fasta_file:str) -> None:
    """
    Write (name, sequence) pairs to a FASTA file.

    Every pair is emitted as a ">name" header line followed by the sequence
    on one line. The target file is opened in write mode, so existing
    content is replaced.
    """
    with open(fasta_file, "w") as out:
        # one two-line FASTA record per pair, streamed straight to the file
        out.writelines(f">{name}\n{seq}\n" for name, seq in ds)

In [5]:
# Seed the RNG so the same set of mutations can be regenerated on a re-run
random.seed(42)
# num_samples is annotated as int, so pass an integer literal rather than the float 1e7
mutations = sample_mutations(dataset, 10_000_000)
dataset_to_fasta(mutations, "../data/mut.fasta")

Max number of different mutations: 21716677


We cloned the [ESM](https://github.com/facebookresearch/esm) (Evolutionary Scale Modeling) repository into our repository:

In [None]:
# Clone over SSH (git@github.com:... remote); this needs an SSH key registered with GitHub.
!git clone git@github.com:facebookresearch/esm.git

First we download the model:

In [None]:
# Fetch the esm1b_t33_650M_UR50S checkpoint; wget saves it into the current working directory.
!wget https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt

Then we compute the embeddings in bulk for the sequences in the `wt.fasta` and `mut.fasta` FASTA files.

In [None]:
# Shell commands need the leading "!" to run from a notebook code cell;
# without it the cell is parsed as Python and fails with a SyntaxError.
# NOTE(review): the FASTA paths here are relative to the current working directory,
# while earlier cells read/write ../data/ - confirm where these commands are run from.
!python esm/scripts/extract.py esm1b_t33_650M_UR50S wt.fasta wt --repr_layers 33 --include mean
!python esm/scripts/extract.py esm1b_t33_650M_UR50S mut.fasta mut --repr_layers 33 --include mean