In [7]:
from Bio import SeqIO
import pandas as pd
import os

In [13]:
#Together, these two functions take a large FASTA file and write the nucleotide data it contains,
#as fixed-size strings, into separate files, one for each specimen of malaria included in the file

#RUNNING THESE WILL CREATE FILES
def _helper(filename = "Plasmodium_falciparum_3D7_Genome.fasta"):
    """Yield ``(sequence, key)`` for every record of a FASTA file.

    The file is parsed with ``SeqIO.parse(filename, "fasta")`` and collected
    into a dictionary by ``SeqIO.to_dict`` before anything is yielded, so the
    whole file is read on the first ``next()`` call.

    Parameters
    ----------
    filename : path of the FASTA file to read.

    Yields
    ------
    tuple of (``record.seq``, dictionary key assigned by ``SeqIO.to_dict``).
    """
    #index every record of the genome file by its dictionary key
    records_by_key = SeqIO.to_dict(SeqIO.parse(filename, "fasta"))
    for record_key, record in records_by_key.items():
        yield record.seq, record_key
def create_helperdata(CHUNCK_SIZE=512):
    """Write every record yielded by ``_helper()`` to its own ``<name>.txt`` file.

    Each sequence is cut into consecutive, non-overlapping pieces of
    ``CHUNCK_SIZE`` characters and written one piece per line. Only whole
    pieces are written: a trailing remainder shorter than ``CHUNCK_SIZE`` is
    dropped, so every line of the output has the same length.

    RUNNING THIS WILL CREATE FILES: one ``<name>.txt`` per record, in the
    current working directory, overwriting any existing file of that name.

    Parameters
    ----------
    CHUNCK_SIZE : number of characters per output line (default 512).

    Raises
    ------
    ValueError : if ``CHUNCK_SIZE`` is smaller than 1.
    """
    if CHUNCK_SIZE < 1:
        raise ValueError("CHUNCK_SIZE must be a positive integer")
    for sequence, name in _helper():
        #the file handling must live inside the loop: one output file per record
        with open(f"{name}.txt", "w") as f:
            #integer division discards the incomplete chunk at the end
            chuncks = len(sequence) // CHUNCK_SIZE
            for i in range(chuncks):
                indx = i * CHUNCK_SIZE
                chunck = sequence[indx:indx + CHUNCK_SIZE]
                f.write(f"{chunck}\n")

            

In [1]:
#Each file contains many strings of size=CHUNCK_SIZE (default 512); these functions pair each chunk with the chunk that follows it
def create_onepairs(filename):
    """Return every pair of consecutive lines in ``filename``.

    Lines are read in order with their trailing newline removed. The result
    is a list of ``(line_i, line_i_plus_1)`` tuples; a file with fewer than
    two lines gives an empty list.
    """
    chunks = []
    with open(filename) as handle:
        for raw_line in handle:
            chunks.append(raw_line.rstrip('\n'))
    #zip stops at the shorter input, so pairing the list with itself shifted
    #by one position gives each line together with its successor
    return list(zip(chunks, chunks[1:]))

def create_pairedData(folder_path = "data"):
    """Collect the consecutive-line pairs of every ``.txt`` file in a folder.

    Each file whose name ends in ``.txt`` is passed to ``create_onepairs`` and
    the resulting pairs are concatenated into a single list. Pairs never span
    two files.

    Parameters
    ----------
    folder_path : directory to scan (default ``"data"``). Files are opened
        from this directory, not from a fixed ``data/`` folder.

    Returns
    -------
    list of ``(chunk, next_chunk)`` tuples, files visited in sorted name order.
    """
    pairs = []
    #os.listdir returns names in arbitrary order; sorting keeps the result reproducible
    for filename in sorted(os.listdir(folder_path)):
        #match the extension only, so names such as "x.txt.bak" are skipped
        if not filename.endswith('.txt'):
            continue
        #build the path from folder_path so a non-default folder actually works
        pairs.extend(create_onepairs(os.path.join(folder_path, filename)))
    return pairs

In [28]:
#Disabled experiment: the whole cell is one string literal, so none of the code inside it runs.
#It sketches training a WordLevel tokenizer from the `tokenizers` package on data/Nucleotides.txt.
"""from tokenizers import Tokenizer, normalizers, pre_tokenizers
from tokenizers.models import WordLevel
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
from tokenizers.trainers import WordLevelTrainer
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
from tokenizers.pre_tokenizers import ByteLevel
tokenizer.pre_tokenizer = ByteLevel()
files = [f'data/Nucleotides.txt']
tokenizer.train(files, trainer)
output = tokenizer.encode("A GTCG CGCGCGTTTTAA")"""

In [2]:
#Custom tokenizer for the nucleotide data
class tokenizer():
    """Lookup-table tokenizer for nucleotide data.

    Attributes
    ----------
    vocab : dict mapping each token string to its integer id.
    tokens : dict mapping each integer id back to its token string.
    input_ids : result of the most recent ``__call__`` (set on first call).
    output_ids : result of the most recent ``convert_ids_to_tokens`` call.
    """

    def __init__(self, vocab, special_tokens):
        """Build the id tables.

        vocab is given as a list of all words; special_tokens are extra
        symbols that aren't standard words, but rather used to delimit or do
        something within the text. Ids are assigned by position in
        ``special_tokens + vocab``, so special tokens get the lowest ids.
        """
        all_tokens = special_tokens + vocab
        #id -> token, used for decoding
        self.tokens = dict(enumerate(all_tokens))
        #token -> id, used for encoding (a repeated token keeps its last id)
        self.vocab = {token: idx for idx, token in enumerate(all_tokens)}

    def convert_ids_to_tokens(self, ids):
        """Convert a list of integer ids into the list of their tokens.

        Ex. with tokens = {0: 'word1', 1: 'word2', 3: 'word4', 4: 'word5'},
        ids = [0, 3, 4, 1] gives ['word1', 'word4', 'word5', 'word2'].
        An id that is not in the table is decoded as '[UNK]'.
        The result is also stored on ``self.output_ids``.
        """
        #self.tokens is keyed by integer id, so the fallback has to be the
        #literal '[UNK]' string rather than a lookup of '[UNK]' in that table
        self.output_ids = [self.tokens.get(oneid, '[UNK]') for oneid in ids]
        return self.output_ids

    def __call__(self, sequence):
        """Encode a sequence of nucleotides as integer ids.

        sequence is a list (or string) of nucleotides. Returns
        ``{'input_ids': [...]}`` with one id per element; an element missing
        from the vocabulary is mapped to the id of '[UNK]' (KeyError if the
        tokenizer was built without a '[UNK]' token). The result is also
        stored on ``self.input_ids``.
        """
        ids = []
        for nucleotide in sequence:
            if nucleotide in self.vocab:
                ids.append(self.vocab[nucleotide])
            else:
                #looked up lazily so a vocabulary without '[UNK]' only fails
                #when an unknown symbol is actually encountered
                ids.append(self.vocab['[UNK]'])
        self.input_ids = {'input_ids': ids}
        return self.input_ids

        

In [45]:
#Nucleotide tokenizer instance: the five special tokens plus the four bases A, T, G, C
t = tokenizer(
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]'],
    vocab=['A', 'T', 'G', 'C'],
)