In [None]:
import random
import pandas as pd

from Bio import SeqIO

from transformers import PreTrainedTokenizerFast




# Nucleotide symbol -> string of concrete bases it may stand for.
# Single-base symbols map to themselves; the multi-base entries match the
# IUPAC ambiguity codes (e.g. "R" is a purine, "N" is any base).
LETTER_TO_BASES = dict(
    A="A",
    B="CGT",
    C="C",
    D="AGT",
    G="G",
    H="ACT",
    K="GT",
    M="AC",
    N="ACGT",
    R="AG",
    S="CG",
    T="T",
    V="ACG",
    W="AT",
    Y="CT",
)

# --- Locations ---------------------------------------------------------------
# Project root; the trailing comment keeps the author's alternative location.
ROOT = 'e:/PlasmidAI' # '/scratch/adibvafa/plasmid-ai/'
DATA_ROOT = ROOT + '/data'
DATA_SPLITS = DATA_ROOT + '/splits.csv'

# Every dataset artefact lives at "<DATA_ROOT>/<BASE_FILE><suffix>".
BASE_FILE = 'plasmids_replicon'
DATASET = DATA_ROOT + '/' + BASE_FILE + '.fasta'
DATASET_TXT = DATA_ROOT + '/' + BASE_FILE + '.txt'
DATASET_CUTOFF = DATA_ROOT + '/' + BASE_FILE + '_cutoff.txt'
DATASET_DUMMY = DATA_ROOT + '/' + BASE_FILE + '_dummy.txt'
DATASET_FINETUNE = DATA_ROOT + '/' + BASE_FILE + '_finetune.txt'
DATASET_CUTOFF_RC = DATA_ROOT + '/' + BASE_FILE + '_cutoff_rc.txt'

# Bare file name (not under ROOT) -- presumably resolved against the working
# directory; confirm where the tokenizer file is expected to live.
TOKENIZER = 'dna_bpe_tokenizer_offset.json'

# --- Run parameters ----------------------------------------------------------
SEED = 42
LEN_CUTOFF = 100_000   # sequences must be strictly shorter than this to be kept
VOCAB_SIZE = 4096
NUM_SEQUENCES = 10     #54646
MAX_TOKEN_LENGTH = 32
SPECIAL_TOKENS = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

# Seed Python's global `random` generator so random draws are repeatable.
random.seed(SEED)

In [None]:
def read_fasta_to_dataframe(file_path):
    """Parse a FASTA file into a DataFrame with one row per record.

    Args:
        file_path: Location of the FASTA file, handed to ``SeqIO.parse``.

    Returns:
        pandas.DataFrame with columns ``ID``, ``Sequence`` (the record's
        sequence converted to a plain ``str``) and ``Description``, with rows
        in the order ``SeqIO.parse`` yields the records.
    """
    rows = [
        [rec.id, str(rec.seq), rec.description]
        for rec in SeqIO.parse(file_path, "fasta")
    ]
    return pd.DataFrame(rows, columns=['ID', 'Sequence', 'Description'])


def preprocess_dna_seqeunce(seq, letter_to_bases=None):
    """Normalize a DNA string and resolve ambiguity symbols to concrete bases.

    The sequence is upper-cased, all whitespace is removed, and every
    remaining symbol is replaced by one base picked uniformly at random from
    that symbol's entry in the lookup table (a single-base entry such as
    ``"A": "A"`` therefore maps the symbol to itself).

    Args:
        seq: Raw nucleotide string; may contain lowercase letters and
            whitespace.
        letter_to_bases: Optional mapping from symbol to a string of candidate
            bases. When omitted, the module-level ``LETTER_TO_BASES`` is used.

    Returns:
        The cleaned sequence as a ``str`` with one output base per
        non-whitespace input symbol.

    Raises:
        KeyError: If a symbol has no entry in the lookup table.
    """
    table = LETTER_TO_BASES if letter_to_bases is None else letter_to_bases

    # str.split() with no argument splits on runs of any whitespace, so joining
    # the pieces removes all of it -- not only space, newline, CR and tab.
    cleaned_seq = ''.join(seq.upper().split())

    # One draw is made for every symbol, including unambiguous ones, so the
    # sequence of draws from the global `random` generator is unchanged for a
    # given seed. Joining a generator avoids repeated string concatenation.
    return ''.join(random.choice(table[letter]) for letter in cleaned_seq)


# Complement of each nucleotide symbol, including the IUPAC ambiguity codes,
# in both upper and lower case. Built once instead of on every call.
_COMPLEMENT_TABLE = str.maketrans(
    "ACGTRYKMBDHVSWN" "acgtrykmbdhvswn",
    "TGCAYRMKVHDBSWN" "tgcayrmkvhdbswn",
)


def reverse_compliment(dna):
    """Return the reverse complement of a nucleotide string.

    Each symbol is swapped for its complement (A<->T, C<->G, and the IUPAC
    ambiguity pairs R<->Y, K<->M, B<->V, D<->H; S, W and N are their own
    complements), then the whole string is reversed. Letter case is kept, and
    any character without a table entry is passed through unchanged.

    Args:
        dna: Nucleotide string.

    Returns:
        The complemented string in reverse order; ``""`` for empty input.
    """
    return dna.translate(_COMPLEMENT_TABLE)[::-1]

In [None]:
# Read plasmid dataset and splits
dataset = read_fasta_to_dataframe(DATASET)
# dataset['Sequence'] = dataset['Sequence'].apply(preprocess_dna_seqeunce)

# Select the finetune dataset: records whose ID is listed in the splits file
# as split == 'train' with finetune == 1
data_splits = pd.read_csv(DATA_SPLITS)
finetune_split = data_splits[(data_splits['split'] == 'train') & (data_splits['finetune'] == 1)]
finetune_dataset = dataset[dataset['ID'].isin(finetune_split['id'])]

# Select the cutoff dataset, whose length is less than LEN_CUTOFF.
# The explicit .copy() makes this an independent frame, so assigning to its
# 'Sequence' column below does not write into a slice of `dataset`
# (which is what triggers pandas' SettingWithCopyWarning).
cutoff_dataset = dataset[dataset['Sequence'].str.len() < LEN_CUTOFF].copy()
cutoff_dataset['Sequence'] = cutoff_dataset['Sequence'].apply(preprocess_dna_seqeunce)

# Add reverse compliment to the dataset: same rows, 'Sequence' replaced by its
# reverse complement, stacked underneath the forward sequences
reverse_compliment_dataset = cutoff_dataset.assign(
    Sequence=cutoff_dataset['Sequence'].apply(reverse_compliment)
)
cutoff_dataset_with_reverse_compliment = pd.concat([cutoff_dataset, reverse_compliment_dataset], axis=0)

# Save dataset to txt for tokenizer (one sequence per line, no index or header)
# NOTE(review): `dataset` is written exactly as parsed -- the preprocessing line
# near the top of this cell is commented out -- so DATASET_TXT is not cleaned
# the way the cutoff file is. Confirm this is intended.
dataset['Sequence'].to_csv(DATASET_TXT, index=False, header=False)
# cutoff_dataset['Sequence'].to_csv(DATASET_CUTOFF, index=False, header=False)
# finetune_dataset['Sequence'].to_csv(DATASET_FINETUNE, index=False, header=False)
# dataset['Sequence'].iloc[:NUM_SEQUENCES].to_csv(DATASET_DUMMY, index=False, header=False)
cutoff_dataset_with_reverse_compliment['Sequence'].to_csv(DATASET_CUTOFF_RC, index=False, header=False)

In [None]:
# Load the pre-trained fast tokenizer from its serialized tokenizer file.
# NOTE(review): the file name suggests a BPE tokenizer rather than
# SentencePiece -- confirm which one this file actually holds.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER)

In [None]:
# Test the tokenizer on a sample DNA sequence
sequence = "ATTCTGCGGTTCCCCCTGGAAGACCTACGCAAGTTGGGCCAGCTCAGAGGTGGAATCAACGAAGGCGAGC"
encoded = tokenizer(sequence)
print("Encoded sequence:", encoded)

# Decode the tokens back to the original sequence
decoded_sequence = tokenizer.decode(encoded['input_ids'])
print("Decoded sequence:", decoded_sequence.upper())

# Analyze the vocabulary: collect the set of characters used by the tokens.
# The first five entries of the reverse-sorted vocabulary are skipped.
# NOTE(review): presumably those five are the special tokens; that only holds
# if they sort after every other token -- verify against the actual vocab.
# The set is computed outside the f-string because reusing the same quote
# character inside an f-string is a SyntaxError before Python 3.12.
alphabet = set(''.join(sorted(tokenizer.vocab.keys(), reverse=True)[5:]))
print(f'Alphabete: {alphabet}')

In [None]:
# Display every vocabulary token in sorted order as the cell's output.
sorted(tokenizer.vocab)