In [11]:
from textwrap import wrap
import pysam
import os
import pandas as pd
from tqdm import tqdm

In [12]:
def kmers_stride1(seq, k=6):
    """Split a sequence into overlapping k-mers taken with a stride of 1.

    Args:
        seq: Sliceable sequence (typically a string) to tokenize.
        k: Window length; defaults to 6.

    Returns:
        A list with every length-``k`` window of ``seq`` in left-to-right
        order. For positive ``k`` the list is empty when ``seq`` is shorter
        than ``k``.
    """
    n_windows = len(seq) - k + 1
    windows = []
    # range() of a non-positive count is empty, so short inputs yield [].
    for start in range(n_windows):
        windows.append(seq[start:start + k])
    return windows

In [13]:
# Length, in sequence characters, of each chunk sliced out of a sequence.
CHUNK_LEN = 512
# Characters shared by consecutive chunks: chunk starts advance by CHUNK_LEN - OVERLAP_BP.
OVERLAP_BP = 128
# Sequences are truncated to this many characters (after '-' removal) before chunking.
MAX_SEQ_LENGTH = 5000
# Fraction of the indexed sequences, taken from the end of the list, held out as the test split.
TEST_FRACTION = 0.1

In [14]:
# Absolute project root. Other paths in this notebook are built by plain string
# concatenation onto it, so the trailing '/' is required.
workdir = '/lustre/groups/epigenereg01/workspace/projects/vale/'

In [15]:
# FASTA file with the input sequences.
# NOTE(review): the file name suggests the records are already shuffled, which the
# positional train/test split below relies on — confirm.
input_fa = workdir + 'mlm/fasta/241_mammals.shuffled.fa'

# Indexed random-access handle used by prepare_output() to fetch sequences by name.
input_fasta = pysam.FastaFile(input_fa)

# Sequence names are the first tab-separated column of the '.fai' index.
# dtype=str / keep_default_na=False keep names such as '1' or 'NA' as plain strings
# instead of letting pandas turn them into integers or NaN, which would no longer
# match the names expected by input_fasta.fetch().
# No .squeeze() here: on a single-row column it collapses the Series to a scalar,
# which has no `.values`, so a one-sequence FASTA would crash.
input_seqs = pd.read_csv(
    input_fa + '.fai',
    sep='\t',
    header=None,
    usecols=[0],
    dtype=str,
    keep_default_na=False,
)[0].to_numpy()

# Positional split: the first (1 - TEST_FRACTION) of the names go to train, the rest to test.
N_train = int(len(input_seqs) * (1 - TEST_FRACTION))
train_seqs, test_seqs = input_seqs[:N_train], input_seqs[N_train:]

In [16]:
def prepare_output(output_txt, input_seqs):
    """Write overlapping, k-mer-tokenized chunks of the given sequences to a text file.

    For every name in ``input_seqs`` the sequence is fetched from the
    module-level ``input_fasta``, stripped of trailing whitespace and of '-'
    characters, truncated to ``MAX_SEQ_LENGTH`` and cut into windows of
    ``CHUNK_LEN`` characters whose start positions advance by
    ``CHUNK_LEN - OVERLAP_BP``. Each window is tokenized with
    ``kmers_stride1`` and written as one line of space-separated k-mers.

    Args:
        output_txt: Path of the text file to write; an existing file is overwritten.
        input_seqs: Iterable of sequence names to fetch from ``input_fasta``.

    Returns:
        None. The result is the file written at ``output_txt``.
    """
    stride = CHUNK_LEN - OVERLAP_BP
    with open(output_txt, 'w') as fout:
        for seq_name in tqdm(input_seqs):
            seq = input_fasta.fetch(seq_name).rstrip().replace('-', '')
            seq = seq[:MAX_SEQ_LENGTH]
            for start_idx in range(0, len(seq), stride):
                chunk = seq[start_idx:start_idx + CHUNK_LEN]
                # Tokenize the chunk, not the whole sequence: passing `seq` here
                # would write the full sequence once per chunk and make the
                # chunking a no-op.
                chunk_kmers = kmers_stride1(chunk)
                if not chunk_kmers:
                    # A trailing chunk shorter than one k-mer has no tokens;
                    # skip it rather than emit an empty line.
                    continue
                fout.write(' '.join(chunk_kmers) + '\n')

In [17]:
# Output directory is named after the chunking parameters. The trailing '/' is required
# because file names are appended to it by plain string concatenation.
output_dir = workdir + f'mlm/dnabert-3utr/data/chunk_{CHUNK_LEN}_overlap_{OVERLAP_BP}/'

# Create the directory and any missing parents; an already existing directory is not an error.
os.makedirs(output_dir, exist_ok=True)

In [18]:
# Tokenize the training split and write it to train.txt inside output_dir.
prepare_output(output_dir + 'train.txt', train_seqs)

  3%|▎         | 102144/3405342 [01:54<1:01:40, 892.72it/s]

KeyboardInterrupt



In [None]:
# Tokenize the held-out test split and write it to test.txt inside output_dir.
prepare_output(output_dir + 'test.txt', test_seqs)

In [None]:
# Completion marker, printed once the cells above have finished.
print('done')