In [1]:
import os
import os.path as osp
import numpy as np
import pandas as pd
from tqdm import tqdm
import json

import Bio
from Bio import SeqIO
from Bio import pairwise2
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
def load_lib(jsonlf, min_length=27000):
    """Build a metadata table of complete genomes from a JSON-lines report.

    Parameters
    ----------
    jsonlf : iterable of str
        One JSON document per item (e.g. the lines of a ``.jsonl`` file).
        Blank lines are skipped.
    min_length : int, optional
        Only records whose ``length`` is strictly greater than this value
        are kept. Defaults to 27000.

    Returns
    -------
    pandas.DataFrame
        Columns ``accession``, ``length``, ``location`` and ``releaseDate``
        for every record with ``completeness == 'COMPLETE'`` that passes the
        length filter. Rows containing missing values are dropped. Rows are
        ordered by ``length`` (longest first), ties broken by ``releaseDate``
        (ascending). The index is not reset.
    """
    columns = ['accession', 'length', 'location', 'releaseDate']
    records = []
    for line in tqdm(jsonlf, position=0, leave=True, desc='reading lib file'):
        # A stray empty line would otherwise make json.loads raise.
        if not line.strip():
            continue
        info = json.loads(line)
        if info['completeness']=='COMPLETE' and info['length']>min_length:
            # Only a missing / non-mapping location falls back to 'Unknown';
            # any other error is allowed to surface instead of being hidden.
            try:
                location = info['location']['geographicLocation']
            except (KeyError, TypeError):
                location = 'Unknown'
            records.append({
                'accession': info['accession'],
                'length': info['length'],
                'location': location,
                'releaseDate': info['releaseDate'],
            })
    # Passing the column list keeps the expected schema even with no records.
    libdf = pd.DataFrame(records, columns=columns).dropna()
    # One multi-key sort: the previous two consecutive sorts lost the
    # releaseDate ordering because the second sort was not stable.
    libdf = libdf.sort_values(by=['length', 'releaseDate'], ascending=[False, True])

    return libdf


def load_seq(fastapath, mode):
    """Read a sequence file into a dict keyed by record id.

    Parameters
    ----------
    fastapath : path or handle
        Passed straight to ``SeqIO.parse``.
    mode : str
        Format name passed to ``SeqIO.parse`` (e.g. ``'fasta'``).

    Returns
    -------
    dict
        Maps each record's ``id`` to its sequence with every ``'N'``
        character removed. A later record with the same id replaces an
        earlier one.
    """
    records = SeqIO.parse(fastapath,mode)
    progress = tqdm(records, position=0, leave=True, desc='reading fasta file')
    return {rec.id: rec.seq.replace('N','') for rec in progress}

In [3]:
# Input locations, relative to the notebook's working directory:
# genomic_path is read by load_seq as FASTA, report_path by load_lib as JSON lines.
genomic_path = './../ncbi_dataset/data/genomic.fna'
report_path = './../ncbi_dataset/data/data_report.jsonl'

In [4]:
%%time
jsonlf = open(report_path,'r').readlines()
lib = load_lib(jsonlf)

reading lib file: 100%|██████████| 4542920/4542920 [08:16<00:00, 9147.72it/s] 


CPU times: user 8min 37s, sys: 1min 7s, total: 9min 45s
Wall time: 14min 35s


In [6]:
%%time
# Parse the genomic FASTA into {record id: sequence with 'N' removed}.
sequences = load_seq(genomic_path, 'fasta')

reading fasta file: 4542920it [22:43, 3330.63it/s]

CPU times: user 10min 38s, sys: 3min 7s, total: 13min 46s
Wall time: 22min 47s





In [7]:
print(lib.shape)

# Accessions that passed the load_lib filters, in lib's row order.
acclist = lib.accession.tolist()
maxlen = lib.length.max()
print(maxlen)

# idxmax gives the label of the first row holding the maximum length, so a tie
# between several genomes no longer makes the lookup raise (Series.item()
# requires exactly one matching row).
maxlen_id = lib.loc[lib.length.idxmax(), 'accession']
maxlen_seq = str(sequences[maxlen_id])
print(maxlen_id)

# Sanity check: the reference accession must be part of the accession list.
print(maxlen_id in acclist)

(1001653, 4)
30119
MT844089.1
True


In [8]:
def write_lib_processed(proclib_path, lib):
    """Write the metadata table to CSV without the index column.

    Parameters
    ----------
    proclib_path : str, os.PathLike or file-like
        Destination passed to ``DataFrame.to_csv``. When it is a filesystem
        path, missing parent directories are created first.
    lib : pandas.DataFrame
        Table to write.
    """
    # to_csv refuses to write into a directory that does not exist yet.
    # File-like targets are passed through untouched.
    if isinstance(proclib_path, (str, os.PathLike)):
        os.makedirs(osp.dirname(osp.abspath(proclib_path)), exist_ok=True)
    lib.to_csv(proclib_path, index=False)
    

def align_seq(acclist, maxlen_id, maxlen_seq, sequences,
              match=2, mismatch=-1, gap_open=-10, gap_extend=-0.5):
    """Globally align every listed sequence against a reference sequence.

    Parameters
    ----------
    acclist : list
        Accession ids to align. The list is not modified.
    maxlen_id : hashable
        Id of the reference sequence. It is skipped in ``acclist`` and
        emitted first in the result.
    maxlen_seq : str
        Reference sequence.
    sequences : mapping
        Maps accession id to its sequence.
    match, mismatch, gap_open, gap_extend : number, optional
        Scores forwarded to ``pairwise2.align.globalms``. The defaults are
        the values that were previously hard-coded.

    Returns
    -------
    list of SeqRecord
        The reference record followed by one record per remaining accession,
        holding the ``seqB`` of the first alignment returned for it.
    """
    processed_seqs = [
        SeqRecord(Seq(maxlen_seq), id=maxlen_id)
    ]
    # Build a fresh list rather than aliasing acclist and calling .remove():
    # that mutated the caller's list, so a second call raised ValueError and
    # later users of acclist silently lost the reference accession.
    alignlist = [accid for accid in acclist if accid != maxlen_id]
    for accid in tqdm(alignlist, position=0, leave=True, desc='align seqs'):
        # Hand pairwise2 a plain str so both operands have the same type.
        seq2 = str(sequences[accid])
        alignments = pairwise2.align.globalms(
            maxlen_seq, seq2, match, mismatch, gap_open, gap_extend,
            one_alignment_only=True,
        )
        seqB = alignments[0].seqB
        processed_seqs.append(SeqRecord(Seq(seqB), id=accid))

    return processed_seqs
    
def write_seq_processed(procseq_path, processed_seqs):
    """Write sequence records to a FASTA file.

    Parameters
    ----------
    procseq_path : str, os.PathLike or file-like
        Destination passed to ``SeqIO.write``. When it is a filesystem path,
        missing parent directories are created first.
    processed_seqs : iterable of SeqRecord
        Records to write, in order.
    """
    # Create the output directory up front so the write does not fail on a
    # fresh checkout. File-like targets are passed through untouched.
    if isinstance(procseq_path, (str, os.PathLike)):
        os.makedirs(osp.dirname(osp.abspath(procseq_path)), exist_ok=True)
    SeqIO.write(processed_seqs, procseq_path, 'fasta')

In [9]:
# Output locations: procseq_path receives the FASTA records,
# proclib_path the metadata table as CSV.
procseq_path = './processed_dataset/processed_seq.fasta'
proclib_path = './processed_dataset/processed_lib.csv'

In [10]:
%%time
# Persist the filtered metadata table as CSV (index column omitted).
write_lib_processed(proclib_path, lib)

CPU times: user 1.98 s, sys: 73.5 ms, total: 2.06 s
Wall time: 2.11 s


In [None]:
%%time
# Globally align every accession in acclist against the longest genome
# (maxlen_id / maxlen_seq); the result is a list of SeqRecord objects with
# the reference record first.
processed_seqs = align_seq(acclist, maxlen_id, maxlen_seq, sequences)

align seqs:   0%|          | 23/1001652 [37:42<27007:16:09, 97.07s/it] 

In [22]:
%%time
write_seq_processed(procseq_path, acclist, sequences)

writing seqs to file: 100%|██████████| 1001653/1001653 [00:45<00:00, 22037.11it/s]

CPU times: user 12.8 s, sys: 31.2 s, total: 44.1 s
Wall time: 45.5 s



