In [1]:
import os
import os.path as osp
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import json
import copy

import Bio
from Bio import SeqIO
from Bio import AlignIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Align import substitution_matrices

import matplotlib.pyplot as plt

In [15]:
def load_lib(jsonlf, min_length=27000):
    """Build a metadata table from a JSON-lines report.

    Each non-blank line of ``jsonlf`` is decoded with ``json.loads``. A record
    is kept only when its ``completeness`` is ``'COMPLETE'`` and its ``length``
    is strictly greater than ``min_length``.

    Parameters
    ----------
    jsonlf : iterable of str
        Lines of the report (for example the result of ``readlines()`` or an
        open text file). Blank / whitespace-only lines are skipped.
    min_length : int, optional
        Exclusive lower bound on ``length``. Defaults to 27000, the threshold
        this function has always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``accession``, ``length``, ``location``, ``releaseDate``.
        Rows containing missing values are dropped. Rows are ordered by
        ``length`` descending, ties broken by ``releaseDate`` ascending.
        The original row labels are kept (the index is not reset).

    Raises
    ------
    KeyError
        If a record lacks ``completeness``, or a record that passes the
        completeness check lacks ``length``, ``accession`` or ``releaseDate``.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = {
        'accession': [],
        'length': [],
        'location': [],
        'releaseDate': [],
    }
    for line in tqdm(jsonlf, position=0, leave=True, desc='reading lib file'):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        info = json.loads(line)
        if info['completeness'] == 'COMPLETE' and info['length'] > min_length:
            # Only the location is optional. Look it up explicitly instead of
            # hiding every possible error behind a bare ``except``.
            location = info.get('location')
            if isinstance(location, dict) and 'geographicLocation' in location:
                data['location'].append(location['geographicLocation'])
            else:
                data['location'].append('Unknown')
            data['accession'].append(info['accession'])
            data['length'].append(info['length'])
            data['releaseDate'].append(info['releaseDate'])

    libdf = pd.DataFrame(data=data).dropna()
    # A single multi-key sort gives a deterministic order. Sorting by
    # releaseDate and then re-sorting by length alone does not reliably keep
    # the date order among equal lengths, because the default sort algorithm
    # is not stable.
    libdf = libdf.sort_values(by=['length', 'releaseDate'], ascending=[False, True])

    return libdf


def load_seq(fastapath, mode):
    """Read a sequence file into a dict mapping record id -> sequence.

    ``fastapath`` and ``mode`` are passed straight to ``SeqIO.parse``. Every
    upper-case ``'N'`` is removed from each sequence before it is stored. If
    the same record id appears more than once, the last occurrence wins.
    """
    records = SeqIO.parse(fastapath, mode)
    progress = tqdm(records, position=0, leave=True, desc='reading fasta file')
    return {record.id: record.seq.replace('N', '') for record in progress}

In [3]:
# Input files, given relative to the notebook's working directory:
#   genomic_path - sequence file parsed by load_seq (read in 'fasta' mode below)
#   report_path  - JSON-lines report, one record per line, parsed by load_lib
genomic_path = './../ncbi_dataset/data/genomic.fna'
report_path = './../ncbi_dataset/data/data_report.jsonl'

In [12]:
%%time
jsonlf = open(report_path,'r').readlines()
lib = load_lib(jsonlf)
acclist = lib.accession.tolist()

reading lib file: 100%|██████████| 4542920/4542920 [08:47<00:00, 8613.62it/s]


CPU times: user 8min 57s, sys: 1min 35s, total: 10min 33s
Wall time: 15min 7s


In [16]:
%%time
# Parse the genomic file into a dict of sequences keyed by record id
# (load_seq strips every 'N' from each sequence).
sequences = load_seq(genomic_path, 'fasta')

reading fasta file: 4542920it [22:01, 3437.22it/s]

CPU times: user 10min 36s, sys: 2min 41s, total: 13min 17s
Wall time: 22min 1s





In [17]:
# Sanity check: rows kept by the metadata filter vs. sequences parsed from the FASTA.
print(lib.shape)
# len() on the dict gives the same count without building a list of every key.
print(len(sequences))

(1001653, 4)
4542920


In [18]:
def write_lib_processed(proclib_path, lib):
    """Write the metadata table ``lib`` to ``proclib_path`` as CSV.

    The DataFrame index is not written. When ``proclib_path`` is a filesystem
    path, its parent directory is created first if it does not exist, so the
    write does not fail on a fresh checkout.

    Parameters
    ----------
    proclib_path : str or os.PathLike (or anything ``DataFrame.to_csv`` accepts)
        Destination of the CSV file.
    lib : pandas.DataFrame
        Table to save.
    """
    if isinstance(proclib_path, (str, os.PathLike)):
        out_dir = os.path.dirname(proclib_path)
        # dirname is '' for a bare file name; makedirs('') would raise.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    lib.to_csv(proclib_path, index=False)

    
def write_seq_processed(procseq_path, acclist, sequences):
    """Write the selected sequences to ``procseq_path`` in FASTA layout.

    For each accession in ``acclist``, in order, two lines are written:
    ``>`` followed by the accession, then ``str(sequences[accession])``.
    When ``procseq_path`` is a filesystem path, its parent directory is
    created first if it does not exist. An existing file is overwritten.

    Parameters
    ----------
    procseq_path : str or os.PathLike
        Destination file.
    acclist : iterable of str
        Accessions to write, in output order.
    sequences : mapping
        Maps accession -> sequence (any object whose ``str()`` is the sequence).

    Raises
    ------
    KeyError
        If an accession in ``acclist`` is missing from ``sequences``.
    """
    if isinstance(procseq_path, (str, os.PathLike)):
        out_dir = os.path.dirname(procseq_path)
        # dirname is '' for a bare file name; makedirs('') would raise.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    with open(procseq_path, 'w') as f:
        for accid in tqdm(acclist, position=0, leave=True, desc='writing seqs to file'):
            content = '>' + accid + '\n' + str(sequences[accid]) + '\n'
            f.write(content)

In [19]:
# Output files for the processed data, relative to the notebook's working directory:
#   procseq_path - sequences written by write_seq_processed
#   proclib_path - metadata table written as CSV by write_lib_processed
procseq_path = './processed_dataset/processed_seq.fasta'
proclib_path = './processed_dataset/processed_lib.csv'

In [20]:
%%time
# Save the filtered metadata table as CSV at proclib_path.
write_lib_processed(proclib_path, lib)

CPU times: user 2.37 s, sys: 103 ms, total: 2.48 s
Wall time: 2.92 s


In [22]:
%%time
# Write one record per accession in acclist, in acclist order, to procseq_path.
write_seq_processed(procseq_path, acclist, sequences)

writing seqs to file: 100%|██████████| 1001653/1001653 [00:45<00:00, 22037.11it/s]

CPU times: user 12.8 s, sys: 31.2 s, total: 44.1 s
Wall time: 45.5 s



