In [1]:
import os
import os.path as osp
import numpy as np
import pandas as pd
from tqdm import tqdm
import json

import Bio
from Bio import SeqIO
from Bio import AlignIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

In [2]:
def load_lib(jsonlf):
    """Build a table of 'S' gene coordinates for complete genomes located in the USA.

    Parameters
    ----------
    jsonlf : iterable of str
        JSON-lines records, one JSON object per element (for example the
        list returned by ``readlines()`` on the data report file). Blank
        lines are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per kept record with the columns ``accession``, ``length``,
        ``location``, ``releaseDate``, ``pstart``, ``pend``, ``variant`` and
        ``plen`` (``pend - pstart``), sorted by ``releaseDate`` with rows
        containing missing values dropped.

    Notes
    -----
    A record is kept only when its ``completeness`` is ``'COMPLETE'``, its
    geographic location contains ``'USA'``, it carries a pangolin
    classification, and its annotation lists a gene named ``'S'``. Records
    that lack any of the fields read here are skipped as a whole, so the
    columns always stay aligned.
    """
    columns = ('accession', 'length', 'location', 'releaseDate', 'pstart', 'pend', 'variant')
    data = {name: [] for name in columns}

    for line in tqdm(jsonlf, position=0, leave=True, desc='reading lib file'):
        if not line.strip():
            # Tolerate blank / trailing empty lines in the JSONL input.
            continue
        info = json.loads(line)

        row = None
        try:
            if info.get('completeness') != 'COMPLETE':
                continue
            if not all(key in info for key in ('location', 'annotation', 'virus')):
                continue

            loc = info['location']['geographicLocation']
            var = info['virus']['pangolinClassification']
            if loc.find('USA') == -1:
                continue

            for gene in info['annotation']['genes']:
                if gene['name'] == 'S':
                    # Only the first CDS and its first nucleotide range are used.
                    prange = gene['cds'][0]['nucleotide']['range'][0]
                    # Assemble the complete row before touching `data`: a field
                    # missing halfway through must not leave the column lists
                    # with different lengths.
                    row = {
                        'accession': info['accession'],
                        'length': info['length'],
                        'location': loc,
                        'releaseDate': info['releaseDate'],
                        'pstart': int(prange['begin']),
                        'pend': int(prange['end']),
                        'variant': var,
                    }
                    break
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # Malformed or incomplete record: skip it, but let unrelated
            # errors (KeyboardInterrupt, MemoryError, ...) propagate.
            continue

        if row is None:
            # No gene named 'S' in this record.
            continue
        for name in columns:
            data[name].append(row[name])

    libdf = pd.DataFrame(data=data)
    # NOTE(review): if begin/end are 1-based inclusive coordinates, the true
    # length is pend - pstart + 1 -- confirm against the data report schema.
    libdf['plen'] = libdf['pend'] - libdf['pstart']
    libdf = libdf.sort_values(by=['releaseDate']).dropna()

    return libdf


def load_seq(fastapath, mode):
    """Read every record of a sequence file into a dict keyed by record id.

    `fastapath` and `mode` are handed unchanged to ``SeqIO.parse``. Each
    value is the record's sequence with every 'N' character removed.
    """
    records = SeqIO.parse(fastapath, mode)
    progress = tqdm(records, position=0, leave=True, desc='reading fasta file')
    # NOTE(review): removing 'N' shortens the sequence, so positions taken from
    # an external annotation may no longer line up with it -- confirm that
    # callers slicing by coordinate expect this.
    return {record.id: record.seq.replace('N', '') for record in progress}

In [3]:
# Input file locations, all relative to the directory the notebook runs in
# (one level up, then ncbi_dataset/data/).
all_genomic_path = './../ncbi_dataset/data/genomic.fna'
complete_genomic_path = './../ncbi_dataset/data/complete_genome.fasta'
protein_path = './../ncbi_dataset/data/protein.faa'
# JSON-lines data report; read line by line and parsed by load_lib.
libpath = './../ncbi_dataset/data/data_report.jsonl'

In [4]:
%%time
jsonlf = open(libpath,'r').readlines()

reading lib file: 100%|██████████| 4542920/4542920 [08:16<00:00, 9147.72it/s] 


CPU times: user 8min 37s, sys: 1min 7s, total: 9min 45s
Wall time: 14min 35s


In [None]:
%%time
# Parse the raw report lines into the per-genome table used by the cells below.
lib = load_lib(jsonlf)

In [6]:
%%time
sequences = load_seq(genomic_path, 'fasta')

reading fasta file: 4542920it [22:43, 3330.63it/s]

CPU times: user 10min 38s, sys: 3min 7s, total: 13min 46s
Wall time: 22min 47s





In [None]:
# Sanity check: (rows, columns) of the table returned by load_lib.
print(lib.shape)

In [None]:
# Every distinct value of the `variant` column (the pangolin classification
# collected by load_lib).
allvar = lib.variant.unique().tolist()

# One sub-table per named variant, selected by exact match on `variant`.
alpha = lib.loc[lib.variant.eq('B.1.1.7')]
beta = lib.loc[lib.variant.eq('B.1.351')]
gamma = lib.loc[lib.variant.eq('P.1')]
delta = lib.loc[lib.variant.isin(['B.1.617.2', 'XD', 'XF', 'XS'])]
omicron = lib.loc[lib.variant.isin(['B.1.1.529', 'BA.1', 'BA.2', 'BA.3', 'BA.4', 'BA.5'])]

# Report how many genomes fall into each group.
for label, frame in (
    ('alpha', alpha),
    ('beta', beta),
    ('gamma', gamma),
    ('delta', delta),
    ('omicron', omicron),
):
    print(label, frame.shape)

In [None]:
# Take the first and last rows of `beta` (lib is sorted by releaseDate in
# load_lib, so these are its earliest and latest released entries).
data1 = beta.head(1)
data2 = beta.tail(1)
# Cut the annotated 'S' gene span out of each genome. `.item()` requires the
# one-row frames above to be non-empty; it raises if `beta` has no rows.
# NOTE(review): the slice treats pstart/pend as 0-based, end-exclusive. If the
# annotation uses 1-based inclusive coordinates the first base is dropped
# (the slice would need pstart - 1) -- confirm against the data report schema.
# NOTE(review): load_seq strips every 'N' from the sequences, which shifts
# positions relative to these annotation coordinates -- confirm this is intended.
seq1 = str(sequences[data1.accession.item()])[data1.pstart.item():data1.pend.item()]
seq2 = str(sequences[data2.accession.item()])[data2.pstart.item():data2.pend.item()]

In [None]:
%%time
# Global pairwise alignment of the two extracted spans. Positional scores for
# globalms are (match, mismatch, gap open, gap extend): +2 per match, -1 per
# mismatch, -10 to open a gap, -0.5 to extend one. Only a single alignment is
# requested via one_alignment_only=True.
alignment = pairwise2.align.globalms(seq1, seq2, 2, -1, -10, -0.5, one_alignment_only=True)

In [None]:
# Dump the two aligned strings of the first alignment to test.txt, one per
# line (no trailing newline after the second).
with open('test.txt', 'w') as out_file:
    out_file.writelines([alignment[0].seqA, '\n', alignment[0].seqB])