In [1]:
import os
import os.path as osp
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import json

import Bio
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Align import substitution_matrices

In [12]:
def read_fasta(filepath, mode):
    sequences = {}
    for seq_record in tqdm(SeqIO.parse(filepath, mode),position=0,leave=True,desc='reading fasta file'):
        sequences[seq_record.id] = seq_record.seq
    return sequences

def read_lib_csv(filepath):
    libdf = pd.read_csv(filepath, low_memory=False)
    libdf = libdf[['Nucleotide Accession','Nucleotide Length','Geo Location','Collection Date']]
    libdf['Collection Date'] = pd.to_datetime(libdf['Collection Date'], errors='coerce').dt.strftime("%Y/%m/%d")
    libdf = libdf.sort_values(by=['Collection Date']).dropna()
    return libdf

def read_lib_jsonl(jsonlf):
    data = {
        'accession': [],
        'length': [],
        'location': [],
        'releaseDate': [],
    }
    for line in tqdm(jsonlf):
        info = json.loads(line)
        if info['completeness']=='COMPLETE':
            try:
                data['location'].append(info['location']['geographicLocation'])
            except:
                data['location'].append('Unknown')
            data['accession'].append(info['accession'])
            data['length'].append(info['length'])
#             data['location'].append(info['location'])
            data['releaseDate'].append(info['releaseDate'])
    libdf = pd.DataFrame(data=data)
#     libdf['releaseDate'] = pd.to_datetime(libdf['releaseDate'],errors='coerce').dt.strftime("%Y/%m/%d")
    libdf = libdf.sort_values(by=['releaseDate']).dropna()
    return libdf

In [3]:
%%time
fastapath = './../ncbi_dataset/data/genomic.fna'
sequences = read_fasta(fastapath,'fasta')

reading fasta file: 4542920it [21:24, 3537.21it/s]

CPU times: user 9min 40s, sys: 2min 16s, total: 11min 57s
Wall time: 21min 24s





In [4]:
%%time
libpath = './../ncbi_dataset/data/data_report.jsonl'
jsonlf = open(libpath, 'r').readlines()

CPU times: user 36.8 s, sys: 1min 8s, total: 1min 45s
Wall time: 9min 3s


In [13]:
lib = read_lib_jsonl(jsonlf)

100%|██████████| 4542920/4542920 [08:28<00:00, 8938.01it/s]


In [14]:
print(len(sequences))

4542920


In [15]:
print(lib.head())

          accession  length         location releaseDate
434948   MN908947.3   29903            China  2020-01-12
591334  NC_045512.2   29903            China  2020-01-13
662517   MN985325.1   29882              USA  2020-01-24
355160   MN938384.1   29838  China: Shenzhen  2020-01-24
9135     MN975262.1   29891            China  2020-01-24


In [20]:
from collections import Counter

print(Counter(lib['length']))
print(len(lib['releaseDate'].unique()))

Counter({29903: 80663, 29884: 77217, 29890: 61079, 29782: 28857, 29795: 22207, 29873: 21248, 29763: 20442, 29787: 18848, 29747: 18660, 29770: 18509, 29812: 18276, 29769: 16789, 29816: 15368, 29896: 13073, 29793: 12752, 29810: 11637, 29799: 10596, 29792: 10164, 29786: 9567, 29752: 9170, 29811: 9116, 29809: 8978, 29813: 8409, 29800: 7872, 29717: 7784, 29767: 6848, 29844: 6559, 29867: 6537, 29864: 6448, 29885: 6415, 29764: 6415, 29779: 6252, 29882: 6056, 29827: 6031, 29781: 5913, 29817: 5864, 29886: 5803, 29808: 5747, 29871: 5464, 29794: 5370, 29772: 5207, 29802: 5111, 29778: 4962, 29836: 4913, 29815: 4884, 29828: 4738, 29766: 4629, 29750: 4533, 29785: 4412, 29819: 4239, 29803: 4216, 29788: 4198, 29806: 4195, 29716: 4186, 29854: 4116, 29653: 4083, 29783: 4035, 29798: 4029, 29823: 4010, 29807: 3901, 29814: 3889, 29777: 3887, 29789: 3831, 29791: 3751, 29796: 3591, 29654: 3591, 29804: 3564, 29801: 3539, 29888: 3525, 29684: 3478, 29686: 3406, 29805: 3364, 29897: 3346, 29790: 3323, 29880: 3299

In [24]:
seq1 = sequences['MN908947.3']
seq2 = sequences['MN985325.1']

In [25]:
%%time
alignments = pairwise2.align.globalxx(seq1, seq2)

CPU times: user 1min 34s, sys: 22.2 s, total: 1min 57s
Wall time: 1min 57s


In [26]:
print(pairwise2.format_alignment(*alignments[0]))

ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCT