In [15]:
import os
import os.path as osp
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

import Bio
from Bio import SeqIO
from Bio import pairwise2
from Bio.Align import substitution_matrices

In [2]:
def read_fasta(filepath, mode):
    """Parse a sequence file into a mapping of record id -> sequence.

    Args:
        filepath: Location of the sequence file; handed to ``SeqIO.parse``.
        mode: Format name handed to ``SeqIO.parse`` (the notebook uses
            ``'fasta'``).

    Returns:
        dict keyed by each record's ``id`` and holding that record's ``seq``.
        When an id occurs more than once, the last record read wins.
    """
    records = SeqIO.parse(filepath, mode)
    # Wrap the record iterator so a progress bar is shown while parsing.
    progress = tqdm(records, position=0, leave=True, desc='reading fasta file')
    return {record.id: record.seq for record in progress}

def read_lib(filepath):
    """Load the sequence metadata CSV, keeping only complete, dated rows.

    Args:
        filepath: CSV source; passed straight to ``pd.read_csv``.

    Returns:
        DataFrame with the columns 'Nucleotide Accession', 'Nucleotide Length',
        'Geo Location' and 'Collection Date', in that order. 'Collection Date'
        is rewritten as a ``YYYY/MM/DD`` string; values that cannot be parsed
        as dates become missing. Rows are sorted by that string (zero-padded,
        so the order is chronological) and any row with a missing value in
        one of the four columns is dropped. The original row index is kept.

    Raises:
        ValueError: from ``pd.read_csv`` if one of the four columns is absent.
    """
    columns = ['Nucleotide Accession','Nucleotide Length','Geo Location','Collection Date']
    # Parse only the columns that are used instead of loading the whole table
    # and discarding the rest afterwards.
    raw = pd.read_csv(filepath, usecols=columns, low_memory=False)
    # errors='coerce' turns unparseable dates into NaT, which strftime leaves
    # as missing so that dropna() below removes those rows.
    collection_dates = pd.to_datetime(raw['Collection Date'], errors='coerce').dt.strftime("%Y/%m/%d")
    return (
        raw[columns]  # usecols keeps file order; restore the documented order
        .assign(**{'Collection Date': collection_dates})
        .sort_values(by=['Collection Date'])
        .dropna()
    )

In [3]:
%%time
# Parse the whole genomic FASTA into memory as a dict of record id -> sequence.
# The recorded run below read 943,070 records in roughly 9 minutes.
fastapath = './../datasets/ncbi_dataset/data/genomic.fna'
sequences = read_fasta(fastapath,'fasta')

reading fasta file: 943070it [08:59, 1749.10it/s]

Wall time: 8min 59s





In [4]:
%%time
# Load the metadata CSV through read_lib, which keeps four columns
# (accession, length, geo location, collection date) and sorts by date.
libpath = './../datasets/ncbi_datasets.csv'
lib = read_lib(libpath)

Wall time: 15.1 s


In [5]:
# Sanity check: number of distinct record ids held in the sequences dict.
print(len(sequences))

943070


In [6]:
# Preview the first rows of the date-sorted metadata. Leaving the frame as the
# cell's last expression lets Jupyter render it as a table instead of the
# line-wrapped plain text that print() produces.
lib.head()

       Nucleotide Accession  Nucleotide Length               Geo Location  \
0               NC_045512.2              29903                Asia; China   
255871           MT019529.1              29899  Asia; China: Hubei, Wuhan   
255878           MN996527.1              29825         Asia; China: Wuhan   
255877           MN996528.1              29891         Asia; China: Wuhan   
255876           MN996529.1              29852         Asia; China: Wuhan   

       Collection Date  
0           2019/12/01  
255871      2019/12/23  
255878      2019/12/30  
255877      2019/12/30  
255876      2019/12/30  


In [None]:
%%time
blosum62 = substitution_matrices.load("BLOSUM62")
alignment = pairwise2.align.globalds(sequences['NC_045512.2'],sequences['MT019529.1'],blosum62,-10,-0.5)

In [None]:
# Pretty-print the first alignment in `alignment`; its fields are unpacked
# as the positional arguments of format_alignment.
print(pairwise2.format_alignment(*alignment[0]))