In [5]:
import gzip, os, sys
import argparse
import pandas as pd
from Bio import SeqIO
from _include import log

In [4]:
ass_f_dir='features/'
ass_s_dir='sequences/'
delim=';'
match_str='16S ribosomal RNA.*'
len_min=500
len_max=2000 
overwrite=False

In [16]:

def add_processed_assembly_ids(
		assembly_df,
		fasta_filename,
		fasta_delim=';',
		overwrite=False
	):
	processed_assembly_ids = set([])
	last_assid = ''
	start_index = 0
	if os.path.isfile(fasta_filename) and not overwrite:
		log('* output '+fasta_filename+' already exists: ', end='')
		log('appending to file...', end='')
		with open(fasta_filename, 'r') as fasta_out:
			n_processed_ids = 0
			for record in SeqIO.parse(fasta_out, 'fasta'):
				last_assid = record.id.strip().split(fasta_delim)[0]
				processed_assembly_ids |= set([last_assid])
				n_processed_ids += 1
		try:
			last_checked_index = assembly_df['#assembly_accession'] == last_assid
			start_index = assembly_df.index[last_checked_index].tolist()[0]                    
		except:
			start_index = 0
		log('OK. Added % 5d sequences.' % (n_processed_ids))
	return (processed_assembly_ids, start_index)

def load_gff_feature_table(filename):
	columns = {
		'assembly': str,
		'genomic_accession':str,
		'start': int,
		'end': int, 
		'strand': str,
		'name': str,
		'attributes': str
	}
	return (
		pd.read_csv(
			filename,
			sep='\t',
			compression='gzip',
			usecols=list(columns.keys()),
			dtype=columns,
			keep_default_na=False,
		)					
	)

def build_file_path(ftp_path, base_path, type):
	suffix = {'features': 'feature_table.txt', 'sequences': 'genomic.fna'}
	return os.path.join(
		base_path,
 		os.path.basename(ftp_path) + '_' + suffix[type] + '.gz',
	)

In [6]:
ass_sub = '../data/archaea_assembly_summary_filter_2023-07-13.txt'

In [40]:
fa_out_name = '../test.fa'

assembly_df = pd.read_csv(
	ass_sub,
	sep='\t',
	usecols=['#assembly_accession', 'ftp_path'],
	dtype={'#assembly_accession': str, 'ftp_path': str}
)

processed_assembly_ids, start_index = add_processed_assembly_ids(assembly_df, fa_out_name)
assembly_df = assembly_df[~assembly_df['#assembly_accession'].isin(processed_assembly_ids)]
assembly_df


Unnamed: 0,#assembly_accession,ftp_path
0,GCF_009428885.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
1,GCF_009729015.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
2,GCF_003201835.2,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
3,GCF_000213215.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
4,GCF_009729545.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
...,...,...
1455,GCF_001316265.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
1456,GCF_001316285.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
1457,GCF_001748385.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
1458,GCF_902786095.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...


In [172]:
assembly_df.loc[3]

#assembly_accession                                      GCF_000213215.1
ftp_path               https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
features_path          ../data/features/GCF_000213215.1_ASM21321v1_fe...
sequences_path         ../data/sequences/GCF_000213215.1_ASM21321v1_g...
features_exist                                                      True
sequences_exist                                                     True
Name: 3, dtype: object

In [36]:
def build_paths(ftp_path, base_paths):
	path_prefix = os.path.basename(ftp_path)
	features_paths = os.path.join(base_paths['features'], path_prefix + '_feature_table.txt.gz')
	sequences_paths = os.path.join(base_paths['sequences'], path_prefix + '_genomic.fna.gz')
	return pd.Series(
		[features_paths, sequences_paths],
		index=['features_path', 'sequences_path']
	)

In [106]:
assembly_df[['features_path', 'sequences_path']] = assembly_df['ftp_path'].apply(
    build_paths,
    base_paths = {
        'features': '../data/'+ass_f_dir,
        'sequences': '../data/'+ass_s_dir,
    }
)

In [47]:
def file_exists(filename):
	return os.path.exists(filename) and os.path.getsize(filename) > 0

def filter_missing_files(assembly_df):
	assembly_df['features_exist'] = assembly_df['features_path'].apply(file_exists)
	assembly_df['sequences_exist'] = assembly_df['sequences_path'].apply(file_exists)
	return (
		assembly_df
		.query('features_exist == True & sequences_exist == True')
		.drop(columns=['features_exist', 'sequences_exist'])
	)

In [48]:
filter_missing_files(assembly_df)

Unnamed: 0,#assembly_accession,ftp_path,features_path,sequences_path
0,GCF_009428885.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_009428885.1_ASM942888v1_f...,../data/sequences/GCF_009428885.1_ASM942888v1_...
1,GCF_009729015.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_009729015.1_ASM972901v1_f...,../data/sequences/GCF_009729015.1_ASM972901v1_...
2,GCF_003201835.2,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_003201835.2_ASM320183v2_f...,../data/sequences/GCF_003201835.2_ASM320183v2_...
3,GCF_000213215.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_000213215.1_ASM21321v1_fe...,../data/sequences/GCF_000213215.1_ASM21321v1_g...
4,GCF_009729545.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_009729545.1_ASM972954v1_f...,../data/sequences/GCF_009729545.1_ASM972954v1_...
...,...,...,...,...
1445,GCF_014962245.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_014962245.1_ASM1496224v1_...,../data/sequences/GCF_014962245.1_ASM1496224v1...
1446,GCF_000092185.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_000092185.1_ASM9218v1_fea...,../data/sequences/GCF_000092185.1_ASM9218v1_ge...
1447,GCF_000148385.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_000148385.1_ASM14838v1_fe...,../data/sequences/GCF_000148385.1_ASM14838v1_g...
1449,GCF_001316005.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,../data/features/GCF_001316005.1_ASM131600v1_f...,../data/sequences/GCF_001316005.1_ASM131600v1_...


In [123]:
# Find all genomic_accession for all 16S rRNA

# Load GFF, find 16S records, filter by length
features_filename = assembly_df.loc[3, 'features_path']
sequences_filename = assembly_df.loc[3, 'sequences_path']

def load_16s_records(features_filename, length_thresholds={'min': 500, 'max': 2000}):
	# match_str = '16S ribosomal RNA(?!.*methyl).*'
	match_str = '16S ribosomal RNA.*'
	df = load_gff_feature_table(features_filename)
	features_df = df[df['name'].str.contains(match_str)].copy()
	features_df['length'] = features_df['end'] - features_df['start'] + 1
	features_df = features_df.query(
		f'length >= {length_thresholds["min"]} & length <= {length_thresholds["max"]}'
	)
	return features_df.drop(columns=['attributes'])

features_16s_df = load_16s_records(features_filename)
features_16s_df

Unnamed: 0,assembly,genomic_accession,start,end,strand,name,length
1739,GCF_000213215.1,NC_015518.1,761380,762036,+,16S ribosomal RNA methyltransferase A,657
3035,GCF_000213215.1,NC_015518.1,1296061,1297560,+,16S ribosomal RNA,1500


In [157]:
def create_fasta_metadata(df, delim=';'):
	return (
		'>'
		+ str(df['assembly'])
		+ delim + str(df['genomic_accession'])
		+ delim + 'loc:' + str(df['start']) + ',' + str(df['end']-1)
		+ delim + 'strand:' + df['strand']
		+ delim + 'length' + str(df['length'])
	)

In [173]:
features_16s_df

Unnamed: 0,assembly,genomic_accession,start,end,strand,name,length,metadata,sequence
1739,GCF_000213215.1,NC_015518.1,761380,762036,-,16S ribosomal RNA methyltransferase A,657,">GCF_000213215.1;NC_015518.1;loc:761380,762035...",TTATACATAATTTAATCCCATTAAATTCAATAGTTCTATAACTTGA...
3035,GCF_000213215.1,NC_015518.1,1296061,1297560,+,16S ribosomal RNA,1500,">GCF_000213215.1;NC_015518.1;loc:1296061,12975...",TTCCGGTTGATCCTGCCGGACCCGACCGCTATGGGGGTAGGGCTAA...


In [165]:
features_16s_df['metadata'] = features_16s_df.apply(create_fasta_metadata, axis=1)
features_16s_df['sequence'] = ''
features_16s_df

Unnamed: 0,assembly,genomic_accession,start,end,strand,name,length,metadata,sequence
1739,GCF_000213215.1,NC_015518.1,761380,762036,-,16S ribosomal RNA methyltransferase A,657,">GCF_000213215.1;NC_015518.1;loc:761380,762035...",
3035,GCF_000213215.1,NC_015518.1,1296061,1297560,+,16S ribosomal RNA,1500,">GCF_000213215.1;NC_015518.1;loc:1296061,12975...",


In [125]:
def build_paths(ftp_path, base_paths):
	path_prefix = os.path.basename(ftp_path)
	features_paths = os.path.join(base_paths['features'], path_prefix + '_feature_table.txt.gz')
	sequences_paths = os.path.join(base_paths['sequences'], path_prefix + '_genomic.fna.gz')
	return pd.Series(
		[features_paths, sequences_paths],
		index=['features_path', 'sequences_path']
	)

In [168]:

def extract_sequence_from_fasta_record(fasta_record, row):
	sequence = fasta_record.seq[row['start']:row['end']]
	if row['strand'] == '-':
		sequence = sequence.reverse_complement()
	return str(sequence)

def add_sequences(df, sequences_filename):
	with gzip.open(sequences_filename, 'rt') as sequences_fasta:
		for fasta_record in SeqIO.parse(sequences_fasta, 'fasta'):
			fasta_rows = df['genomic_accession'] == fasta_record.id
			df.loc[fasta_rows, 'sequence'] = df.loc[fasta_rows].apply(
				lambda row: extract_sequence_from_fasta_record(fasta_record, row),
				axis=1,
			)
	return df

add_sequences(features_16s_df, sequences_filename)

Unnamed: 0,assembly,genomic_accession,start,end,strand,name,length,metadata,sequence
1739,GCF_000213215.1,NC_015518.1,761380,762036,-,16S ribosomal RNA methyltransferase A,657,">GCF_000213215.1;NC_015518.1;loc:761380,762035...",TTATACATAATTTAATCCCATTAAATTCAATAGTTCTATAACTTGA...
3035,GCF_000213215.1,NC_015518.1,1296061,1297560,+,16S ribosomal RNA,1500,">GCF_000213215.1;NC_015518.1;loc:1296061,12975...",TTCCGGTTGATCCTGCCGGACCCGACCGCTATGGGGGTAGGGCTAA...


In [169]:
x = features_16s_df.apply(lambda row: row['metadata']+'\n'+row['sequence']+'\n', axis=1).to_list()

In [170]:
x

['>GCF_000213215.1;NC_015518.1;loc:761380,762035;strand:-;length657\nTTATACATAATTTAATCCCATTAAATTCAATAGTTCTATAACTTGACATGGTTTAAACTCCCTCACGCGTTTTTCGCTATTAGATATAAAGCCGCAATATTCTCCTACGTTTTTTAATTTCTTATTCCTAAATCTGCTAACACATTTTAAATACTCGTCTACTTTTGCATTATATGTCCTTTTCCTCATAAAAGTTACTATAATTGAATAAACCTTAGGGCTTGGTGTAAAAGCTGAAGGAGGAATAACTTCGTGAGTTTTAATATCAAAAATATAGTTAAGTAAGAAAGAAATATAAGTAGCATAATTTAATATTTTATCTATAAAATCTTTTTGTAAGATTAAAACTAAACTGATTACTTGATCCAGTTTACTTACTTCCAAGAAAAAATCTTCAGTAATAGAATAAGGTAAGGAGGAGACTATCTGACCTCTCTTTATAGGTAAAAATCTCGCATCTGCGATAATAAGGTTATAAGATTTAAGATATTTAATGAATTTATCATCAATCTCTATGCATAAATCAGGATTAATAACCTTAGAAATATTACCTTTACCGCATCCAACTTCAACGACTGGCTTTATATCATTTTTTACATAGGATGAAAATTTGAAAATAAAAAACTTATCAATCAAAAAATTTTGTGAAAGTTTCA\n',
 '>GCF_000213215.1;NC_015518.1;loc:1296061,1297559;strand:+;length1500\nTTCCGGTTGATCCTGCCGGACCCGACCGCTATGGGGGTAGGGCTAAGCCATGGGAGTCGTACGCCCTCGGGTAAGAGGGCGTGGCGGACGGCTGAGTAACACGTGGCTAACTTACCCTCGGGACCCGGATAACTCCGGGAAACTGGAGCTAATCCGGGGCAGGCGAAGGGTACTGGAACGTCCCTTCGCCTAAAGGGG

In [152]:
features_16s_df['metadata'].iloc[0]

'GCF_000213215.1;NC_015518.1;loc:1739     761380\n3035    1296061\nName: start, dtype: int64,1739     762035\n3035    1297559\nName: end, dtype: int64;strand:+;length1739     657\n3035    1500\nName: length, dtype: int64'

In [1]:
import pandas as pd

from genome_assemblies.summary import (
    load_assembly_summary_table,
    cleanup_infraspecific_names,
    strain_name_from_organism_name,
    add_detailed_strain_designation
)

In [4]:
assembly_summary_filename = '../data/bacteria_assembly_summary_2023-07-13.txt'
assembly_levels = ['Complete Genome', 'Chromosome', 'Scaffold', 'Contig']
refseq_levels = ['reference genome', 'representative genome', 'na']

as_df = load_assembly_summary_table(assembly_summary_filename)	
as_df['infraspecific_name'] = as_df['infraspecific_name'].apply(cleanup_infraspecific_names)
as_df['strain_name'] = as_df['organism_name'].apply(strain_name_from_organism_name)
as_df['assembly_level'] = pd.Categorical(
	as_df['assembly_level'],
	categories=assembly_levels,
	ordered=True,
)
as_df['refseq_category'] = pd.Categorical(
	as_df['refseq_category'],
	categories=refseq_levels,
	ordered=True,
)

as_df

Unnamed: 0,#assembly_accession,refseq_category,taxid,organism_name,infraspecific_name,assembly_level,seq_rel_date,ftp_path,strain_name
0,GCF_900128725.1,na,9,Buchnera aphidicola,BCifornacula,Complete Genome,2016/11/25,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...,Buchnera aphidicola
1,GCF_948107705.1,na,9,Buchnera aphidicola,na,Scaffold,2023/05/30,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...,Buchnera aphidicola
2,GCF_008244535.1,na,14,Dictyoglomus thermophilum,PYS_80_B,Contig,2019/09/02,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,Dictyoglomus thermophilum
3,GCF_026418735.1,na,14,Dictyoglomus thermophilum,na,Contig,2022/11/27,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,Dictyoglomus thermophilum
4,GCF_003854875.1,na,17,Methylophilus methylotrophus,D22,Chromosome,2018/12/01,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,Methylophilus methylotrophus
...,...,...,...,...,...,...,...,...,...
299256,GCF_030410515.1,na,3058043,Arthrobacter sp. YD4,YD4,Scaffold,2023/07/05,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,Arthrobacter sp. YD4
299257,GCF_030410475.1,na,3058046,Arthrobacter sp. YD2,YD2,Scaffold,2023/07/05,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,Arthrobacter sp. YD2
299258,GCF_030410375.1,na,3058168,Zwartia sp. IMCC34845,IMCC34845,Contig,2023/07/05,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,Zwartia sp. IMCC34845
299259,GCF_029958925.1,na,3058420,Tropicibacter sp. YMD87,YMD87,Complete Genome,2023/05/09,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,Tropicibacter sp. YMD87


In [5]:
as_df['strain_name'] = as_df.apply(add_detailed_strain_designation, axis=1)

In [6]:
as_df = as_df.sort_values(
	by=['assembly_level', 'refseq_category', 'seq_rel_date'],
	ascending=[False, False, False]
)
as_filter_df = (
	as_df
	.groupby('strain_name')
	.first()
	.reset_index(drop=True)
)

as_filter_df

Unnamed: 0,#assembly_accession,refseq_category,taxid,organism_name,infraspecific_name,assembly_level,seq_rel_date,ftp_path
0,GCF_002973605.1,representative genome,1960156,Abditibacterium utsteinense,LMG 29911,Contig,2018/03/06,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
1,GCF_000160075.2,na,592010,Abiotrophia defectiva ATCC 49176,ATCC 49176,Scaffold,2013/10/31,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
2,GCF_026783725.1,na,46125,Abiotrophia defectiva,D14035481,Contig,2022/12/09,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
3,GCF_013267415.1,representative genome,46125,Abiotrophia defectiva,FDAARGOS_785,Complete Genome,2020/06/04,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
4,GCF_001815865.1,na,1581061,Abiotrophia sp. HMSC24B09,HMSC24B09,Scaffold,2016/10/21,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...
...,...,...,...,...,...,...,...,...
276905,GCF_934667745.1,na,354118,uncultured Victivallis sp.,na,Contig,2022/04/16,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...
276906,GCF_946221545.1,na,1161389,uncultured Weeksella sp.,na,Contig,2022/09/12,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...
276907,GCF_934230365.1,na,253243,uncultured Weissella sp.,na,Contig,2022/04/16,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...
276908,GCF_947502365.1,na,255433,uncultured Zobellia sp.,na,Contig,2023/01/30,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...
