# Setup

In [None]:
import os
import shutil
import urllib.request, urllib.parse, urllib.error
from functools import reduce

import numpy as np
import pandas as pd

import plotly.express as px
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

from tqdm.notebook import tqdm

In [None]:
# Input tables for this notebook; both are loaded with pd.read_csv.
# The output CSVs written at the end of the notebook go into the same directory.
FILTERED_SUMMARY = '../../data/metadata/filtered_species_summary.csv'
FILTERED_METADATA = '../../data/metadata/filtered_species_metadata.csv'

In [None]:
# Load both tables with every column kept as a plain object (string) column
filtered_species_summary, filtered_species_metadata = (
    pd.read_csv(csv_path, index_col=0, dtype='object')
    for csv_path in (FILTERED_SUMMARY, FILTERED_METADATA)
)

# Ensure genome_id is a str in both tables
for table in (filtered_species_summary, filtered_species_metadata):
    table['genome_id'] = table['genome_id'].astype('str')

display(filtered_species_summary.shape, filtered_species_summary.head())

In [None]:
# Directory holding the downloaded genomes, one sub-folder per genome id
download_paths = '../../data/raw/genomes/'

In [None]:
# Set of genome ids to download
dl_strains = set(filtered_species_summary.genome_id.astype(str))

# Any previously downloaded strains in directory that do not meet current filtration criteria
# This only matters when you are re-running this notebook/workflow
bad_dl = []

# On a first run the download directory does not exist yet, so there is nothing to reconcile
existing_entries = os.listdir(download_paths) if os.path.isdir(download_paths) else []

for folder in existing_entries:
    # Only genome folders are reconciled. Stray files (e.g. .DS_Store) are skipped so that
    # the later shutil.rmtree clean-up of bad_dl is never handed a non-directory.
    if not os.path.isdir(os.path.join(download_paths, folder)):
        continue

    if folder in dl_strains:
        dl_strains.remove(folder) # already on disk, no need to download again
    else:
        bad_dl.append(folder)

In [None]:
# Number of genomes still to be downloaded
len(dl_strains) # if len is 0, all strains are already downloaded

In [None]:
# Number of entries on disk that are not in the current selection
len(bad_dl) # if len is 0, no bad downloads need to be removed

In [None]:
# Remove every folder listed in bad_dl from the download directory.
# Only has an effect on a rerun, when stale downloads are present.
for stale_folder in tqdm(bad_dl):
    print('removing folder:', stale_folder)
    stale_path = os.path.join(download_paths, stale_folder)
    shutil.rmtree(stale_path)

# Download (first-pass) filtered genomes

In [None]:
# The file types that can be downloaded from BV-BRC
VALID_BV_BRC_FILES = [
    'faa','features.tab','ffn','frn','gff','pathway.tab', 'spgene.tab','subsystem.tab','fna'
]

def _resolve_bv_brc_filetypes(filetypes):
    '''
    Translate requested filetypes into (remote suffix, local suffix) pairs.

    Parameters
    ----------
    filetypes : str or iterable of str
        Requested filetypes, with or without a leading 'PATRIC.'.

    Returns
    -------
    list of (str, str)
        One pair per valid filetype. Invalid filetypes are reported
        with a printed message and skipped.
    '''
    if isinstance(filetypes, str): # a bare string would otherwise be iterated per character
        filetypes = [filetypes]

    prefix = 'PATRIC.'
    source_target_filetypes = []
    for ftype in tqdm(filetypes, desc='Processing filetypes...'):
        # Accept both 'faa' and 'PATRIC.faa'; the local file never carries 'PATRIC'
        ftype_target = ftype[len(prefix):] if ftype.startswith(prefix) else ftype
        if ftype_target not in VALID_BV_BRC_FILES:
            print('Invalid filetype:', ftype)
            continue

        # all remote files except FNA are preceded by 'PATRIC'
        ftype_source = ftype_target if ftype_target == 'fna' else prefix + ftype_target
        source_target_filetypes.append( (ftype_source, ftype_target) )

    return source_target_filetypes


def download_bv_brc_genomes(genomes, output_dir, filetypes=('fna','faa','gff','spgene.tab'), redownload=False):
    '''
    Download data associated with a list of PATRIC genomes.
    
    Parameters
    ----------
    genomes : iterable of str
        PATRIC genome IDs to download. A single ID may also be given
        as a bare string.
    output_dir : str
        Path to directory to save genomes. Created if missing. A
        subfolder is created for each genome in this directory.
    filetypes : iterable of str
        BV-BRC genome-specific files to download per genome. 
        Valid options include 'faa', 'features.tab', 'ffn', 'frn',
        'gff', 'pathway.tab', 'spgene.tab', 'subsystem.tab', and
        'fna'. 'PATRIC' in filename is dropped automatically.
        See ftp://ftp.bvbrc.org/genomes/<genome id>/ for
        examples (default ('fna','faa','gff','spgene.tab'))
    redownload : bool
        If True, re-downloads files that exist locally (default False)

    Returns
    -------
    list of str
        Genome IDs for which at least one requested file could not be
        downloaded. The subfolder of each such genome is deleted.
    '''
    bad_genomes = []

    os.makedirs(output_dir, exist_ok=True)

    # Process filetypes
    source_target_filetypes = _resolve_bv_brc_filetypes(filetypes)
    if not source_target_filetypes:
        # Without this guard an empty folder would be created per genome and
        # would look like a finished download on the next run.
        print('No valid filetypes requested, nothing to download')
        return bad_genomes

    # Materialise so that sets/generators work and len() is available for the progress bar
    genomes = [genomes] if isinstance(genomes, str) else list(genomes)

    # Download relevant files
    for i, genome in tqdm(enumerate(genomes), desc='Downloading selected files...', total=len(genomes)):
        # Set up source and target locations
        genome_source = 'ftp://ftp.bvbrc.org/genomes/' + genome + '/' + genome # base link to genome files
        genome_dir = os.path.join(output_dir, genome) # genome-specific output directory
        genome_target = os.path.join(genome_dir, genome) # genome-specific output base filename
        os.makedirs(genome_dir, exist_ok=True)

        # Process individual files
        try:
            for source_filetype, target_filetype in source_target_filetypes:
                source = genome_source + '.' + source_filetype
                target = genome_target + '.' + target_filetype
                if os.path.exists(target) and not redownload:
                    print(i+1, 'Already exists:', target)
                else:
                    print(i+1, source, '->', target)
                    urllib.request.urlretrieve(source, target)
                    urllib.request.urlcleanup()

        # genome ID not found or not all files can be downloaded
        except OSError as err: # urllib errors are OSError subclasses
            print('Bad genome ID:', genome, f'({err})')
            # Drop the whole folder so a partial download is never mistaken for a complete one
            shutil.rmtree(genome_dir)
            bad_genomes.append(genome)

    return bad_genomes



In [None]:
# Files have already been moved into the mash directory, no need to move them
RAW_GENOMES = '../../data/raw/mash_genomes/'

# Each sub-directory name is used as a genome id (plain files are ignored)
items = [
    entry for entry in os.listdir(RAW_GENOMES)
    if os.path.isdir(os.path.join(RAW_GENOMES, entry))
]

# Path of the `<id>.fna` file inside each genome folder (existence is not checked here)
item_paths = [os.path.join(RAW_GENOMES, entry, f'{entry}.fna') for entry in items]

display(items[:5], item_paths[:5])

In [None]:
# Fetch the fna, faa and gff files for every strain that is not on disk yet;
# the returned list holds the genome ids whose download failed
bad_genomes = download_bv_brc_genomes(
    genomes=dl_strains,
    output_dir=download_paths,
    filetypes=['fna','faa','gff'],
    redownload=False,
)

In [None]:
# Number of genomes whose download failed (as returned by download_bv_brc_genomes)
len(bad_genomes)

In [None]:
# Strains requested in this run that were downloaded without error, in sorted order
dl_success = sorted(dl_strains.difference(bad_genomes))
len(dl_success)

### Remove genomes with empty fasta files

In [None]:
# Some fasta files on BV-BRC are empty. Delete the folders of those genomes and
# record them in bad_genomes so they are also dropped from summary/metadata below.

count = 0
for folder in os.listdir(download_paths):
    genome_dir = os.path.join(download_paths, folder)
    if not os.path.isdir(genome_dir):
        continue # stray file (e.g. .DS_Store), not a genome folder

    # A genome folder is expected to contain `<genome id>.fna`; a missing file still
    # raises here, which signals an incomplete download rather than hiding it.
    size = os.path.getsize(os.path.join(genome_dir, folder + '.fna'))
    if size == 0:
        count += 1
        bad_genomes.append(folder)
        print('Removing folder:', folder)
        shutil.rmtree(genome_dir)
print("Empty downloads total:", count)


# Update `summary` & `metadata`

In [None]:
# Genome ids that are in the summary and were not flagged as bad downloads
filtered_species_summary['genome_id'] = filtered_species_summary['genome_id'].astype('str')
downloaded_genomes = set(filtered_species_summary['genome_id']).difference(bad_genomes)

# Keep one row per genome, restricted to the downloaded genomes and sorted by id
summary_by_genome = filtered_species_summary.drop_duplicates(subset=['genome_id']).set_index('genome_id')
filtered_species_summary = summary_by_genome.loc[sorted(downloaded_genomes)].reset_index()

display(filtered_species_summary.shape, filtered_species_summary.head())

In [None]:
filtered_species_metadata['genome_id'] = filtered_species_metadata['genome_id'].astype('str')

# Keep one row per genome, restricted to the downloaded genomes and sorted by id.
# .loc raises a KeyError if a downloaded genome id is absent from the metadata table.
metadata_by_genome = filtered_species_metadata.drop_duplicates(subset=['genome_id']).set_index('genome_id')
filtered_species_metadata = metadata_by_genome.loc[sorted(downloaded_genomes)].reset_index()

display(filtered_species_metadata.shape, filtered_species_metadata.head())

In [None]:
# Num of complete seqs
filtered_species_summary['genome_status'].eq('Complete').sum()

In [None]:
# Write the downloaded-only summary next to the input summary file.
# os.path.dirname keeps working if the input filename is ever renamed,
# unlike splitting the path on the hard-coded filename.
filepath = os.path.join(os.path.dirname(FILTERED_SUMMARY), 'filtered_downloaded_species_summary.csv')
filtered_species_summary.to_csv(filepath)

In [None]:
# Write the downloaded-only metadata next to the input metadata file.
# os.path.dirname keeps working if the input filename is ever renamed,
# unlike splitting the path on the hard-coded filename.
filepath = os.path.join(os.path.dirname(FILTERED_METADATA), 'filtered_downloaded_species_metadata.csv')
filtered_species_metadata.to_csv(filepath)