<a href="https://colab.research.google.com/github/lauraluebbert/delphy_workflows/blob/main/one_seq_per_virus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get the longest sequence for each human-infecting virus from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) using [gget](https://pachterlab.github.io/gget/)
___

## 1. Select your virus of interest and apply filters to the genomes downloaded from NCBI Virus

In [5]:
# Query settings for gget.ncbi_virus. Every variable below is passed to the
# download call further down under the keyword argument of the same name.
virus = 10239                      # 10239 is the NCBI taxid for all viruses
# NOTE: `virus` is also used later in this notebook to build the names of the
# metadata CSV and FASTA files that are read back from /content.
accession = False                  # If 'virus' is an NCBI accession instead of a taxon (e.g. 'NC_045512.2'), set this to True

# Commonly used filtering options (set any filter to None to turn off the filter):
host = 'homo sapiens'             # Example: 'homo sapiens' (alternatively: use the host_taxid filter below)
min_seq_length = None             # Example: 6252
max_seq_length = None             # Example: 7815

has_proteins = None               # Example: 'GPC' or 'L' or ['GPC', 'L'] (also accepts genes or segments)
proteins_complete = False         # True or False (indicates whether the proteins/genes/segments in has_proteins should be marked 'complete')

geographic_location = None        # Example: 'South_Africa' or 'Germany'
min_collection_date = None        # Example: '2000-01-01'
max_collection_date = None        # Example: '2014-12-04'
max_ambiguous_chars = None        # Example: 10

# Additional filtering options:
min_gene_count = None             # Example: 1
max_gene_count = None             # Example: 40
nuc_completeness = None           # 'partial' or 'complete'
host_taxid = None                 # Example: 9443 (NCBI Taxonomy ID of all primates)
lab_passaged = None               # True or False (indicates whether the virus sequence has been passaged in a laboratory setting)
geographic_region = None          # Example: 'Africa' or 'Europe'
submitter_country = None          # Example: 'South_Africa' or 'Germany'
annotated = None                  # True or False (indicates whether the virus genome sequence should be annotated)
source_database = None            # Example: 'GenBank' or 'RefSeq'
min_release_date = None           # Example: '2000-01-01'
max_release_date = None           # Example: '2014-12-04'
min_mature_peptide_count = None   # Example: 2
max_mature_peptide_count = None   # Example: 15
min_protein_count = None          # Example: 2
max_protein_count = None          # Example: 15

## 2. Click on 'Runtime' -> 'Run all' and lean back
___

### Installing gget:

In [6]:
# After the release, this will just be: pip install gget
# NOTE(review): the recorded output of this cell reports that gget 0.29.1
# requires mysql-connector-python>=8.0.32, which conflicts with the 8.0.29 pin
# below -- confirm whether this pin is still needed.
!pip install -q mysql-connector-python==8.0.29 biopython
# Install gget from the delphy_dev branch of its GitHub repository; pip's
# verbose log is written to a file named 'log'.
!pip install -q --log log git+https://github.com/pachterlab/gget.git@delphy_dev

import gget

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gget 0.29.1 requires mysql-connector-python>=8.0.32, but you have mysql-connector-python 8.0.29 which is incompatible.
  Preparing metadata (setup.py) ... done


Full descriptions for the filtering options:

In [7]:
# Print the docstring of gget.ncbi_virus, which describes every filtering argument.
help(gget.ncbi_virus)

Help on function ncbi_virus in module gget.gget_ncbi_virus:

ncbi_virus(virus, accession=False, outfolder=None, host=None, min_seq_length=None, max_seq_length=None, min_gene_count=None, max_gene_count=None, nuc_completeness=None, has_proteins=None, proteins_complete=False, host_taxid=None, lab_passaged=None, geographic_region=None, geographic_location=None, submitter_country=None, min_collection_date=None, max_collection_date=None, annotated=None, source_database=None, min_release_date=None, max_release_date=None, min_mature_peptide_count=None, max_mature_peptide_count=None, min_protein_count=None, max_protein_count=None, max_ambiguous_chars=None)
    Download a virus genome dataset from the NCBI Virus database (https://www.ncbi.nlm.nih.gov/labs/virus/).
    
    Args:
    - virus                Virus taxon or accession, e.g. 'Norovirus' or 'coronaviridae' or
                           '11320' (taxid of Influenza A virus) or 'NC_045512.2'
                           If this input is a v

### Downloading virus genomes from NCBI Virus:

This might take a minute depending on the internet connection and how busy the NCBI server is.

In [None]:
%%time
gget.ncbi_virus(
    virus = virus,
    accession = accession,
    host = host,
    min_seq_length = min_seq_length,
    max_seq_length = max_seq_length,
    min_gene_count = min_gene_count,
    max_gene_count = max_gene_count,
    nuc_completeness = nuc_completeness,
    has_proteins = has_proteins,
    proteins_complete = proteins_complete,
    host_taxid = host_taxid,
    lab_passaged = lab_passaged,
    geographic_region = geographic_region,
    geographic_location = geographic_location,
    submitter_country = submitter_country,
    min_collection_date = min_collection_date,
    max_collection_date = max_collection_date,
    annotated = annotated,
    source_database = source_database,
    min_release_date = min_release_date,
    max_release_date = max_release_date,
    min_mature_peptide_count = min_mature_peptide_count,
    max_mature_peptide_count = max_mature_peptide_count,
    min_protein_count = min_protein_count,
    max_protein_count = max_protein_count,
    max_ambiguous_chars = max_ambiguous_chars
)

In [None]:
# Keep only the longest sequence for each virus
!pip install -q biopython
from Bio import SeqIO
import pandas as pd

df_meta = pd.read_csv(f"/content/{'_'.join(virus.split(' '))}_metadata.csv")

sort_by = 'Length'
acc_to_keep = []
for virus in df_meta["Organism Name"].unique():
  acc_to_keep.append(df_meta[df_meta["Organism Name"] == virus].sort_values(sort_by, ascending=False)["accession"].values[0])

def filter_fasta_by_accessions(fasta_file, accessions, output_file):
    # Open the output file to write filtered sequences
    with open(output_file, 'w') as output_handle:
        # Iterate over the sequences in the FASTA file
        for record in SeqIO.parse(fasta_file, "fasta"):
            # Check if the accession (ID) is in the list
            if record.id in accessions:
                # Write the record to the output file
                SeqIO.write(record, output_handle, "fasta")

fasta_input = f"/content/{'_'.join(virus.split(' '))}_sequences.fasta"
fasta_output = f"/content/{'_'.join(virus.split(' '))}_sequences_filtered.fasta"

filter_fasta_by_accessions(fasta_input, acc_to_keep, fasta_output)

___
# All done! 🎉

### To download the files generated in this notebook to your local computer, click on the folder icon on the left, then right-click a file of interest and select 'Download'.