<a href="https://colab.research.google.com/github/lauraluebbert/delphy_workflows/blob/main/lassa_workflow_Nisha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download data from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) using [gget](https://pachterlab.github.io/gget/)
___

## 1. Select your virus of interest and apply filters to the genomes downloaded from NCBI Virus

In [1]:
# User-editable settings for the NCBI Virus download.
# Query target: a taxon name, a taxon ID, or (with accession = True) an NCBI accession.
virus = 'Mammarenavirus lassaense' # Examples: 'Norovirus' or 'coronaviridae' or 'NC_045512.2' or '142786' (Norovirus taxid)
accession = False                  # If 'virus' is an NCBI accession instead of a taxon (e.g. 'NC_045512.2'), set this to True

# Commonly used filtering options (set any filter to None to turn off the filter):
host = 'homo sapiens'             # Example: 'homo sapiens' (alternatively: use the host_taxid filter below)
min_seq_length = None             # Example: 6252
max_seq_length = None             # Example: 7815
has_proteins = 'S'                # Example: 'GPC' or 'L' or ['GPC', 'L'] (also accepts genes or segments)
proteins_complete = False         # True or False (indicates whether the proteins/genes/segments in has_proteins should be marked 'complete')

# Where and when the sample was collected:
geographic_location = None        # Example: 'South_Africa' or 'Germany'
min_collection_date = None        # Example: '2000-01-01'
max_collection_date = None        # Example: '2014-12-04'

# Additional filtering options:
min_gene_count = None             # Example: 1
max_gene_count = None             # Example: 40
nuc_completeness = None           # 'partial' or 'complete'
host_taxid = None                 # Example: 9443 (NCBI Taxonomy ID of all primates)
lab_passaged = None               # True or False (indicates whether the virus sequence has been passaged in a laboratory setting)
geographic_region = None          # Example: 'Africa' or 'Europe'
submitter_country = None          # Example: 'South_Africa' or 'Germany'
annotated = None                  # True or False (indicates whether the virus genome sequence should be annotated)
source_database = None            # Example: 'GenBank' or 'RefSeq'
min_release_date = None           # Example: '2000-01-01'
max_release_date = None           # Example: '2014-12-04'
min_mature_peptide_count = None   # Example: 2
max_mature_peptide_count = None   # Example: 15
min_protein_count = None          # Example: 2
max_protein_count = None          # Example: 15
max_ambiguous_chars = None        # Example: 10

## 2. Upload a reference fasta file
1. Click on the folder icon on the left
2. Upload your file to the Google Colab server by dragging it into the file panel (or right-click -> Upload)
3. Specify the name of your file here:

In [2]:
# File name of the uploaded reference FASTA; it is the first input given to MAFFT in the alignment step.
reference = "your_reference.fasta"

## 3. Click on 'Runtime' -> 'Run all' and lean back
___

### Installing gget:

In [None]:
# After the release, this will just be: pip install gget
!pip install -q mysql-connector-python==8.0.29 biopython
!pip install -q --log log git+https://github.com/pachterlab/gget.git@delphy_dev

import gget

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone


Full descriptions for the filtering options:

In [None]:
# Print the docstring of gget.ncbi_virus, which describes each filtering option.
help(gget.ncbi_virus)

### Downloading virus genomes from NCBI Virus:

This might take a minute depending on the internet connection and how busy the NCBI server is.

In [None]:
%%time
gget.ncbi_virus(
    virus = virus,
    accession = accession,
    host = host,
    min_seq_length = min_seq_length,
    max_seq_length = max_seq_length,
    min_gene_count = min_gene_count,
    max_gene_count = max_gene_count,
    nuc_completeness = nuc_completeness,
    has_proteins = has_proteins,
    proteins_complete = False,
    host_taxid = host_taxid,
    lab_passaged = lab_passaged,
    geographic_region = geographic_region,
    geographic_location = geographic_location,
    submitter_country = submitter_country,
    min_collection_date = min_collection_date,
    max_collection_date = max_collection_date,
    annotated = annotated,
    source_database = source_database,
    min_release_date = min_release_date,
    max_release_date = max_release_date,
    min_mature_peptide_count = min_mature_peptide_count,
    max_mature_peptide_count = max_mature_peptide_count,
    min_protein_count = min_protein_count,
    max_protein_count = max_protein_count,
    max_ambiguous_chars = max_ambiguous_chars
)

___
# Show metadata

In [None]:
import pandas as pd
# Lift pandas' display limits so DataFrames are rendered with every column and
# every row (large tables will produce correspondingly large cell output).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# The metadata CSV is named after the virus with spaces replaced by underscores.
metadata_csv = virus.replace(' ', '_') + "_metadata.csv"
metadata = pd.read_csv(metadata_csv)
metadata

___
# Align viral sequences to reference and return identity percentages
NOTE: All of the code below will be wrapped into a new gget.mafft module, so the cells below will become one command: `gget.mafft(query_fasta, reference_fasta)`

In [None]:
%%time
#Installing MAFFT
!apt-get install -qq -y mafft

# Aligning sequences to each other using mafft

query_fasta = f"{virus}_sequences.fasta"
reference_fasta = reference
mafft_output = f"{virus}_alignment.fasta"

!mafft \
  --quiet \
  --auto \
  --thread 2 \
  $reference_fasta $query_fasta > $mafft_output

In [None]:
# Code to compute identity percentages and other stats from mafft alignment output
import pandas as pd

def read_fasta(file_path):
    """Parse a FASTA file into a dict mapping header text to sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict in file order: each key is a header line without its leading ">"
        (the full remainder of the line), each value is that record's sequence
        lines stripped of surrounding whitespace and concatenated. If a header
        repeats, the later record replaces the earlier one.
    """
    records = {}
    current_id = None
    chunks = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not text.startswith(">"):
                # Sequence (or blank) line: accumulate for the current record.
                chunks.append(text)
                continue

            # New header: store the record collected so far, then start afresh.
            if current_id:
                records[current_id] = ''.join(chunks)
            current_id = text[1:]
            chunks = []

    # Store the final record, which has no following header to trigger it.
    if current_id:
        records[current_id] = ''.join(chunks)

    return records


def calculate_identity(seq1, seq2):
    """Compare two aligned sequences column by column.

    Args:
        seq1: First aligned sequence; '-' marks a gap.
        seq2: Second aligned sequence; must have the same aligned length as seq1.

    Returns:
        Tuple (identity_percentage, matches, gaps, seq1_length, seq2_length):
          - identity_percentage: matches divided by the number of columns in
            which neither sequence has a gap, times 100 (0 if there are no
            such columns).
          - matches: columns where both residues are non-gap and equal.
          - gaps: columns where at least one of the two sequences has a gap.
          - seq1_length / seq2_length: number of non-gap characters in each
            sequence. (The aligned lengths are equal by construction, so
            returning them would give two identical numbers.)

    Raises:
        ValueError: If the two sequences differ in aligned length.
    """
    if len(seq1) != len(seq2):
        raise ValueError("The aligned sequences must be of equal length")

    matches = 0
    compared = 0  # columns in which neither sequence has a gap
    gaps = 0

    # Single pass over the alignment columns.
    for res1, res2 in zip(seq1, seq2):
        if res1 == '-' or res2 == '-':
            gaps += 1
            continue
        compared += 1
        if res1 == res2:
            matches += 1

    identity_percentage = (matches / compared) * 100 if compared > 0 else 0

    # Ungapped lengths: aligned length minus this sequence's own gap characters.
    seq1_length = len(seq1) - seq1.count('-')
    seq2_length = len(seq2) - seq2.count('-')

    return identity_percentage, matches, gaps, seq1_length, seq2_length


def calculate_multiple_identity(alignment_file):
    """Score every sequence in an alignment against the first sequence.

    The first record in the alignment file is treated as the reference; each
    later record is compared with it using calculate_identity.

    Args:
        alignment_file: Path of an aligned FASTA file.

    Returns:
        pandas DataFrame with one row per query sequence and the columns
        Query_ID, Reference_ID, Identity_Percentage, Exact_Matches, Gaps,
        Query_Length and Reference_Length.

    Raises:
        ValueError: If the file holds fewer than two sequences.
    """
    aligned = read_fasta(alignment_file)

    # A reference plus at least one query is required.
    if len(aligned) < 2:
        raise ValueError("The alignment file must contain at least two sequences")

    # The first record is the reference; the iterator then yields the queries.
    entries = iter(aligned.items())
    reference_id, reference_seq = next(entries)

    rows = []
    for query_id, query_seq in entries:
        pct, n_matches, n_gaps, ref_len, qry_len = calculate_identity(reference_seq, query_seq)
        rows.append(
            {
                "Query_ID": query_id,
                "Reference_ID": reference_id,
                "Identity_Percentage": pct,
                "Exact_Matches": n_matches,
                "Gaps": n_gaps,
                "Query_Length": qry_len,
                "Reference_Length": ref_len,
            }
        )

    # One DataFrame built from all collected rows.
    return pd.DataFrame(rows)

In [None]:
# Score each aligned sequence against the reference, write the table to CSV,
# and display it as the cell output.
results_csv = f"{virus}_alignment_results.csv"
identity_df = calculate_multiple_identity(mafft_output)
identity_df.to_csv(results_csv, index=False)

identity_df

___

## All done! 🎉
To download the files generated in this notebook to your local computer, click on the folder icon on the left, then right-click a file of interest and select 'Download'.