In [None]:
import topiary
import pandas as pd
import numpy as np

### Read BLAST XML and download sequences from NCBI

A topiary analysis generally starts with BLAST results downloaded from NCBI as XML files. Topiary creates a dataframe from these files and automatically downloads all of the sequences from NCBI.

In [None]:
# Path to the example NCBI BLAST XML file; the next cell passes this same
# path to topiary.
xml_file = "../data/tiny.xml"

# Read every line inside a context manager so the file handle is closed even
# if reading raises partway through.
with open(xml_file) as f:
    lines = f.readlines()

# Preview only the first and last five lines of the file, with a row of dots
# standing in for everything in between.
print("NCBI BLAST XML file looks like:\n")
print("".join(lines[:5]))
print("...\n...\n...")
print("".join(lines[-5:]))

In [None]:
# Load the xml file into a dataframe. A list of xml files can be passed
# instead of a single path to load multiple BLAST results.
df = topiary.ncbi_blast_xml_to_df(xml_file)
# Bare last expression: the notebook displays the dataframe.
df

### Assign human-readable nicknames to sequences
Working with full NCBI names is a pain! Topiary can assign a nickname to each sequence based on user-specified patterns.

In [None]:
# User-specified patterns associated with each nickname.
ly96_patterns = ("lymphocyte antigen 96", "MD2", "MD-2")
ly86_patterns = ("lymphocyte antigen 86", "MD1", "MD-1")

# Keys are the nicknames to assign; values are the patterns for that nickname.
alias_dictionary = {
    "LY96": ly96_patterns,
    "LY86": ly86_patterns,
}

# create_nicknames returns the dataframe that replaces df; display it.
df = topiary.create_nicknames(df, aliases=alias_dictionary)
df

### Find unique species identifiers from Open Tree of Life

In an ancestral sequence reconstruction (ASR) project, we usually want to compare our protein tree to a species tree. Using topiary, we can pull down unique taxonomic identifiers (OTTs) and a species tree from the Open Tree of Life database.

In [None]:
# Look up Open Tree of Life identifiers for the dataframe, using "Animals"
# as the phylogenetic context for the lookup.
df = topiary.get_ott_id(df, phylo_context="Animals")

# Display only the columns of interest for this step.
ott_columns = ["nickname", "species", "ott", "keep"]
df.loc[:, ott_columns]

In [None]:
# Get a species tree for the dataframe, then draw it.
# NOTE(review): width=100 is passed straight to the drawing call; its units
# are not visible here — confirm against the topiary documentation.
species_tree = topiary.get_species_tree(df)
topiary.draw.species_tree(species_tree,width=100)

### Check sequence identities using reverse BLAST

In [None]:
# Alternative: reverse blast against the NCBI nr database, selecting only
# human (taxid 9606). To search more than one taxid, pass a list of taxids.
# df = topiary.reverse_blast(df,
#                            call_dict={"LY96":["lymphocyte antigen 96","esop1"],
#                                       "LY86":"lymphocyte antigen 86"},
#                            ncbi_rev_blast_db="nr",taxid=9606)

# Nickname -> list of text patterns handed to reverse_blast as call_dict.
paralog_patterns = {
    "LY96": ["lymphocyte antigen 96", "esop1"],
    "LY86": ["lymphocyte antigen 86"],
}

# Reverse blast against a local database
df = topiary.reverse_blast(
    df,
    call_dict=paralog_patterns,
    local_rev_blast_db="GRCh38",
)

# Display the reverse-blast result columns alongside the identifying columns.
reverse_blast_columns = [
    "nickname",
    "species",
    "reverse_found_paralog",
    "reverse_paralog",
    "keep",
]
df.loc[:, reverse_blast_columns]

### Lower redundancy of sequences

In [None]:
# Species that are preferentially kept when redundancy is lowered.
key_species = ["Homo sapiens", "Monodelphis domestica"]

# Lower redundancy with a cutoff of 0.85; the returned dataframe replaces df.
df = topiary.remove_redundancy(
    df,
    cutoff=0.85,
    key_species=key_species,
)

# Display which sequences are still flagged in the "keep" column.
redundancy_columns = ["nickname", "species", "keep"]
df.loc[:, redundancy_columns]

### Align the sequences using MUSCLE

In [None]:
# Align the sequences using MUSCLE; the returned dataframe replaces df.
df = topiary.run_muscle(df)

# Display the identifying columns next to the "alignment" column.
alignment_columns = ["nickname", "species", "keep", "name", "alignment"]
df.loc[:, alignment_columns]

### Manually edit alignment, then load back into dataframe

<h4>We <span style="color:red">strongly</span> recommend visually inspecting and editing the alignment.</h4> 

To do so, write the MUSCLE alignment out as a fasta file (next cell) and open it in an alignment editor like [AliView](https://ormbunkar.se/aliview/). Once you've edited the alignment and saved it as a new fasta file, you can load it directly back into the topiary dataframe.

In [None]:
# Write the alignment into a fasta file
topiary.write_fasta(df,"raw-alignment.fasta",seq_column="alignment")

<h4><span style="color:black">EDIT YOUR ALIGNMENT AND SAVE OUT A NEW FASTA FILE</span></h4>

In the cell below, change `manually_edited_fasta_file_name` to be the name of your saved file.

In [None]:
# Name of the fasta file holding the manually edited alignment.
# NOTE: the default below is the same file name the write_fasta cell writes,
# so leaving it unchanged presumably just reloads the unedited alignment.
manually_edited_fasta_file_name = "raw-alignment.fasta" # <- change this filename

# Read the fasta file back into the dataframe; the returned dataframe replaces df.
df = topiary.read_fasta_into(df,manually_edited_fasta_file_name)

# Display the identifying columns next to the "alignment" column.
df.loc[:,["nickname","species","keep","name","alignment"]]

In [None]:
# Search for the best model for the alignment in df, writing to "find-model".
# Only the JTT and LG matrices are searched here.
# NOTE(review): the list below is presumably the fuller set of matrices that
# model_matrices accepts — confirm against the topiary documentation.
#["JTT","LG","WAG","LG4M","LG4X","PROTGTR"]
# NOTE(review): the single empty-string entries for rates, freqs and invariant
# presumably restrict the search to the bare matrices — confirm.
topiary.find_best_model(df,
                        model_matrices=["JTT","LG"],
                        model_rates=[""],
                        model_freqs=[""],
                        model_invariant=[""],
                        output="find-model",
                        overwrite=True)

In [None]:
# Generate the ML tree, starting from the "find-model" directory written by
# find_best_model, and write the result to "ml-tree".
# NOTE(review): bootstrap=False presumably skips bootstrap replicates — confirm.
topiary.generate_ml_tree(previous_dir="find-model",
                         output="ml-tree",
                         overwrite=True,bootstrap=False)

In [None]:
# Run the reconciliation step, starting from the "ml-tree" directory written
# by generate_ml_tree, and write the result to "reconciled".
topiary.reconcile(previous_dir="ml-tree",
                  output="reconciled",
                  overwrite=True)

In [None]:
# Generate ancestors, starting from the "reconciled" directory written by
# reconcile, and write the result to "ancestors".
topiary.generate_ancestors(previous_dir="reconciled",
                           output="ancestors",
                           overwrite=True)

In [None]:
# Draw the ancestor tree from the "ancestors" directory written by
# generate_ancestors.
topiary.draw.ancestor_tree("ancestors/")

In [None]:
# Draw the tree from the "ml-tree" directory written by generate_ml_tree.
# NOTE(review): the other draw cells pass the directory with a trailing slash;
# presumably both forms are accepted — confirm.
topiary.draw.tree("ml-tree")

In [None]:
# Draw the tree from the "reconciled" directory written by reconcile.
topiary.draw.tree("reconciled/")

In [None]:
# Draw the tree from the "ancestors" directory written by generate_ancestors,
# this time with the generic draw.tree call.
topiary.draw.tree("ancestors/")

In [None]:
# Draw the tree from the "find-model" directory written by find_best_model.
topiary.draw.tree("find-model/")