# Notebook title

Notebook description

<!-- replace template-for-colab in the url with whatever this notebook is called -->
<a href="https://githubtocolab.com/harmslab/topiary-examples/blob/main/template-for-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initialize environment

In [None]:
### THIS CELL SETS UP TOPIARY IN A GOOGLE COLAB ENVIRONMENT. 
### IF RUNNING THIS NOTEBOOK LOCALLY, IT MAY BE SAFELY DELETED

#@title Install software (colab-only)

#@markdown Install the software by pressing the _Play_ button on the left.
#@markdown Please be patient. This will take several minutes. After the 
#@markdown installation is complete, <font color='blue'>the kernel will reboot and colab will
#@markdown complain that the kernel has died. This is normal.</font>

#@markdown If you wish to install raxml or generax, select the check boxes below. 
#@markdown Installing these packages takes a few minutes. Note: you can select
#@markdown the checkboxes and re-run the cell after doing the 
#@markdown initial installation. 

# Colab form checkboxes. When True, the matching installer script (raxml /
# generax) is appended to the list of installation steps run by this cell.
install_raxml = False #@param {type:"boolean"}
install_generax = False #@param {type:"boolean"}

# This assumes python 3.8 (the current colab default, 2022/12/07)
# If colab updates from python 3.8, change:

# 1. Miniconda3-py38_4.12.0-Linux-x86_64.sh
# 2. conda install --channel defaults conda python=3.8 --yes
# 3. Refs to /usr/local/lib/python3.8/site-packages

# Change these in both the "Install" and "Initialize" cells

from tqdm.auto import tqdm
import sys
import subprocess
import time
import os
import re

def run_install_cmd(bash_to_run,description):
    """
    Run an installation command.

    The command is written to a temporary script inside the ``software``
    directory (relative to the current working directory) and run with bash.
    On success, the captured stdout/stderr are saved next to the script and a
    marker file is written to /content/software so the step is skipped the
    next time this function is called with the same description.

    Parameters
    ----------
    bash_to_run : str
        bash command as a string
    description : str
        description of what is being done

    Raises
    ------
    RuntimeError
        if the bash script exits with a non-zero return code. The captured
        stdout and stderr are printed before raising.
    """

    no_space = re.sub(" ","_",description)
    status_file = f"/content/software/{no_space}.installed"

    # Skip steps that already completed in a previous run of the cell
    if os.path.isfile(status_file):
        print(f"{description} already installed.")
        return

    # Remember the starting directory so it is always restored, even when the
    # installation fails. Otherwise a failed install leaves the kernel inside
    # "software" and the next attempt cannot chdir into it again.
    start_dir = os.getcwd()
    os.chdir("software")
    try:

        print(f"Installing {description}... ",end="",flush=True)

        script_name = f"{no_space}_tmp-script.sh"
        with open(script_name,"w") as f:
            f.write(bash_to_run)

        result = subprocess.run(["bash",script_name],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                text=True)

        if result.returncode != 0:
            print(result.stdout,flush=True)
            print(result.stderr,flush=True)
            raise RuntimeError("Installation failed!")

        # Keep the logs of successful installs for later inspection
        with open(f"{no_space}_stdout.txt","w") as f:
            f.write(result.stdout)

        with open(f"{no_space}_stderr.txt","w") as f:
            f.write(result.stderr)

    finally:
        os.chdir(start_dir)

    # Record success so this step is not repeated
    with open(status_file,'w') as f:
        f.write("Installed\n")

    print("Complete.",flush=True)

    
# Bash script: download the Miniconda (python 3.8) installer, install it into
# /usr/local, pin conda's python to 3.8, then update everything from the
# defaults channel.
miniconda = \
"""
unset PYTHONPATH
MINICONDA_INSTALLER_SCRIPT=Miniconda3-py38_4.12.0-Linux-x86_64.sh
MINICONDA_PREFIX=/usr/local
wget https://repo.anaconda.com/miniconda/$MINICONDA_INSTALLER_SCRIPT --quiet
chmod +x $MINICONDA_INSTALLER_SCRIPT
./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX
conda install --channel defaults conda python=3.8 --yes
conda update --channel defaults --all --yes
"""

# Bash script: add the conda-forge and bioconda channels, then install the
# scientific python stack plus the muscle (>=5.0) and blast command line tools.
conda_packages = \
"""
conda config --add channels conda-forge
conda config --add channels bioconda
conda install --channel defaults numpy pandas xlrd openpyxl matplotlib "muscle>=5.0" blast --yes --strict-channel-priority
"""

# Bash script: place a ghostscript 10.0.0 binary at /usr/local/bin/gs and
# pip-install the remaining python dependencies using /usr/bin/python3.
pip_packages = \
"""
# Install ghostscript binary (so toyplot doesn't complain)
wget https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs1000/ghostscript-10.0.0-linux-x86_64.tgz
tar -zxf ghostscript-10.0.0-linux-x86_64.tgz
mv ghostscript-10.0.0-linux-x86_64/gs-1000-linux-x86_64 /usr/local/bin/gs

/usr/bin/python3 -m pip install mpi4py opentree ete3 dendropy biopython pastml toytree toyplot 
"""

# Bash script: download the raxml-ng 1.1.0 release zip, unpack it, and copy
# the binary into /usr/local/bin.
raxml = \
"""
wget https://github.com/amkozlov/raxml-ng/releases/download/1.1.0/raxml-ng_v1.1.0_linux_x86_64.zip
unzip raxml-ng_v1.1.0_linux_x86_64.zip
cp raxml-ng /usr/local/bin
"""

# Bash script: install the build dependencies with apt, clone GeneRax (with
# submodules), build it, and copy the binary into /usr/local/bin.
#
# --yes keeps apt-get from stopping at its confirmation prompt when the script
# is run non-interactively. The script deliberately ends on the `cp` (no
# trailing `cd ..`): the script's exit status is that of its last command, so
# a failed build now yields a non-zero status instead of being masked.
generax = \
"""
apt-get install --yes flex bison libgmp3-dev
rm -rf GeneRax
git clone --recursive https://github.com/BenoitMorel/GeneRax
cd GeneRax
./install.sh
cp build/bin/generax /usr/local/bin
"""

# Bash script: clone the main branch of topiary, patch its mpirun calls so
# they include --allow-run-as-root, and pip-install it with /usr/bin/python3.
#
# The script deliberately ends on the pip install (no trailing `cd ..`): the
# script's exit status is that of its last command, so a failed pip install
# now yields a non-zero status instead of being masked.
#
# The backslashes in the sed expression are doubled so python passes a literal
# "\[" through to bash without relying on an invalid string escape.
topiary = \
"""
rm -rf topiary
git clone --branch main https://github.com/harmsm/topiary.git

cd topiary

# add --allow-run-as-root to mpirun calls
for x in `echo "./topiary/generax/_generax.py ./topiary/generax/_reconcile_bootstrap.py ./topiary/_private/mpi/mpi.py"`; do
    sed -i 's/\\["mpirun",/\\["mpirun","--allow-run-as-root",/g' ${x}
done

/usr/bin/python3 -m pip install . -vv
"""

# Detect a Google Colab session by checking whether the colab-only package can
# be imported. A plain ImportError means "not colab"; any other failure is
# re-raised with an explanatory message.
try:
    import google.colab
except ImportError:
    RUNNING_IN_COLAB = False
except Exception as import_problem:
    message = "Could not figure out if runnning in a colab notebook\n"
    raise Exception(message) from import_problem
else:
    RUNNING_IN_COLAB = True

if RUNNING_IN_COLAB:

    description_list = ["miniconda","conda packages","pip packages"]
                    
    cmd_list = [miniconda,conda_packages,pip_packages]

    if install_raxml:
        description_list.append("raxml-ng")
        cmd_list.append(raxml)

    if install_generax:
        description_list.append("generax")
        cmd_list.append(generax)

    description_list.append("topiary")
    cmd_list.append(topiary)

    print("Setting up environment.",flush=True)

    # Make software directory (if not already there)
    os.system("mkdir -p software")

    # Add conda path to the current python session
    to_append = "/usr/local/lib/python3.8/site-packages"
    if to_append not in sys.path:
        sys.path.append(to_append)

    # Make sure that any new python session that spools up during the installations
    # has the correct site packages. 
    %env PYTHONSTARTUP=/content/software/python_startup.py
    f = open("/content/software/python_startup.py","w")
    f.write("import sys\n")
    f.write("sys.path.append('/usr/local/lib/python3.8/site-packages')\n")
    f.close()


    # Install each package
    pbar = tqdm(range(len(cmd_list)))
    for i in pbar:
        run_install_cmd(cmd_list[i],description_list[i])

        # Update status bar
        pbar.refresh()
        time.sleep(0.5)
        
    # This sleep step makes sure things are done writing to the display before
    # reset
    time.sleep(2)
    os._exit(0)


In [None]:
import topiary
import numpy as np
import pandas as pd 

### EVERYTHING AFTER THIS LINE IS IS USED TO SET UP TOPIARY IN A GOOGLE
### COLAB ENVIRONMENT. IF RUNNING THIS NOTEBOOK LOCALLY, THE LINES BELOW
### IN THIS CELL MAY BE SAFELY DELETED. 

#@title Initialize environment

#@markdown  Run this cell to initialize the environment after installation.
#@markdown (This cell can also be run if the kernel dies during a calculation,
#@markdown allowing you to reload modules without having to
#@markdown reinstall). 

try:
    import google.colab
    RUNNING_IN_COLAB = True
except ImportError:
    RUNNING_IN_COLAB = False
except Exception as e: 
    err = "Could not figure out if runnning in a colab notebook\n"
    raise Exception(err) from e

# Generic setup
if RUNNING_IN_COLAB:
    
    %env PYTHONPATH=""
    %env PYTHONSTARTUP=/content/software/python_startup.py
    %env TOPIARY_MAX_SLOTS=1

    topiary._in_notebook = "colab"

    from tqdm.auto import tqdm
    import sys
    import subprocess
    import time
    import os

    to_append = '/usr/local/lib/python3.8/site-packages'
    if to_append not in sys.path:
        sys.path.append(to_append)

    os.chdir("/content/")

#@markdown Set a working directory on google drive if desired. This should
#@markdown be relative to the directory you see when you go
#@markdown to https://drive.google.com (MyDrive). For example, you could upload
#@markdown the file `alignment.csv` to a directory called `topiary_work` on your 
#@markdown google drive. Topiary will then run in that directory, meaning you 
#@markdown can access that file in the notebook as `alignment.csv`. This
#@markdown also means the output of topiary will all be saved in `topiary_work`.
#@markdown <br/><br/>
#@markdown To work in a temporary colab environment, leave this blank. It is
#@markdown highly recommended that you set a google drive directory. 

# Select a working directory on google drive
google_drive_directory = "topiary_work" #@param {type:"string"}

# Set up google drive
if RUNNING_IN_COLAB and google_drive_directory:

    from google.colab import drive
    drive.mount('/content/gdrive/')

    working_dir = f"/content/gdrive/MyDrive/{google_drive_directory}"
    os.system(f"mkdir -p {working_dir}")
    os.chdir(working_dir)
    
print(f"Working directory: {os.getcwd()}")

## Tests

<font color='red'>This cell and below should be deleted when using this as a template notebook. (Note: all tests should run on both colab and a local computer.)</font>

### Generate an alignment from a seed dataframe

In [None]:
# Example seed spreadsheet, read directly from the topiary-examples repository
df_location = "https://raw.githubusercontent.com/harmslab/topiary-examples/main/data/example-seed.csv"

# Load the seed file with pandas and show it as the cell output
df = pd.read_csv(df_location)
df

In [None]:
# Run topiary's seed_to_alignment on the seed file URL defined above.
# NOTE(review): no output directory is passed, so results presumably land in
# a directory topiary names itself -- confirm against the topiary docs.
topiary.seed_to_alignment(df_location)

### Get ancestors given an alignment

In [None]:
# Small example dataframe (tiny-phylo) read from the topiary-examples repository
tiny_df_location = "https://raw.githubusercontent.com/harmslab/topiary-examples/main/data/tiny-phylo/initial-input/dataframe.csv"

# Run alignment_to_ancestors on it, writing results to the "ali-to-anc" directory
topiary.alignment_to_ancestors(tiny_df_location,out_dir="ali-to-anc")

In [None]:
# Run bootstrap_reconcile on the "ali-to-anc" directory created by the
# previous cell, overwriting any earlier output.
# NOTE(review): the positional 2 is presumably the number of threads/slots
# to use -- confirm against the topiary docs.
topiary.pipeline.bootstrap_reconcile("ali-to-anc",2,overwrite=True)

### Look at ancestral reconstruction output

In [None]:
# Draw the tree from the bootstrap-reconcile output directory; the returned
# value is shown as the cell output.
topiary.draw.tree("ali-to-anc/05_reconcile-bootstraps")

In [None]:
# Print the reconstructed ancestor sequences. Each line read from the file
# already ends with a newline, so print is told not to add another.
anc_fasta = "ali-to-anc/03_ancestors/output/reconciled-tree_ancestors/ancestors.fasta"
with open(anc_fasta) as f:
    for line in f:
        print(line,end="")

In [None]:
# Load the ancestor data table written by the ancestor reconstruction and
# show only the rows whose "anc" column is "anc1".
# Note: this rebinds `df`, which earlier held the seed dataframe.
df = pd.read_csv("ali-to-anc/03_ancestors/output/reconciled-tree_ancestors/ancestor-data.csv")
df[df.anc == "anc1"]

In [None]:
import glob
import os

# The seed_to_alignment output directory used here previously was hard-coded
# as "seed_to_alignment_vetsAEDotS", a name that only exists on the machine
# where it was generated. Look the directory up instead and use the most
# recently modified match.
# NOTE(review): assumes seed_to_alignment writes to a directory named
# seed_to_alignment_<suffix> in the working directory -- confirm against the
# topiary docs.
seed_to_ali_csvs = sorted(glob.glob("seed_to_alignment_*/05_clean-aligned-dataframe.csv"),
                          key=os.path.getmtime)
if len(seed_to_ali_csvs) == 0:
    err = "Could not find seed_to_alignment_*/05_clean-aligned-dataframe.csv.\n"
    err += "Run the seed_to_alignment cell above first.\n"
    raise FileNotFoundError(err)

topiary.alignment_to_ancestors(seed_to_ali_csvs[-1])