# Actinopterygii order-level UMAP

This notebook performs UMAP on an order-level basis to construct point locations. Each
group of points is then grafted onto the scaffold constructed in the previous notebook,
`130 Fish species tree scaffold.ipynb`.

The first iteration of this tried to use the tree to build a monophyletic grouping by
finding the MRCA of all the genera in an order. This does not seem to work correctly, so
we're going to use the main distance matrix constructed in 104/105 and just grab taxa
from that. 

In [None]:
# Packages.
import pandas as pd
import numpy as np
import umap
import plotly.express as px

In [None]:
# Read the two inputs made earlier: the main taxon-by-taxon distance matrix and
# the taxonomy list. In both files the first CSV column becomes the index.
dist_matrix = pd.read_csv(
    "output/Actinopterygii_tree_distance_matrix_py.csv",
    index_col=0,
)
taxonomy_list = pd.read_csv(
    "output/Actinopterygii_genus_order_family_taxon.csv",
    index_col=0,
)

In [None]:
# Key the taxonomy list by taxon name. The 'taxon' column itself is kept, since
# later cells still select on it; only the index (and its name) is replaced.
taxonomy_list.index = pd.Index(taxonomy_list['taxon'], name='taxon')

In [None]:
# Collect every distinct fish order, in the order each one first appears in the
# taxonomy list. A Misof-style tree of the orders, arranged phylogenetically from
# most basal to most derived, would be a better ordering at some point; first
# appearance is good enough for now.
fish_orders = pd.unique(taxonomy_list['order'])

In [None]:
# Let's make an output dir for the order-level UMAP results.
import pathlib

umap_by_order_output_dir = pathlib.Path('output/umap_by_order')

# parents=True also creates 'output/' when it is missing, so this cell does not
# fail with FileNotFoundError on a fresh checkout; exist_ok=True keeps re-runs quiet.
umap_by_order_output_dir.mkdir(parents=True, exist_ok=True)

## UMAP for each order

We need to run UMAP for each order individually. Let's make a list of all the orders, then make a function that runs UMAP on one order. We'll then loop over all the orders.

In [None]:
# Tabulate how many taxa each order contains and print the table, largest first.
order_counts = (
    taxonomy_list['order']
    .value_counts()
    .rename_axis('order')
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print(order_counts)

In [None]:
# Pufferfishes (Tetraodontiformes) is in the middle, so we'll use that as a test.

current_order = 'Tetraodontiformes'  # Change this to process a different order

def do_umap_for_order(current_order, random_state=None):
    """Run a 2D UMAP on one order's slice of the distance matrix and save it.

    The taxa belonging to ``current_order`` are looked up in the global
    ``taxonomy_list``, the matching rows and columns are cut out of the global
    ``dist_matrix``, and UMAP is fitted on that sub-matrix as a precomputed
    metric. The resulting x/y coordinates, together with each taxon's order and
    family, are written to ``<current_order>_2D_UMAP.csv`` inside
    ``umap_by_order_output_dir``.

    Args:
        current_order: Name of the order to process, as it appears in the
            'order' column of ``taxonomy_list``.
        random_state: Integer seed handed to UMAP. When left as None a fresh
            seed is drawn and printed, so the run can be repeated later by
            passing that value back in.

    Returns:
        None. Orders that leave fewer than 2 neighbours available (2 taxa or
        fewer) are skipped and no file is written for them.
    """
    print(f"Processing order: {current_order}")

    # Get the list of taxa in this order.
    taxa_in_order = taxonomy_list[taxonomy_list['order'] == current_order]['taxon'].tolist()
    print(f"Number of taxa in {current_order}: {len(taxa_in_order)}")

    # Now filter the distance matrix to only include those taxa.
    filtered_matrix = dist_matrix.loc[taxa_in_order, taxa_in_order]
    print(f"Filtered matrix shape for {current_order}: {filtered_matrix.shape}")

    # Decide whether the order is big enough BEFORE announcing the UMAP run, so the
    # skip message is not appended to a dangling "Running UMAP..." line.
    n_neighbors = min(15, len(taxa_in_order) - 1)
    if n_neighbors < 2:
        print(f"Not enough genera ({len(taxa_in_order)}) for UMAP. Skipping...")
        return

    # Without an explicit seed the model's random_state is None, which makes the
    # "random state" printout below useless. Draw a concrete seed instead.
    # NOTE(review): umap-learn's reproducibility notes say a fixed seed may disable
    # its multi-threading; confirm that the slowdown is acceptable.
    if random_state is None:
        random_state = int(np.random.randint(0, 2**31 - 1))

    # Now run UMAP on this distance matrix.
    print("Running UMAP...", end='', flush=True)
    print(f"Using n_neighbors: {n_neighbors}...", end='', flush=True)
    umap_model = umap.UMAP(n_components=2,
                           n_neighbors=n_neighbors,
                           min_dist=1.0,
                           metric='precomputed',
                           random_state=random_state)
    df_umap = umap_model.fit_transform(filtered_matrix)
    print("done.")

    # Dump the random state for reproducibility.
    print(f"Random state: {umap_model.random_state}")

    df_umap = pd.DataFrame(df_umap, index=filtered_matrix.index, columns=list('xy'))

    # Let's add the order and family information back in, matching rows on the
    # taxon index of both dataframes.
    df_umap = df_umap.merge(taxonomy_list[['order', 'family']], left_index=True, right_index=True)
    # Name the index on the result rather than renaming the global taxonomy_list's
    # index, so the CSV gets a 'taxon' header without a hidden side effect.
    df_umap.index.name = 'taxon'

    output_path = umap_by_order_output_dir / f"{current_order}_2D_UMAP.csv"

    df_umap.to_csv(output_path)

do_umap_for_order(current_order)

## Loop over all orders

Works, now run it on everything.

In [None]:
# Run the per-order UMAP for every order found in the taxonomy list. Each call
# writes its own CSV; orders that are too small are skipped inside the function.
# Note that this rebinds the notebook-level `current_order` on every iteration.
for current_order in fish_orders:
    do_umap_for_order(current_order)

## Plotting

What do these look like? Do one at a time, selecting what we want.

In [None]:
current_order = 'Tetraodontiformes'  # Change this to visualize a different order

# Read back the coordinates that were saved for this order.
df_umap = pd.read_csv(
    umap_by_order_output_dir / f"{current_order}_2D_UMAP.csv",
    index_col=0,
)

# Scatter plot with one marker per taxon, coloured by family; hovering shows the
# taxon name taken from the index.
fig = px.scatter(df_umap, x='x', y='y', color='family', hover_name=df_umap.index)
fig.update_layout(
    title=f"2D UMAP of {current_order} Genera",
    xaxis_title="UMAP1",
    yaxis_title="UMAP2",
    height=800,
    width=800,
)
fig.show()

In [None]:
current_order = 'Siluriformes'  # Change this to visualize a different order

# Read back the coordinates that were saved for this order.
df_umap = pd.read_csv(
    umap_by_order_output_dir / f"{current_order}_2D_UMAP.csv",
    index_col=0,
)

# Scatter plot with one marker per taxon, coloured by family; hovering shows the
# taxon name taken from the index.
fig = px.scatter(df_umap, x='x', y='y', color='family', hover_name=df_umap.index)
fig.update_layout(
    title=f"2D UMAP of {current_order} Genera",
    xaxis_title="UMAP1",
    yaxis_title="UMAP2",
    height=800,
    width=800,
)
fig.show()

## Overlapping points

For each order, let's see how many points overlap in the UMAP plot.


In [None]:
# Load in the CSV for each order and count the number of coincident points, i.e.
# points whose (x, y) coordinates are identical to those of at least one other point.

for current_order in fish_orders:
    order_path = umap_by_order_output_dir / f"{current_order}_2D_UMAP.csv"
    # Only the read can legitimately hit a missing file (orders skipped by the
    # UMAP step have no CSV), so keep the try body down to that one call.
    try:
        df_umap = pd.read_csv(order_path, index_col=0)
    except FileNotFoundError:
        print(f"File not found for order: {current_order}")
        continue

    # keep=False flags every member of a duplicated (x, y) pair, so this counts
    # points. The previous groupby-based figure counted shared locations, which
    # under-reports against the "out of N total points" wording below.
    num_coincident = int(df_umap.duplicated(subset=['x', 'y'], keep=False).sum())
    total_points = len(df_umap)
    print(f"{current_order}: {num_coincident} coincident points out of {total_points} total points.")
