# Ray-finned Fish species tree statistics and scaffold

Dimensionality reduction, clustering, etc. on the full fish dataset are very messy. The
dataset does not appear to have the same issues as the insect dataset (very early-diverging
lineages, as in Archaeognatha), but it is still a bit of a mess (as expected).

Here we sub-sample on an order-by-order basis to make an order-level scaffold. We will
then run MDS (or whatever) in 2D on each group, then graft those onto the order-level
scaffold tree.

In [None]:
import ete3
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import MDS

In [None]:
# Species distance matrix; the first CSV column becomes the row index.
# It lives in its own cell because the file takes a while to read and never
# changes, so re-running the cells below should not trigger a reload.
distance_matrix_csv = "output/Actinopterygii_tree_distance_matrix_py.csv"
species_distance_matrix = pd.read_csv(distance_matrix_csv, index_col=0)


In [None]:

# Taxonomy lookup table; later cells read its 'order', 'family' and 'taxon' columns.
taxonomy_csv = "output/Actinopterygii_genus_order_family_taxon.csv"
genus_order_family_taxon_lookup = pd.read_csv(taxonomy_csv)

# Newick tree with species as the leaves and orders as named internal nodes.
# NOTE(review): format=1 is presumably needed so ete3 keeps the internal node
# names -- confirm against the ete3 newick format table.
species_tree_with_orders = ete3.Tree(
    "output/Actinopterygii_species_with_order.nwk",
    format=1,
)

In [None]:
# Sanity check: the lookup table is expected to match the tree, so report the
# size of each for a side-by-side comparison.
tree_leaves = species_tree_with_orders.get_leaves()
num_leaves = len(tree_leaves)
taxa_in_lookup_table = len(genus_order_family_taxon_lookup)

print(f"{num_leaves} leaves in the genus tree.")
print(f"{taxa_in_lookup_table} taxa in the lookup table.")

In [None]:
# Distinct values of the lookup table's 'order' column, as a plain Python list.
unique_order_values = genus_order_family_taxon_lookup["order"].unique()
fish_orders = unique_order_values.tolist()
n_fish_orders = len(fish_orders)
print(f"{n_fish_orders} unique fish orders in the lookup table.")

## Some basic counts

How many species and families? Which families have the most genera? Which orders have the most families, etc?

In [None]:
# Every distinct order in the lookup table (reused by the sampling cell below).
all_orders = genus_order_family_taxon_lookup['order'].unique()

# Map each order to the set of families listed under it. A set is used because
# the same family shows up on many rows of the lookup table.
order_to_families = {}
order_family_pairs = zip(
    genus_order_family_taxon_lookup['order'],
    genus_order_family_taxon_lookup['family'],
)
for order, family in order_family_pairs:
    order_to_families.setdefault(order, set()).add(family)

# One line per order: the order name, then its number of distinct families.
for order, families in order_to_families.items():
    print(f"{order:<20}  {len(families):<5}")


In [None]:
# Number of lookup-table rows per order. value_counts() returns the tallies
# sorted from most to least frequent, so the first ten are the largest orders.
order_column = genus_order_family_taxon_lookup['order']
order_counts = order_column.value_counts()
print(order_counts.iloc[:10])


## Random sampling

Let's grab a random taxon from every order and make a distance matrix. Then we'll run MDS on this and see what it looks like in a sphere with the tree overlaid on it.

In [None]:
# Pick one taxon per order at random from the lookup table. The draw is
# unseeded, so every run of this cell gives a different sample.
random_taxa = []
for order_name in all_orders:
    in_this_order = genus_order_family_taxon_lookup['order'] == order_name
    candidate_taxa = genus_order_family_taxon_lookup.loc[in_this_order, 'taxon'].values
    if len(candidate_taxa) == 0:
        # Nothing to sample for this order value.
        continue
    random_taxa.append(np.random.choice(candidate_taxa))

# Distances among the sampled taxa only (rows and columns in sampling order).
random_taxa_distance_matrix = species_distance_matrix.loc[random_taxa, random_taxa]

# Three-component MDS on the precomputed distances.
mds = MDS(
    n_components=3,
    max_iter=4000,
    eps=10**-6,
    dissimilarity='precomputed',
    n_jobs=-1,
    verbose=10,
)

# fit() returns the fitted estimator; the coordinates are in its embedding_.
random_taxa_mds_coords = mds.fit(random_taxa_distance_matrix)
random_taxa_mds_df = pd.DataFrame(
    random_taxa_mds_coords.embedding_,
    index=pd.Index(random_taxa, name='taxon'),
    columns=['x', 'y', 'z'],
)


In [None]:
# Add a column for the order.
random_taxa_mds_df['order'] = random_taxa_mds_df.index.map(lambda g: genus_order_family_taxon_lookup[genus_order_family_taxon_lookup['taxon'] == g]['order'].values[0])

# Reorder the columns so order is first.
random_taxa_mds_df = random_taxa_mds_df[['order', 'x', 'y', 'z']]

# Save it out as a CSV file so we can run Wandrille's script on it to put in branches.

random_taxa_mds_df.to_csv("output/Actinopterygii_random_taxon_per_order_mds_coords.csv", index=False)

%run ./integrate_tree_to_XYZ/integrate_tree_to_XYZ.py -i output/Actinopterygii_random_taxon_per_order_mds_coords.csv -t "output/Actinopterygii_order_level.nwk" -o "output/Actinopterygii_random_taxon_per_order_mds_coords" --ignore-missing --use-z-from-file

random_taxa_mds_branches = pd.read_csv("output/Actinopterygii_random_taxon_per_order_mds_coords.branches.csv")

Xb = []
Yb = []
Zb = []

for i,row in random_taxa_mds_branches.iterrows():
    Xb += [ row.x0 , row.x1 , None ]
    Yb += [ row.y0 , row.y1 , None ]
    Zb += [ row.z0 , row.z1 , None ]

# Plot it and use the order as the color and label.
fig = px.scatter_3d(random_taxa_mds_df, x='x', y='y', z='z', color='order', text='order')
fig.add_trace(go.Scatter3d(x=Xb, y=Yb, z=Zb, mode='lines'))
fig.update_layout(height=800, width=800)
# Make the backgroud planes invisible.
fig.update_scenes(xaxis_visible=False, yaxis_visible=False, zaxis_visible=False)
fig.update_traces(marker=dict(size=5), textposition='top center')

## Sphere-izing the points

In the end, these points should all be on the surface of a sphere with radius 1.0.

In [None]:
# Sphere-ize the points. We want them all to be on the surface of a sphere with radius 1.0.
radii = np.sqrt(random_taxa_mds_df['x']**2 + random_taxa_mds_df['y']**2 + random_taxa_mds_df['z']**2)
random_taxa_mds_df['x'] = random_taxa_mds_df['x'] / radii
random_taxa_mds_df['y'] = random_taxa_mds_df['y'] / radii
random_taxa_mds_df['z'] = random_taxa_mds_df['z'] / radii

# Save it out as a CSV file so we can run Wandrille's script on it to put in branches.
random_taxa_mds_df.to_csv("output/Actinopterygii_random_taxon_per_order_mds_norm_on_sphere.csv", index=False)

%run ./integrate_tree_to_XYZ/integrate_tree_to_XYZ.py -i output/Actinopterygii_random_taxon_per_order_mds_norm_on_sphere.csv -t "output/Actinopterygii_order_level.nwk" -o "output/Actinopterygii_random_taxon_per_order_mds_norm_on_sphere" --ignore-missing --use-z-from-file

random_taxa_mds_branches = pd.read_csv("output/Actinopterygii_random_taxon_per_order_mds_norm_on_sphere.branches.csv")
Xb = []
Yb = []
Zb = []

for i,row in random_taxa_mds_branches.iterrows():
    Xb += [ row.x0 , row.x1 , None ]
    Yb += [ row.y0 , row.y1 , None ]
    Zb += [ row.z0 , row.z1 , None ]

# Plot it and use the order as the color and label.
fig = px.scatter_3d(random_taxa_mds_df, x='x', y='y', z='z', color='order', text='order')
fig.add_trace(go.Scatter3d(x=Xb, y=Yb, z=Zb, mode='lines'))
fig.update_layout(height=800, width=800)
# Make the backgroud planes invisible.
fig.update_scenes(xaxis_visible=False, yaxis_visible=False, zaxis_visible=False)
fig.update_traces(marker=dict(size=5), textposition='top center')

In [None]:
# Same plot again, but with each dot sized by the number of species in its order.
# Note: this adds a 'species_count' column to random_taxa_mds_df in place.
species_per_point = random_taxa_mds_df['order'].map(order_counts)
random_taxa_mds_df['species_count'] = species_per_point

fig2 = px.scatter_3d(
    random_taxa_mds_df,
    x='x',
    y='y',
    z='z',
    color='order',
    text='order',
    size='species_count',
    size_max=40,
)
# Overlay the branch segments built in the previous cell.
branch_trace = go.Scatter3d(x=Xb, y=Yb, z=Zb, mode='lines')
fig2.add_trace(branch_trace)
fig2.update_layout(height=800, width=800)
# Hide the axes and their background planes.
fig2.update_scenes(xaxis_visible=False, yaxis_visible=False, zaxis_visible=False)
fig2.update_traces(textposition='top center')
fig2.show()

# Extra-relaxing fish

Many fish are very very tense. So we add an extra layer of relaxation here.

OK not really. 

In [None]:
import os
relaxed_order_scaffold_dir = "output/order-scaffold-relaxation"
os.makedirs(relaxed_order_scaffold_dir, exist_ok=True)

scatterplot_cmd = "./spherical-scatterplot-relaxation/spherical_scatterplot_relaxation.py "

# Output filename from above.
scatterplot_cmd += "-i output/Actinopterygii_random_taxon_per_order_mds_norm_on_sphere.csv "
scatterplot_cmd += f"-o {relaxed_order_scaffold_dir}/order-scaffold-relaxed "
scatterplot_cmd += "-n 500 -n 100 -w 5 -l 0.05"

%run $scatterplot_cmd

In [None]:
# Use a slider to step through the saved rounds of the relaxation.
fig = go.Figure()
rounds_to_plot = list(range(5, 101, 5))  # rounds 5, 10, ..., 100

# Species-count range over the sampled orders. It is computed once so the
# count -> marker-size scaling is the same in every round.
_counts = random_taxa_mds_df['order'].map(order_counts)
_min, _max = _counts.min(), _counts.max()
_count_span = _max - _min

# Add one trace per slider step.
for trace_index, round_number in enumerate(rounds_to_plot):
    filename = f"{relaxed_order_scaffold_dir}/order-scaffold-relaxed_round{round_number}.csv"

    # Point positions saved for this round.
    tmp = pd.read_csv(filename)

    # Size each marker from the order in the same row of this round's file, so
    # sizes stay matched to points whatever the file's row order. Counts are
    # scaled linearly to 3-20; if all counts are equal (or the range is
    # undefined) a mid-range size is used instead of dividing by zero.
    round_counts = tmp['order'].map(order_counts)
    if _count_span > 0:
        marker_sizes = ((round_counts - _min) / _count_span * 17 + 3).values
    else:
        marker_sizes = [11.5] * len(tmp)

    fig.add_trace(
        go.Scatter3d(
            x=tmp['x'],
            y=tmp['y'],
            z=tmp['z'],
            mode='markers+text',
            marker=dict(
                size=marker_sizes,
                color=tmp['order'].astype('category').cat.codes,
                colorscale='Viridis',
                opacity=0.8,
            ),
            text=tmp['order'],
            textposition='top center',
            textfont=dict(size=9),
            hovertext=tmp['order'],
            name=f'Round {round_number}',
            visible=(trace_index == 0),  # Only the first trace is visible initially
        )
    )

# Create the slider steps: step i shows trace i and hides every other trace.
steps = []
for trace_index, round_number in enumerate(rounds_to_plot):
    visibility = [False] * len(rounds_to_plot)
    visibility[trace_index] = True
    steps.append(dict(
        method="update",
        # Label the step with its round number so the slider reads
        # "Round: 5" rather than Plotly's default "step-0".
        label=str(round_number),
        args=[{"visible": visibility},
              {"title": f"Order scaffold relaxed - Round {round_number}"}],  # layout attribute
    ))

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Round: "},
    pad={"t": 50},
    steps=steps
)]
fig.update_layout(
    sliders=sliders,
    width=800,
    height=800,
    title="Order scaffold relaxed on Sphere"
)
fig.show()

Now we want to run relaxation slightly differently. We want to specify weights for each order so that orders with larger numbers of taxa have more "repulsion", so that there is more room for the points in that order to be grafted onto the sphere.

In [None]:
# Make a new dataframe based on the
# output/random_taxa_mds_coords_norm_on_sphere.csv file, but with an additional
# "weight" column tghat is the number of species in each order. This will be
# used to weight the points in the relaxation process so that orders with more
# species have more influence on the final layout.

random_taxa_mds_df_with_weights = random_taxa_mds_df.copy()
random_taxa_mds_df_with_weights['weight'] = random_taxa_mds_df_with_weights['order'].map(order_counts)

# Save it out as a CSV file so we can run Wandrille's script on it to put in branches.
random_taxa_mds_df_with_weights.to_csv("output/random_taxa_mds_coords_norm_on_sphere_with_weights.csv", index=False)

relaxed_order_scaffold_weighted_dir = "output/order-scaffold-relaxation-weighted"
os.makedirs(relaxed_order_scaffold_weighted_dir, exist_ok=True)

scatterplot_cmd = "./spherical-scatterplot-relaxation/spherical_scatterplot_relaxation.py "
scatterplot_cmd += "-i output/random_taxa_mds_coords_norm_on_sphere_with_weights.csv "
scatterplot_cmd += f"-o {relaxed_order_scaffold_weighted_dir}/order-scaffold-weighted-relaxed "
scatterplot_cmd += "-n 500 -n 100 -w 5 -l 0.05 --weight-column weight"

%run $scatterplot_cmd


In [None]:
# Plot the weighted relaxation for comparison with the unweighted one.

# Use a slider to step through the saved rounds of the relaxation.
fig = go.Figure()
rounds_to_plot = list(range(5, 101, 5))  # rounds 5, 10, ..., 100

# Species-count range over the sampled orders. It is computed once so the
# count -> marker-size scaling is the same in every round.
_counts = random_taxa_mds_df['order'].map(order_counts)
_min, _max = _counts.min(), _counts.max()
_count_span = _max - _min

# Add one trace per slider step.
for trace_index, round_number in enumerate(rounds_to_plot):
    filename = (
        f"{relaxed_order_scaffold_weighted_dir}"
        f"/order-scaffold-weighted-relaxed_round{round_number}.csv"
    )

    # Point positions saved for this round.
    tmp = pd.read_csv(filename)

    # Size each marker from the order in the same row of this round's file, so
    # sizes stay matched to points whatever the file's row order. Counts are
    # scaled linearly to 3-20; if all counts are equal (or the range is
    # undefined) a mid-range size is used instead of dividing by zero.
    round_counts = tmp['order'].map(order_counts)
    if _count_span > 0:
        marker_sizes = ((round_counts - _min) / _count_span * 17 + 3).values
    else:
        marker_sizes = [11.5] * len(tmp)

    fig.add_trace(
        go.Scatter3d(
            x=tmp['x'],
            y=tmp['y'],
            z=tmp['z'],
            mode='markers+text',
            marker=dict(
                size=marker_sizes,
                color=tmp['order'].astype('category').cat.codes,
                colorscale='Viridis',
                opacity=0.8,
            ),
            text=tmp['order'],
            textposition='top center',
            textfont=dict(size=9),
            hovertext=tmp['order'],
            name=f'Round {round_number}',
            visible=(trace_index == 0),  # Only the first trace is visible initially
        )
    )

# Create the slider steps: step i shows trace i and hides every other trace.
steps = []
for trace_index, round_number in enumerate(rounds_to_plot):
    visibility = [False] * len(rounds_to_plot)
    visibility[trace_index] = True
    steps.append(dict(
        method="update",
        # Label the step with its round number so the slider reads
        # "Round: 5" rather than Plotly's default "step-0".
        label=str(round_number),
        args=[{"visible": visibility},
              {"title": f"Order scaffold relaxed weighted - Round {round_number}"}],  # layout attribute
    ))

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Round: "},
    pad={"t": 50},
    steps=steps
)]
fig.update_layout(
    sliders=sliders,
    width=800,
    height=800,
    title="Order scaffold relaxed weighted on Sphere"
)
fig.show()