In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

import kaleido
import plotly.io as pio
# Make PNG the default format for static image export through Kaleido
# (relevant to the commented-out fig.write_image(...) calls further down).
# NOTE(review): newer Plotly releases reportedly deprecate `pio.kaleido.scope`
# in favour of `pio.defaults` — confirm against the installed Plotly/Kaleido versions.
pio.kaleido.scope.default_format = "png"

## Sankey plots of object fate between learn2therm and ValidProt

The counts below were obtained by inspecting the database tables at successive stages of pruning; they were recorded on March 3, 2023 and are reproduced in the tables in this notebook. The data may subsequently be updated to reflect the state of the project if different parameters are selected.

In [2]:
# Sankey colour palette. Each base colour is declared next to its
# half-transparent ``_t`` twin (same RGB triple, alpha 0.5); the figures below
# pass the opaque colours as node colours and the ``_t`` colours as link colours.
sank_blue, sank_blue_t = 'rgb(32, 159, 223)', 'rgba(32, 159, 223, 0.5)'
sank_red, sank_red_t = 'rgb(204, 102, 119)', 'rgba(204, 102, 119, 0.5)'
sank_grey, sank_grey_t = 'rgb(221, 221, 221)', 'rgba(221, 221, 221, 0.5)'
sank_purple, sank_purple_t = 'rgb(118, 111, 159)', 'rgba(118, 111, 159, 0.5)'

In [4]:
# Taxa-pair fate diagram: learn2therm taxa pairs flow either to "No pair" or to
# "16S pair"; pairs with a 16S pair then flow to "< 20 °C diff" or "ValidProt".
# Node indices (referenced by link.source / link.target):
#   0 learn2therm, 1 16S pair, 2 No pair, 3 ValidProt, 4 < 20 °C diff
fig = go.Figure(data=[go.Sankey(
    arrangement = 'snap',
    node = dict(
      pad = 25,
      thickness = 20,
      line = dict(color = 'black', width = 0.5),
      # Percentages are relative to the 750,633 total pairs (736849 + 13784).
      # 9350 / 750633 = 1.246 %, i.e. 1.2 % at one decimal; with that value the
      # three terminal shares (98.2 + 0.6 + 1.2) add up to 100 %.
      label = ['learn2therm (750k)', '16S pair', 'No pair (98.2%)', 'ValidProt (4.4k, 0.6%)', '< 20 \N{DEGREE SIGN}C diff (1.2%)'],
      color = [sank_purple, sank_purple, sank_grey, sank_purple, sank_grey]
    ),
    link = dict(
      # Links, position by position:
      #   16S pair -> < 20 °C diff, learn2therm -> 16S pair,
      #   learn2therm -> No pair,   16S pair -> ValidProt
      source = [1, 0, 0, 1],
      target = [4, 1, 2, 3],
      value = [9350, 13784, 736849, 4434],
      # Grey links lead to discarded categories, purple links lead towards ValidProt.
      color = [sank_grey_t, sank_purple_t, sank_grey_t, sank_purple_t]
  ))])

fig.update_layout(title_text='Taxa Pairs', font_family = 'Arial', font_size=16)
fig.show()

#fig.write_image('/mnt/c/Users/Ryan/Desktop/ValidProt/taxa_pair_sankey.png', engine = 'kaleido', scale = 6, width = 1280, height = 640)

__Taxa Pairs__

Total learn2therm: 750,633 <br/>
Total ValidProt: 4,434 (0.59%)


| Origin      | Destination | Count  | Percent |
|-------------|-------------|--------|---------|
| learn2therm | No pair    | 736849 | 98.16   |
| learn2therm | 16S pair   | 13784  | 1.84    |
| 16S pairs   | < 20 C diff | 9350   | 1.25    |
| 16S pairs   | ValidProt   | 4434   | 0.59    |

In [7]:
# Taxa representation diagram: learn2therm taxa split into thermophiles and
# mesophiles, then by whether they have a 16S pair, then by whether the pair
# ends in "< 20 °C diff" or "ValidProt".
# Node indices (referenced by link.source / link.target):
#   0 learn2therm, 1 Thermophile, 2 Mesophile,
#   3 16S pair (thermophile), 4 16S pair (mesophile),
#   5 No pairs, 6 < 20 °C diff, 7 ValidProt
fig = go.Figure(data=[go.Sankey(
    arrangement = 'snap',
    node = dict(
      pad = 20,
      thickness = 20,
      line = dict(color = 'black', width = 0.5),
      # Counts in the labels follow from the link values below:
      #   learn2therm total = 285 + 4062 = 4,347  -> "4.3k"
      #   ValidProt         = 68 + 1673  = 1,741  -> "1.7k" (40.05 % of 4,347)
      #   No pairs          = 1664 + 150 = 1,814  (41.7 %)
      #   < 20 °C diff      = 725 + 67   =   792  (18.2 %)
      label = ['learn2therm (4.3k)', 'Thermophile', 'Mesophile', '16S pair', '16S pair', 'No pairs (41.7%)',
               '< 20 \N{DEGREE SIGN}C diff (18.2%)', 'ValidProt (1.7k, 40.1%)'],
      color = [sank_purple, sank_red, sank_blue, sank_red, sank_blue, sank_grey, sank_grey, sank_purple]
    ),
    link = dict(
      # Links, position by position:
      #   Mesophile -> No pairs,        16S pair (m) -> < 20 °C diff,
      #   learn2therm -> Thermophile,   learn2therm -> Mesophile,
      #   Thermophile -> 16S pair (t),  Thermophile -> No pairs,
      #   Mesophile -> 16S pair (m),    16S pair (t) -> < 20 °C diff,
      #   16S pair (t) -> ValidProt,    16S pair (m) -> ValidProt
      source = [2, 4, 0, 0, 1, 1, 2, 3, 3, 4],
      target = [5, 6, 1, 2, 3, 5, 4, 6, 7, 7],
      value = [1664, 725, 285, 4062, 135, 150, 2398, 67, 68, 1673],
      # Red = thermophile flow, blue = mesophile flow, grey = discarded.
      color = [sank_grey_t, sank_grey_t, sank_red_t, sank_blue_t, sank_red_t, sank_grey_t, sank_blue_t, sank_grey_t, 
               sank_red_t, sank_blue_t]
  ))])

fig.update_layout(title_text="Taxa Representation", font_size=16, font_family = 'Arial', font_color = 'black')
fig.show()

#fig.write_image('/mnt/c/Users/Ryan/Desktop/ValidProt/taxa_sankey.png', engine = 'kaleido', scale = 6, width = 1280, height = 640)

__Taxa__

Total learn2therm: 4,347 <br/>
Total ValidProt: 1,741 (40.05%)


| Origin      | Destination | Count  | Percent |
|-------------|-------------|--------|---------|
| learn2therm | Mesophile   | 4062   | 93.44   |
| learn2therm | Thermophile | 285    | 6.56    |
| Mesophile   | No pairs    | 1664   | 38.28   |
| Mesophile   | 16S pair    | 2398   | 55.16   |
| Thermophile | No pairs    | 150    | 3.45    |
| Thermophile | 16S pair    | 135    | 3.11    |
| 16S pair (m)| < 20 C diff | 725    | 16.68   |
| 16S pair (m)| ValidProt   | 1673   | 38.49   |
| 16S pair (t)| < 20 C diff | 67     | 1.54    |
| 16S pair (t)| ValidProt   | 68     | 1.56    |

In [6]:
# Protein-pair fate diagram: learn2therm protein pairs flow either to
# "< 20 °C diff" or to "ValidProt".
# Node indices: 0 learn2therm, 1 < 20 °C diff, 2 ValidProt
fig = go.Figure(data=[go.Sankey(
    arrangement = 'snap',
    node = dict(
      pad = 25,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      # Shares are relative to the 181,500,502 total pairs
      # (128,198,093 + 53,302,409): 70.63 % and 29.37 % respectively.
      label = ['learn2therm (181m)', '< 20 \N{DEGREE SIGN}C diff (70.6%)', 'ValidProt (53m, 29.4%)'],
      color = [sank_purple, sank_grey, sank_purple]
    ),
    link = dict(
      source = [0, 0],
      target = [1, 2],
      # The "< 20 °C diff" count is 128,198,093: together with the ValidProt
      # count it sums to the documented learn2therm total of 181,500,502.
      value = [128198093, 53302409],
      color = [sank_grey_t, sank_purple_t]
  ))])

fig.update_layout(title_text='Protein Pairs', font_family = 'Arial', font_size=16)
fig.show()

#fig.write_image('/mnt/c/Users/Ryan/Desktop/ValidProt/protein_pair_sankey.png', engine = 'kaleido', scale = 6, width = 1280, height = 640)

__Protein Pairs__

Total learn2therm: 181,500,502 <br/>
Total ValidProt: 53,302,409 (29.37%)


| Origin      | Destination | Count  | Percent |
|-------------|-------------|--------|---------|
| learn2therm | < 20 C diff | 128198093 | 70.63   |
| learn2therm | ValidProt   | 53302409 | 29.37    |

In [5]:
# Protein representation diagram. Nodes and links are declared as records and
# unpacked into the parallel lists that go.Sankey expects.

# (label, colour) per node; list position is the node index used by the links.
protein_nodes = [
    ('learn2therm (65m)', sank_purple),                    # 0
    ('Thermophile', sank_red),                             # 1
    ('Mesophile', sank_blue),                              # 2
    ('16S pair', sank_red),                                # 3 (fed by Thermophile)
    ('16S pair', sank_blue),                               # 4 (fed by Mesophile)
    ('No pairs (11.5%)', sank_grey),                       # 5
    ('< 20 \N{DEGREE SIGN}C diff (10.4%)', sank_grey),     # 6
    ('ValidProt (4.3m, 6.5%)', sank_purple),               # 7
    ('Null (71.6%)', sank_grey),                           # 8
]

# (source index, target index, protein count, link colour)
protein_links = [
    (0, 8, 46828448, sank_grey_t),   # learn2therm  -> Null
    (0, 1, 1026299, sank_red_t),     # learn2therm  -> Thermophile
    (0, 2, 17572725, sank_blue_t),   # learn2therm  -> Mesophile
    (1, 3, 446860, sank_red_t),      # Thermophile  -> 16S pair
    (1, 5, 579439, sank_grey_t),     # Thermophile  -> No pairs
    (2, 5, 6969342, sank_grey_t),    # Mesophile    -> No pairs
    (4, 6, 6494643, sank_grey_t),    # 16S pair (m) -> < 20 °C diff
    (4, 7, 4108740, sank_blue_t),    # 16S pair (m) -> ValidProt
    (3, 6, 293165, sank_grey_t),     # 16S pair (t) -> < 20 °C diff
    (3, 7, 153695, sank_red_t),      # 16S pair (t) -> ValidProt
    (2, 4, 10603383, sank_blue_t),   # Mesophile    -> 16S pair
]

node_labels, node_colors = (list(column) for column in zip(*protein_nodes))
link_sources, link_targets, link_values, link_colors = (
    list(column) for column in zip(*protein_links)
)

protein_sankey = go.Sankey(
    arrangement='snap',
    node=dict(
        pad=50,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=node_labels,
        color=node_colors,
    ),
    link=dict(
        source=link_sources,
        target=link_targets,
        value=link_values,
        color=link_colors,
    ),
)

fig = go.Figure(data=[protein_sankey])
fig.update_layout(title_text="Protein Representation", font_family='Arial', font_size=16)
fig.show()

#fig.write_image('/mnt/c/Users/Ryan/Desktop/ValidProt/protein_sankey.png', engine = 'kaleido', scale = 6, width = 1280, height = 640)

__Proteins__

Total learn2therm: 65,427,472 <br/>
Total ValidProt: 4,262,435 (6.51%)


| Origin      | Destination | Count  | Percent |
|-------------|-------------|--------|---------|
| learn2therm | Null        | 46828448 | 71.57 |
| learn2therm | Mesophile   | 17572725 | 26.86 |
| learn2therm | Thermophile | 1026299 | 1.57 |
| Mesophile   | 16S pair (m) | 10603383 | 16.21 |
| Mesophile   | No pair     | 6969342 | 10.65  |
| Thermophile | 16S pair (t) | 446860 | 0.68   |
| Thermophile | No pair     | 579439 | 0.89    |
| 16S pair (m)| ValidProt   | 4108740 | 6.28   |
| 16S pair (m)| < 20 C diff | 6494643 | 9.93   |
| 16S pair (t)| ValidProt   | 153695 | 0.23    |
| 16S pair (t)| < 20 C diff | 293165 | 0.45    |