In [None]:
import json
import pathlib
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, SVG
from lxml import etree
from lxml.builder import E

sys.path.append(str(pathlib.Path().resolve().parents[1]))
from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR

In [None]:
# Instantiate the project's database helper. In this notebook it is only used
# through con.get_paper_name(long=...) to turn long building-block identifiers
# into the names used for labels; the data itself is read from the CSV export.
con = SynFermDatabaseConnection()  # we use this to conveniently get the paper names, not for data

In [None]:
# Load the curated data set. The version tag selects which CSV export is read.
data_version = "2024-04-18_38586records"
dataset_path = DATA_DIR / "curated_data" / f"synferm_dataset_{data_version}.csv"
df = pd.read_csv(dataset_path)

# number of records that were loaded
len(df)

In [None]:
# quick look at the first rows of the loaded data set
df.head()

In [None]:
# number of records per major-product label, listed in label order
major_counts = df["major_A-C"].value_counts()
major_counts.sort_index()

In [None]:
def get_class(series):
    """Return the CSS class name for the most frequent major-product label.

    Parameters
    ----------
    series : pandas.Series
        Major-product labels of the records behind one heatmap square.
        The recognised labels are "A", "B", "C" and "no_product".

    Returns
    -------
    str
        "major-A", "major-B", "major-C" or "major-none".

    Raises
    ------
    ValueError
        If the series has no non-missing values, or if the most frequent
        label is not one of the recognised labels. Failing here is clearer
        than silently returning None.
    """
    css_class_by_label = {
        "A": "major-A",
        "B": "major-B",
        "C": "major-C",
        "no_product": "major-none",
    }

    counts = series.value_counts()
    if counts.empty:
        raise ValueError("cannot determine the major product: the series has no non-missing values")

    # Break ties explicitly (the smallest label wins) instead of relying on
    # the order in which value_counts happens to list equal counts.
    tied_labels = counts[counts == counts.max()].index
    most_frequent_value = min(tied_labels)

    try:
        return css_class_by_label[most_frequent_value]
    except KeyError:
        raise ValueError(f"unexpected major-product label: {most_frequent_value!r}") from None

In [None]:
# Building-block type shown on each heatmap axis. Only the first letter of
# each name is used, to build the "<letter>_long" column name looked up in df.
x_axis = "Monomer"
y_axis = "Initiator"

In [None]:
# Sorted unique identifiers along each axis; the column is named after the
# first letter of the axis name followed by "_long".
x_column, y_column = (f"{axis_name[0]}_long" for axis_name in (x_axis, y_axis))
x_index = np.sort(df[x_column].unique())
y_index = np.sort(df[y_column].unique())

In [None]:
# Build the major-product heatmap as an SVG document: one square per (x, y)
# building-block pair that has at least one record, styled through the CSS
# class that get_class() derives from that pair's "major_A-C" values.

# some settings for the plot
rect_x = 20  # width of one heatmap square
rect_y = 20  # height of one heatmap square
text_len = 30  # extra space added to each dimension for the axis labels

x_total = len(x_index) * rect_x + text_len
y_total = len(y_index) * rect_y + text_len

# set root
svg = E.svg(viewBox=f"0 0 {x_total} {y_total}")

# set internal CSS styles
style = E.style("""
.major-A {
  fill: #5790fc;
  stroke: #729ef5;
}
.major-B {
  fill: #f89c20;
  stroke: #f6b45e;
}
.major-C {
  fill: #adad7d;
  stroke: #aaaaaa;
}
.major-none {
  fill: #e42536;
  stroke: #de4e5b;
}
.axes {
  stroke: black; 
}
.axes-labels {
  fill: black;
  font-family: Helvetica, sans-serif;
  font-size: 12px;
  font-weight: bold;
}
""")
svg.append(style)

# no axis lines are drawn, so nothing in this SVG uses the `.axes` rule above

# set axes labels
axes_labels = E.g()
axes_labels.set("class", "axes-labels")
# The x labels sit in a text element rotated by -90 degrees: inside it, a
# tspan's `y` picks the column and its `x` the position along the rotated line.
x_labels = E.text(transform=f"translate({text_len + ((rect_x - 12) / 2)} {2 * y_total}) rotate(-90)")
y_labels = E.text(x="0", y=str(rect_y), transform=f"translate(0 {(rect_y - 12) / 2})")  # translate to have the label in the middle of the box, not end

# x-axis labels
for i_x, x_label in enumerate(x_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=x_label), y=str((i_x + 0.5) * rect_x), x=str(y_total))
    x_labels.append(tspan)

# y-axis labels
for i_y, y_label in enumerate(y_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=y_label), x="0", y=str((i_y + 0.5) * rect_y))
    y_labels.append(tspan)

# Append the tspan elements
axes_labels.append(x_labels)
axes_labels.append(y_labels)
svg.append(axes_labels)

# set heatmap squares
squares = E.g()
squares.set("class", "heatmap-squares")

x_col = f"{x_axis[0]}_long"
y_col = f"{y_axis[0]}_long"
# position of every label along its axis, used to place the squares
x_position = {label: position for position, label in enumerate(x_index)}
y_position = {label: position for position, label in enumerate(y_index)}

# One pass over the data instead of filtering the whole frame once per pair:
# groupby only yields the pairs that have records, sorted by (x, y), which is
# the order in which the sorted axis indices would be visited.
for (x, y), outcomes in df.groupby([x_col, y_col])["major_A-C"]:
    offset_x = rect_x * x_position[x] + text_len
    offset_y = rect_y * y_position[y]
    square = E.rect(x=str(offset_x), y=str(offset_y), width=str(rect_x), height=str(rect_y), id=f"{x}_{y}")
    square.set("class", get_class(outcomes))
    squares.append(square)
svg.append(squares)

# render to string
s = etree.tostring(svg, pretty_print=True)

In [None]:
# show the serialized SVG inline in the notebook
display(SVG(s))

In [None]:
# Write the heatmap to the current user's Desktop folder rather than to one
# specific user's home directory, so the cell also runs on other machines.
major_svg_path = pathlib.Path.home() / "Desktop" / f"sf_heatmap_{data_version.split('_')[0]}_majorProduct_paper-names.svg"
with open(major_svg_path, "wb") as f:
    f.write(etree.tostring(svg, pretty_print=True))

In [None]:
# we will need a JSON dict to relate identifiers to data points / SVG images
x_col = f"{x_axis[0]}_long"
y_col = f"{y_axis[0]}_long"

# Every (x, y) pair gets an entry, created in x-major / y-minor order, so
# pairs without any record are present in the mapping as well.
# NOTE(review): for pairs without records "major" is a dict of zero counts,
# while for populated pairs it is a list of labels -- confirm which shape the
# consumer of the JSON file expects.
relations = {
    f"{x}_{y}": {
        "filenames": [],
        "major": {"A": 0, "B": 0, "C": 0, "none": 0},
        "terminator_paper_names": [],
    }
    for x in x_index
    for y in y_index
}

# Fill in the populated pairs with a single pass over the data, instead of
# filtering the whole frame once per pair and iterating its rows three times.
for (x, y), data in df.groupby([x_col, y_col]):
    relations[f"{x}_{y}"] = {
        "filenames": [
            f"product_images/{initiator}_{monomer}_{terminator}.svg"
            for initiator, monomer, terminator in zip(data["I_long"], data["M_long"], data["T_long"])
        ],
        "major": data["major_A-C"].tolist(),
        "terminator_paper_names": [con.get_paper_name(long=terminator) for terminator in data["T_long"]],
    }
            

In [None]:
# Preview the mapping instead of dumping every entry into the notebook output.
print(f"{len(relations)} identifiers in total")
for identifier in list(relations)[:3]:
    print(identifier, relations[identifier])

In [None]:
# Write the mapping next to the SVG files, in the current user's Desktop
# folder rather than in one specific user's home directory.
relations_path = pathlib.Path.home() / "Desktop" / "svg-relations.json"
with open(relations_path, "w") as f:
    json.dump(relations, f)

In [None]:
# now we produce the same heatmap, but for binary_A instead of major_A-C
def get_class(series):
    """Return the CSS class for the most frequent binary outcome in ``series``.

    The first entry of ``series.value_counts()`` decides: 1 gives "success",
    0 gives "fail", and any other value gives None.

    This definition replaces the earlier ``get_class`` for every cell that
    runs after it.

    NOTE(review): unlike the major-product variant, the series is not sorted
    before counting, so an exact tie goes to whichever value value_counts
    lists first -- confirm that this is acceptable.
    """
    css_class_by_outcome = {1: "success", 0: "fail"}
    outcome_counts = series.value_counts()
    return css_class_by_outcome.get(outcome_counts.index[0])

In [None]:
x_axis = "Monomer"
y_axis = "Initiator"

x_index = np.sort(df[f"{x_axis[0]}_long"].unique())
y_index = np.sort(df[f"{y_axis[0]}_long"].unique())

# some settings for the plot
rect_x = 20
rect_y = 20
text_len = 35

x_total = len(x_index) * rect_x + text_len
y_total = len(y_index) * rect_y + text_len

# set root
svg = E.svg(viewBox=f"0 0 {x_total} {y_total}")

# set internal CSS styles
style = E.style("""
.success {
  fill: #5790fc;
  stroke: #729ef5;
}
.fail {
  fill: #e42536;
  stroke: #de4e5b;
}

.axes-labels {
  fill: black;
  font-family: Helvetica, sans-serif;
  font-size: 16px;
  font-weight: normal;
}
""")
svg.append(style)



# set axes labels
axes_labels = E.g()
axes_labels.set("class", "axes-labels")
x_labels = E.text(transform=f"translate({text_len + ((rect_x - 12) / 2)} {2 * y_total}) rotate(-90)")
y_labels = E.text(x="0", y=str(rect_y), transform=f"translate(0 {(rect_y - 12) / 2})")  # translate to have the label in the middle of the box, not end

# x-axis labels
for i_x, x_label in enumerate(x_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=x_label), y=str((i_x + 0.5) * rect_x), x=str(y_total))
    x_labels.append(tspan)

# y-axis labels
for i_y, y_label in enumerate(y_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=y_label), x="0", y=str((i_y + 0.5) * rect_y))
    y_labels.append(tspan)

# Append the tspan elements
axes_labels.append(x_labels)
axes_labels.append(y_labels)
svg.append(axes_labels)

# set heatmap squares
squares = E.g()

for i_x, x in enumerate(x_index):
    for i_y, y in enumerate(y_index):
        data = df.loc[(df[f"{x_axis[0]}_long"] == x) & (df[f"{y_axis[0]}_long"] == y)]
        if len(data) > 0:
            offset_x = rect_x * i_x + text_len
            offset_y = rect_y * i_y
            square = E.rect(x=str(offset_x), y=str(offset_y), width=str(rect_x), height=str(rect_y), id=f"{x}_{y}")
            square.set("class", get_class(data["binary_A"]))
            squares.append(square)
svg.append(squares)

# render to string
s = etree.tostring(svg, pretty_print=True)

In [None]:
display(SVG(s))

In [None]:
with open(f"/Users/julian/Desktop/sf_heatmap_{data_version.split('_')[0]}_binary-A_paper-names.svg", "wb") as f:
    f.write(etree.tostring(svg, pretty_print=True))

In [None]:
# same for B

x_axis = "Monomer"
y_axis = "Initiator"

x_index = np.sort(df[f"{x_axis[0]}_long"].unique())
y_index = np.sort(df[f"{y_axis[0]}_long"].unique())

# some settings for the plot
rect_x = 20
rect_y = 20
text_len = 35

x_total = len(x_index) * rect_x + text_len
y_total = len(y_index) * rect_y + text_len

# set root
svg = E.svg(viewBox=f"0 0 {x_total} {y_total}")

# set internal CSS styles
style = E.style("""
.success {
  fill: #f89c20;
  stroke: #f6b45e;
}
.fail {
  fill: #e42536;
  stroke: #de4e5b;
}

.axes-labels {
  fill: black;
  font-family: Helvetica, sans-serif;
  font-size: 16px;
  font-weight: normal;
}
""")
svg.append(style)



# set axes labels
axes_labels = E.g()
axes_labels.set("class", "axes-labels")
x_labels = E.text(transform=f"translate({text_len + ((rect_x - 12) / 2)} {2 * y_total}) rotate(-90)")
y_labels = E.text(x="0", y=str(rect_y), transform=f"translate(0 {(rect_y - 12) / 2})")  # translate to have the label in the middle of the box, not end

# x-axis labels
for i_x, x_label in enumerate(x_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=x_label), y=str((i_x + 0.5) * rect_x), x=str(y_total))
    x_labels.append(tspan)

# y-axis labels
for i_y, y_label in enumerate(y_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=y_label), x="0", y=str((i_y + 0.5) * rect_y))
    y_labels.append(tspan)

# Append the tspan elements
axes_labels.append(x_labels)
axes_labels.append(y_labels)
svg.append(axes_labels)

# set heatmap squares
squares = E.g()

for i_x, x in enumerate(x_index):
    for i_y, y in enumerate(y_index):
        data = df.loc[(df[f"{x_axis[0]}_long"] == x) & (df[f"{y_axis[0]}_long"] == y)]
        if len(data) > 0:
            offset_x = rect_x * i_x + text_len
            offset_y = rect_y * i_y
            square = E.rect(x=str(offset_x), y=str(offset_y), width=str(rect_x), height=str(rect_y), id=f"{x}_{y}")
            square.set("class", get_class(data["binary_B"]))
            squares.append(square)
svg.append(squares)

# render to string
s = etree.tostring(svg, pretty_print=True)


with open(f"/Users/julian/Desktop/sf_heatmap_{data_version.split('_')[0]}_binary-B_paper-names.svg", "wb") as f:
    f.write(etree.tostring(svg, pretty_print=True))

In [None]:
# same for C

x_axis = "Monomer"
y_axis = "Initiator"

x_index = np.sort(df[f"{x_axis[0]}_long"].unique())
y_index = np.sort(df[f"{y_axis[0]}_long"].unique())

# some settings for the plot
rect_x = 20
rect_y = 20
text_len = 35

x_total = len(x_index) * rect_x + text_len
y_total = len(y_index) * rect_y + text_len

# set root
svg = E.svg(viewBox=f"0 0 {x_total} {y_total}")

# set internal CSS styles
style = E.style("""
.success {
  fill: #adad7d;
  stroke: #aaaaaa;
}
.fail {
  fill: #e42536;
  stroke: #de4e5b;
}

.axes-labels {
  fill: black;
  font-family: Helvetica, sans-serif;
  font-size: 16px;
  font-weight: normal;
}
""")
svg.append(style)



# set axes labels
axes_labels = E.g()
axes_labels.set("class", "axes-labels")
x_labels = E.text(transform=f"translate({text_len + ((rect_x - 12) / 2)} {2 * y_total}) rotate(-90)")
y_labels = E.text(x="0", y=str(rect_y), transform=f"translate(0 {(rect_y - 12) / 2})")  # translate to have the label in the middle of the box, not end

# x-axis labels
for i_x, x_label in enumerate(x_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=x_label), y=str((i_x + 0.5) * rect_x), x=str(y_total))
    x_labels.append(tspan)

# y-axis labels
for i_y, y_label in enumerate(y_index):
    # n.b. the labels use the "paper names" from the 50k publication
    tspan = E.tspan(con.get_paper_name(long=y_label), x="0", y=str((i_y + 0.5) * rect_y))
    y_labels.append(tspan)

# Append the tspan elements
axes_labels.append(x_labels)
axes_labels.append(y_labels)
svg.append(axes_labels)

# set heatmap squares
squares = E.g()

for i_x, x in enumerate(x_index):
    for i_y, y in enumerate(y_index):
        data = df.loc[(df[f"{x_axis[0]}_long"] == x) & (df[f"{y_axis[0]}_long"] == y)]
        if len(data) > 0:
            offset_x = rect_x * i_x + text_len
            offset_y = rect_y * i_y
            square = E.rect(x=str(offset_x), y=str(offset_y), width=str(rect_x), height=str(rect_y), id=f"{x}_{y}")
            square.set("class", get_class(data["binary_C"]))
            squares.append(square)
svg.append(squares)

# render to string
s = etree.tostring(svg, pretty_print=True)


with open(f"/Users/julian/Desktop/sf_heatmap_{data_version.split('_')[0]}_binary-C_paper-names.svg", "wb") as f:
    f.write(etree.tostring(svg, pretty_print=True))

In [None]:
# Mean of every "binary_*" column. Assuming these columns hold 0/1 outcomes
# (TODO confirm), the mean is the success rate per product.
binary_columns = df.columns[df.columns.str.startswith("binary_")]
df[binary_columns].mean()

In [None]:
# Pie chart of the binary_A outcome counts.
# Sorting the index in descending order fixes the wedge order independently
# of which outcome is more frequent, so the colours always map to the same
# outcome (assuming 0/1 values: first colour for 1, second for 0).
ax = df["binary_A"].value_counts().sort_index(ascending=False).plot.pie(figsize=(1,1), colors=["#5790fc", "#e42536"], startangle=90, counterclock=False)
# save the figure that owns these axes, in the current user's Desktop folder
ax.get_figure().savefig(pathlib.Path.home() / "Desktop" / "binary-A_freq.svg")

In [None]:
# Pie chart of the binary_B outcome counts.
# Sorting the index in descending order fixes the wedge order independently
# of which outcome is more frequent, so the colours always map to the same
# outcome (assuming 0/1 values: first colour for 1, second for 0).
ax = df["binary_B"].value_counts().sort_index(ascending=False).plot.pie(figsize=(1,1), colors=["#f89c20", "#e42536"], startangle=90, counterclock=False)
# save the figure that owns these axes, in the current user's Desktop folder
ax.get_figure().savefig(pathlib.Path.home() / "Desktop" / "binary-B_freq.svg")

In [None]:
# Pie chart of the binary_C outcome counts.
# The descending index sort fixes the wedge order, so the colours always map
# to the same outcome (assuming 0/1 values: first colour for 1, second for 0).
ax = df["binary_C"].value_counts().sort_index(ascending=False).plot.pie(figsize=(1,1), colors=["#adad7d", "#e42536"], startangle=90, counterclock=False)
# save the figure that owns these axes, in the current user's Desktop folder
ax.get_figure().savefig(pathlib.Path.home() / "Desktop" / "binary-C_freq.svg")

In [None]:
# Fraction of all records (len(df)) for each major-product label.
major_label_counts = df["major_A-C"].value_counts()
major_label_counts.div(len(df))

In [None]:
# Pie chart of the major-product fractions. sort_index orders the wedges by
# label, and the colours are assigned to the wedges in that order.
# NOTE(review): the colour list assumes the four labels A, B, C and
# no_product are all present -- confirm.
ax = (df["major_A-C"].value_counts().sort_index() / len(df)).plot.pie(figsize=(1,1), colors=["#5790fc", "#f89c20", "#adad7d", "#e42536"], startangle=90, counterclock=False)
# save the figure that owns these axes, in the current user's Desktop folder
ax.get_figure().savefig(pathlib.Path.home() / "Desktop" / "major_A-C_freq.svg")