In [1]:
import os
from sklearn import preprocessing
import pandas as pd
import numpy as np
import torch
from botorch.utils.multi_objective import pareto
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


### 1. Setting up the MRL scaler  
The Optimus5Prime model provided in this directory was trained on normalized MRL values and therefore predicts normalized values. The scaler fitted here transforms the normalized values back to the actual MRL values.

In [2]:
# Fit a StandardScaler on the "rl" column of the processed TSV; `scaler` is used
# later to map normalized MRL values back to the original scale.
label_data = pd.read_csv(
    "./data/GSM3130435_egfp_unmod_1_processed.tsv",
    sep="\t",
)["rl"]
label_matrix = np.array(label_data).reshape(-1, 1)  # sklearn expects a 2-D (n_samples, 1) array
scaler = preprocessing.StandardScaler().fit(label_matrix)

In [None]:
def convert_obj_to_real(df_tmp, column_name, obj_name):
    """Convert one stored objective column into the value that is reported/plotted.

    Parameters
    ----------
    df_tmp : pandas.DataFrame
        Frame holding the stored objective values. For ``"AGC content [%]"`` it
        must also contain a ``"cand_seq"`` column; the length of its first entry
        is used as the sequence length for every row.
    column_name : str
        Name of the column in ``df_tmp`` that holds the stored objective.
    obj_name : str
        One of ``"AGC content [%]"``, ``"MRL"``, ``"G4 score"`` or
        ``"in vitro stability"``.

    Returns
    -------
    numpy.ndarray
        1-D array with one converted value per row.

    Raises
    ------
    ValueError
        If ``obj_name`` is not one of the supported objective names.
    """
    obj_val = np.array(df_tmp[column_name])

    if obj_name == "AGC content [%]":
        # Stored value is a per-sequence count; report its complement as a percentage.
        seq_length = len(df_tmp["cand_seq"].iloc[0])
        return (seq_length - obj_val) / seq_length * 100

    if obj_name == "MRL":
        # Stored value is negated before being un-scaled with the module-level `scaler`.
        return scaler.inverse_transform(-obj_val.reshape(-1, 1)).flatten()

    if obj_name in ("G4 score", "in vitro stability"):
        # These objectives are reported with the opposite sign of the stored value.
        return -obj_val

    # Previously this fell through to `return obj_val` with the name unbound.
    raise ValueError(f"Unknown objective name: {obj_name!r}")

def format_to_rna(tmp_array):
    """Replace every "T" with "U" in each sequence of ``tmp_array``.

    The container is modified in place and the same object is returned.
    """
    for position, dna_seq in enumerate(tmp_array):
        rna_seq = dna_seq.replace("T", "U")
        tmp_array[position] = rna_seq
    return tmp_array
    
def format_init(df_init):
    """Return a copy of ``df_init`` with the column names used by the rest of the notebook.

    The raw columns are renamed to ``cand_seq`` / ``obj_val_0`` .. ``obj_val_3``,
    and three bookkeeping columns are appended: ``round_idx`` (all 0),
    ``cand_uuid`` and ``cand_ancestor`` (both all empty strings). Columns are
    not reordered, and the input frame is left unchanged.
    """
    column_map = {
        "sequence": "cand_seq",
        "nonU_cotent": "obj_val_0",
        "mrl": "obj_val_1",
        "G4": "obj_val_2",
        "degradation": "obj_val_3",
    }
    n_rows = len(df_init)
    return df_init.rename(columns=column_map).assign(
        round_idx=[0] * n_rows,
        cand_uuid=[""] * n_rows,
        cand_ancestor=[""] * n_rows,
    )

# assume all objectives are to be maximized
def pareto_ranking(df_all, df_pareto, obj_names):
    """Score each row of ``df_pareto`` by how many rows of ``df_all`` it covers.

    All objectives are assumed to be maximized. For every row of ``df_pareto``
    the score is ``1 +`` the number of rows in ``df_all`` whose value is
    ``<=`` that row's value in every column listed in ``obj_names``.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Reference population; must contain every column in ``obj_names``.
    df_pareto : pandas.DataFrame
        Rows to score; must contain every column in ``obj_names``. It is not
        modified.
    obj_names : sequence of str
        Objective column names to compare on.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df_pareto`` plus a
        ``"score_pareto_ranking"`` column, sorted by that score in descending
        order.
    """
    obj_names = list(obj_names)
    # Extract the objective matrices once instead of copying df_all per candidate.
    all_vals = df_all[obj_names].to_numpy()
    pareto_vals = df_pareto[obj_names].to_numpy()

    rank_scores = []
    for candidate in pareto_vals:
        covered = (all_vals <= candidate).all(axis=1)
        rank_scores.append(int(covered.sum()) + 1)

    # assign() returns a new frame, so the caller's df_pareto is left untouched
    # (the original wrote the column into the argument itself).
    ranked = df_pareto.assign(score_pareto_ranking=rank_scores)
    return ranked.sort_values(by="score_pareto_ranking", ascending=False)

### 2. Retrieving generated sequences  
Retrieving initial sequences and generated sequences.  

In [36]:
# Keep the 20 rows with the smallest stored "mrl" value and write them out as the
# GA baseline file. (convert_obj_to_real negates this column before un-scaling,
# so the smallest stored values map to the largest real MRL.)
df = (
    pd.read_csv("../cache_dir/baseline_ga2.csv")
    .sort_values(by="mrl", ascending=True)
    .head(20)
)
df.to_csv("../cache_dir/baseline_ga.csv", index=False)
df

Unnamed: 0,sequence,nonU_cotent,mrl,G4,degradation
32,TTGGCATCCGGATTATACACGTGATAAGAGTGTCGAGCATAAAGGG...,11.0,-1.371964,0.2,0.432499
274,TCGGATTCCAGAGAATCAATCCTTTATACAATCACACCATAAAGAG...,12.0,-1.179885,0.3,0.43059
508,TTGGACAGAAAGTGTAGAGAACTTTGGAGATCAATCCGATTGAAGT...,12.0,-1.131379,0.2,0.430435
385,GCAATACAAGAGTATCGAATCGTATCGTTAACGAGTCAGTTATAGG...,12.0,-1.114886,0.6,0.433652
286,GAATCAGAGGATACAATATCTATACTCACTGCGGAGTTTCATTAGG...,13.0,-1.102774,0.2,0.432019
371,AAATAACTGTCGAGTAATCTTAGAAGTCTTAAACGCGCAATACAAG...,11.0,-1.096702,0.5,0.432358
499,CCCCTGTTTCCGAATTAGTAGTTACTTGCAAAAACTGCGTAACTAA...,14.0,-1.093571,0.4,0.433339
353,AGGAATTCGAGAGAACAAGAACGCTTGTATCACAAGTCTCTTATCT...,12.0,-1.086166,0.4,0.428559
230,CCGGAGAAAAAAAGAAATATACTTACCTATCCGCAGCGGAATCCAT...,10.0,-1.085339,0.5,0.428122
467,CCCGATTCTTAGCTTCCGCGATCACTAGAGTCAAGTGTCATAACTT...,16.0,-1.065736,0.6,0.43023


In [38]:
# CSV file (under ../cache_dir) for each sequence set.
dict_file = {
    "init":"Sample2019_unmod1_initset3_numsample512_ConstructSample2019EGFP.csv",
    "MRL+GA":"baseline_ga.csv",
    "MRLdesign2024":"baseline_CastilloHair2024.csv",
}
# One converted DataFrame per sequence set, filled in below.
dict_df = {set_name: None for set_name in dict_file}

obj_columns = ["obj_val_0","obj_val_1","obj_val_2","obj_val_3"]
obj_names = ["AGC content [%]", "MRL", "G4 score", "in vitro stability"]

for name, filename in dict_file.items():
    df_tmp = format_init(pd.read_csv(os.path.join("../cache_dir", filename)))

    # Convert every stored objective column, then stack them as (n_rows, n_objectives).
    converted_columns = []
    for obj_column, obj_name in zip(obj_columns, obj_names):
        tmp_obj = convert_obj_to_real(df_tmp, obj_column, obj_name)
        converted_columns.append(tmp_obj)
    new_points = np.stack(converted_columns, axis=-1)

    # Objective columns first, then the RNA-formatted sequence.
    df_save = pd.DataFrame([], columns=obj_names)
    df_save["sequence"] = format_to_rna(np.array(df_tmp["cand_seq"]))
    df_save[obj_names] = new_points

    dict_df[name] = df_save

dict_df["MRLdesign2024"]

Unnamed: 0,AGC content [%],MRL,G4 score,in vitro stability,sequence
0,86.0,7.387938,-0.5,-0.426446,CAAGAGUGCAAGACACGCUCAGAAGUUAACAAAGACUUAGCGACAC...
1,84.0,7.193526,-0.7,-0.427893,CGAGCCGGAAACGGUACUCUAAUUGGCAUACACACCUAGCGCAAAC...
2,74.0,7.152072,-0.7,-0.427261,ACGCAAACUUUGUCGUGCCUUAGGUAGAGGGUGACUACUUGCGGAU...
3,78.0,7.402587,-0.3,-0.433295,CCCACGGUGAUAGUGCGAGUAGACAGAGGGACUUUUAGGCACAUUG...
4,68.0,8.539067,-0.2,-0.432248,CCGGAUAUCCGGAUUUAAUAGAUAGAAGAUAUAAUAAGAUAAUUAU...
5,66.0,8.389125,-0.4,-0.431302,CCGUAAUCCGUUUUAUAGUGAGAGAGUAGUAAUUUAAGUAGAGAAU...
6,72.0,8.215665,-0.3,-0.431963,ACAGAGAGUUUUCUAUAACGUAAAUCCGUAGCUAAGUAAGUAGAAG...
7,72.0,8.193129,-0.7,-0.432721,CCCGUAACUUAAUAAUAUCCGAGAUUAGUAGCUAAUUUAGCGAGUA...
8,76.0,8.145949,-0.3,-0.430623,CCCAGAGAAUAUAAAUCAUAGUAUAGUAAUCCAAGAGAGUAGAAGA...
9,78.0,8.253321,-0.3,-0.432463,GAGAGAGAGUUUAUACCACAGAGUAACUUAACCCAAUAAUAACUAG...


In [40]:
# Parallel-coordinates figure per baseline: the initial set is drawn in gray and
# the baseline's sequences are drawn on top in color. Each objective gets its
# own vertical axis; values are scaled to [0, 1] using y_mins / y_maxs.
dict_colors = {"MRLdesign2024":np.array([129,94,161])/255,
               "MRL+GA":np.array([129,94,161])/255,
               }
top_N = 20
SAVEFIG=True

obj_names = ["AGC content [%]", "MRL", "G4 score", "in vitro stability"]
obj_labels = ["%AGC", "MRL", "G4", "Stability"]

# Horizontal layout of the objective axes.
x_offset = 0.05
x_vis = np.linspace(x_offset, 1-x_offset, len(obj_labels))
x_distance = (1 - 2*x_offset)/(len(obj_names)-1)

# Displayed value range of each objective axis, plus a small padding so that
# lines at the extremes are not clipped.
y_mins = np.array([60, 0.0, -1.0, -0.44])
y_maxs = np.array([100, 9.0, 0, -0.41])
y_ranges = y_maxs - y_mins
y_pad = 0.02
y_mins_padded = y_mins - y_ranges * y_pad
y_maxs_padded = y_maxs + y_ranges * y_pad

# Per-axis (tick step, epsilon added to the stop so np.arange includes the max).
tick_specs = [(10, 1), (1.0, 0.1), (0.1, 0.01), (0.01, 0.001)]
# Per-axis (bottom label, top label); intermediate ticks are left unlabeled.
# Raw strings keep the mathtext "\minus" from being read as a string escape.
tick_end_labels = [
    (str(y_mins[0]), str(y_maxs[0])),
    (str(y_mins[1]), str(y_maxs[1])),
    (r"$\minus$1.0", "0.0"),
    (r"$\minus$0.44", r"$\minus$0.41"),
]


def _plot_profiles(ax, df_profiles, x_positions, y_lower, y_span, **line_kwargs):
    """Draw one polyline per row of `df_profiles`, scaling each column by (value - y_lower) / y_span."""
    scaled = (df_profiles.to_numpy(dtype=float) - y_lower) / y_span
    for y_row in scaled:
        ax.plot(x_positions, y_row, linestyle="-", **line_kwargs)


for baseline in ["MRLdesign2024", "MRL+GA"]:
    tmp_color = dict_colors[baseline]
    fig = plt.figure(figsize=(16,8),
                            facecolor=[0,0,0,0])
    ax = fig.subplots(nrows=1, ncols=1)

    plt.rcParams["axes.grid"] = False
    ax.set_facecolor([1,1,1])
    for side in ("right", "left", "top", "bottom"):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_visible(False)

    ax.set_xlim([0, 1])
    ax.set_xticks(x_vis)
    ax.set_xticklabels(obj_labels, fontsize=18)
    ax.tick_params(axis="x", which="major", pad=10)

    # One twin y-axis per objective, with its right spine moved to that objective's x position.
    for i, ((step, stop_eps), (low_label, high_label)) in enumerate(zip(tick_specs, tick_end_labels)):
        new_twin = ax.twinx()
        new_twin.spines.right.set_position(("axes", x_offset + x_distance*i))
        new_twin.set_ylim([y_mins_padded[i], y_maxs_padded[i]])
        new_twin.spines["right"].set_color("black")
        new_twin.tick_params(direction="inout", length=10, labelsize=14)
        new_twin.tick_params(axis="y", pad=5)
        new_twin.spines["left"].set_visible(False)
        y_ticks = np.arange(y_mins[i], y_maxs[i]+stop_eps, step)
        new_twin.set_yticks(y_ticks, labels=[low_label] + [""]*(len(y_ticks)-2) + [high_label])

    # Background: every unique initial sequence, faint gray.
    df_vis = dict_df["init"].drop_duplicates("sequence")
    _plot_profiles(ax, df_vis[obj_names], x_vis, y_mins, y_ranges, color="gray", alpha=0.1)

    # Same padding as the twin axes so the scaled lines line up with their ticks.
    ax.set_ylim([-y_pad, 1+y_pad])

    # Foreground: the baseline's unique sequences (top_N by MRL for the GA baseline).
    df_vis = dict_df[baseline].drop_duplicates("sequence")
    print(len(df_vis))
    if baseline == "MRL+GA":
        df_vis = df_vis.sort_values(by="MRL", ascending=False).iloc[:top_N]

    print(f"\tNumber of top-ranked sequences: {len(df_vis)}")
    _plot_profiles(ax, df_vis[obj_names], x_vis, y_mins, y_ranges,
                   color=tmp_color, alpha=0.6, linewidth=2)

    if baseline == "MRL+GA":
        ax.set_title(f"MRL+GA Top{top_N}", fontsize=20, pad=10)
    elif baseline == "MRLdesign2024":
        ax.set_title("MRL Design from Castillo-Hair, Nat. Commun. 2024", fontsize=20, pad=10)
    fig.tight_layout()
    if SAVEFIG:
        # savefig does not create missing directories.
        os.makedirs("./figs", exist_ok=True)
        fig.savefig(f"./figs/baseline_ppd_{baseline}.png", dpi=300)
    plt.close(fig)

19
	Number of top-ranked sequences: 19
20
	Number of top-ranked sequences: 20
