In [14]:
import os, re
from sklearn import preprocessing
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import pandas as pd
import numpy as np
import torch
from botorch.utils.multi_objective import pareto
import numpy as np
import matplotlib.pyplot as plt
import shutil

### 1. Setting up the MRL scaler  
The Optimus5Prime provided in the directory is trained using a normalized MRL values, and therefore predict normalized values. The scaler loaded here will transform the noramlized values back to the actual MRL values.

In [2]:
label_data = pd.read_csv(os.path.join("./data/GSM3130435_egfp_unmod_1_processed.tsv"), sep="\t")["rl"]
scaler = preprocessing.StandardScaler().fit(np.array(label_data).reshape(-1,1))

In [3]:
def convert_obj_to_real(df_tmp, column_name, obj_name):
    if obj_name == "AGC content [%]":
        obj_val = np.array(df_tmp[column_name])
        seq_length = len(df_tmp["cand_seq"].iloc[0])
        obj_val = (seq_length - obj_val) / seq_length * 100
    elif obj_name == "MRL":
        obj_val = np.array(df_tmp[column_name])
        obj_val = scaler.inverse_transform(-obj_val.reshape(-1,1)).flatten()
    elif obj_name == "G4 score":
        obj_val = - np.array(df_tmp[column_name])
    elif obj_name == "in vitro stability":
        obj_val = - np.array(df_tmp[column_name])
    return obj_val

def format_to_rna(tmp_array):
    for i, seq in enumerate(tmp_array):
        tmp_array[i] = seq.replace("T", "U")
    return tmp_array
    
def format_init(df_init):
    df_init = df_init.rename(
        columns={
            "sequence":"cand_seq",
            "nonU_cotent":"obj_val_0",
            "mrl":"obj_val_1",
            "G4":"obj_val_2",
            "degradation":"obj_val_3"
        }
    )
    df_init["round_idx"] = [0,]*len(df_init)
    df_init["cand_uuid"] = ["",]*len(df_init)
    df_init["cand_ancestor"] = ["",]*len(df_init)
    # df_init = df_init.reindex(columns=["round_idx","cand_uuid","cand_ancestor","cand_seq","obj_val_0","obj_val_1","obj_val_2","obj_val_3"])
    return df_init

# assume all objectives are to be maximized
def pareto_ranking(df_all, df_pareto, obj_names):
    rank_scores = []
    for i in range(len(df_pareto)):
        df_tmp = df_all.copy()
        for tmp_obj in obj_names:
            df_tmp = df_tmp[df_tmp[tmp_obj]<=df_pareto[tmp_obj].iloc[i]]
        rank_scores.append(len(df_tmp)+1)
    df_pareto["score_pareto_ranking"] = rank_scores
    df_pareto = df_pareto.sort_values(by="score_pareto_ranking", ascending=False)
    return df_pareto


In [49]:
def collect_pareto_fronts(model_flg, data_dir, obj_columns, obj_names, save_dir, save_file):
    df_all = pd.DataFrame([], columns=obj_names)
    df_pareto = pd.DataFrame()
    if model_flg == "init":
        for init_num in [2,3,4]:
            initset = f"initset{init_num}"
            init_file = f"../cache_dir/Sample2019_unmod1_{initset}_numsample512_ConstructSample2019EGFP.csv"
            df_init = format_init(pd.read_csv(init_file))

            df_save = pd.DataFrame([], columns=obj_names)
            new_points = None
            for obj_column, obj_name in zip(obj_columns, obj_names):
                tmp_obj = convert_obj_to_real(df_init, obj_column, obj_name)
                if new_points is None:
                    new_points = np.expand_dims(tmp_obj, -1)
                else:
                    new_points = np.concatenate([new_points, np.expand_dims(tmp_obj, -1)], -1)
            df_save["sequence"] = format_to_rna(np.array(df_init["cand_seq"]))
            df_save["initset"] = initset
            df_save[obj_names] = new_points
            if len(df_all) == 0:
                df_all = df_save.copy()
            else:
                df_all = pd.concat([df_all, df_save], axis=0)
            
            all_seqs = df_save[["sequence", "initset"]].values
            pareto_mask = pareto.is_non_dominated(torch.tensor(new_points))
            pareto_idx = np.argsort(new_points[pareto_mask][:,0])
            pareto_front = np.array(all_seqs[pareto_mask][pareto_idx]) # np.expand_dims(, -1)
            pareto_front = np.concatenate([pareto_front, new_points[pareto_mask][pareto_idx]], axis=-1)
            df_pareto_save = pd.DataFrame(pareto_front, columns=["sequence", "initset"]+obj_names)
            if len(df_pareto) == 0:
                df_pareto = df_pareto_save.copy()
            else:
                df_pareto = pd.concat([df_pareto, df_pareto_save], axis=0)
    else:
        for tmp_file in os.listdir(data_dir):
            if model_flg in tmp_file:
                initset = re.search("initset[0-9]+", tmp_file).group()
                init_file = f"../cache_dir/Sample2019_unmod1_{initset}_numsample512_ConstructSample2019EGFP.csv"
                df_init = format_init(pd.read_csv(init_file))

                df_tmp = pd.read_csv(os.path.join(data_dir, tmp_file))
                df_tmp = pd.concat([df_init, df_tmp], axis=0)
                df_tmp = df_tmp.drop_duplicates(subset=["cand_seq"], keep="first")
                df_tmp = df_tmp.reset_index(drop=True)

                df_save = pd.DataFrame([], columns=obj_names)
                new_points = None
                for obj_column, obj_name in zip(obj_columns, obj_names):
                    tmp_obj = convert_obj_to_real(df_tmp, obj_column, obj_name)
                    if new_points is None:
                        new_points = np.expand_dims(tmp_obj, -1)
                    else:
                        new_points = np.concatenate([new_points, np.expand_dims(tmp_obj, -1)], -1)
                df_save["sequence"] = format_to_rna(np.array(df_tmp["cand_seq"]))
                df_save["initset"] = initset
                df_save[obj_names] = new_points
                if len(df_all) == 0:
                    df_all = df_save.copy()
                else:
                    df_all = pd.concat([df_all, df_save], axis=0)
                
                all_seqs = df_save[["sequence", "initset"]].values
                pareto_mask = pareto.is_non_dominated(torch.tensor(new_points))
                pareto_idx = np.argsort(new_points[pareto_mask][:,0])
                pareto_front = np.array(all_seqs[pareto_mask][pareto_idx]) # np.expand_dims(, -1)
                pareto_front = np.concatenate([pareto_front, new_points[pareto_mask][pareto_idx]], axis=-1)
                df_pareto_save = pd.DataFrame(pareto_front, columns=["sequence", "initset"]+obj_names)
                if len(df_pareto) == 0:
                    df_pareto = df_pareto_save.copy()
                else:
                    df_pareto = pd.concat([df_pareto, df_pareto_save], axis=0)

    df_all = df_all.drop_duplicates(subset=["sequence"], keep="first")
    df_all = df_all.reset_index(drop=True)
    print(f"\tNumber of generated sequences: {len(df_all)}")
    df_pareto = pareto_ranking(df_all, df_pareto, obj_names)
    print(f"\tNumber of non-dominated sequences: {len(df_pareto)}")
    save_path = os.path.join(save_dir, save_file)
    print(f"\tSaving file to {save_path}")
    df_pareto.to_csv(save_path, index=False)

def cluster_seqs(save_dir, save_file, usearch_path):
    csv_path = os.path.join(save_dir, save_file)
    file_prefix = save_file.split(".")[0]
    fasta_path = os.path.join(save_dir, file_prefix + ".fasta")

    df = pd.read_csv(csv_path)
    tmp_out_file = os.path.join(fasta_path)
    fd = open(tmp_out_file, "w")
    seq_list = []
    for i in range(len(df)):
        # desc = "seq{}_from_{}".format(i+1, df["run"].iloc[i])
        seq_list.append(SeqRecord(Seq(df["sequence"].iloc[i]), id=str(i)))
    SeqIO.write(seq_list, fd, "fasta")
    fd.close()

    min_sequence_identity = 60
    cluster_path = os.path.join(save_dir, "cluster{}".format(min_sequence_identity))
    options = {
        "cluster_fast": fasta_path,
        "id": min_sequence_identity / 100,
        "clusters": cluster_path + "/c_",
    }

    if "clusters" in options.keys():
        if os.path.exists(cluster_path):
            shutil.rmtree(cluster_path)
        os.makedirs(cluster_path)

    cmd = usearch_path
    for k, v in options.items():
        cmd = cmd + " -{} {}".format(k, v)
    print(f"\tExecuting UCLUST:\n\t{cmd}\n")
    os.system(cmd)
    return cluster_path

def assign_clusters(save_dir, save_file, cluster_path):
    print("\n\tassigning a cluster to each seqeunce")
    csv_path = os.path.join(save_dir, save_file)
    df = pd.read_csv(csv_path)
    cluster_dirs = os.listdir(cluster_path)
    cluster_list = [""] * len(df)
    for cluster in cluster_dirs:
        fasta_sequences = SeqIO.parse(open(os.path.join(cluster_path, cluster)),"fasta")
        for fasta in fasta_sequences:
            sequence = str(fasta.seq)
            idx = df.query("sequence == "{}"".format(sequence)).index[0]
            cluster_list[idx] = cluster
    df["cluster"] = cluster_list
    print(f"\tsaving file to {csv_path}")
    df.to_csv(csv_path, index=False)


### 2. Ranking generated sequences
Retrieving generated sequences during LaMBO trainings.  
For each corresponding run, the table of generated sequences should be uploaded to your wandb workspace as "task_mugd_cnn/candidates" or you can find them under your local wandb directory such as `./wandb/RUNTITLE/files/media/table/task_mugd_cnn/candidates_ID.table.json`. 

In [51]:
models = ["init", "CNN", "BERT", "DNABERT", ]
data_dir = "./generated_seqs"
# f"../cache_dir/Sample2019_unmod1_{initset_flg}_numsample512_ConstructSample2019EGFP.csv"
obj_columns = ["obj_val_0","obj_val_1","obj_val_2","obj_val_3"]
obj_names = ["AGC content [%]", "MRL", "G4 score", "in vitro stability"]
usearch_path = "/home/keisuke-yamada/tools/usearch11.0.667_i86linux32"

if not os.path.exists("./tmp"):
    os.makedirs("./tmp")

for model_flg in models:
    print(f"Processing {model_flg}")
    collect_pareto_fronts(model_flg, data_dir, obj_columns, obj_names, save_dir="./tmp", save_file="tmp.csv")
    cluster_path = cluster_seqs(save_dir="./tmp", save_file="tmp.csv", usearch_path=usearch_path)
    assign_clusters(save_dir="./tmp", save_file="tmp.csv", cluster_path=cluster_path)
    df = pd.read_csv("./tmp/tmp.csv")
    df = df.sort_values(by="score_pareto_ranking", ascending=False)
    df = df.drop_duplicates(subset=["cluster"], keep="first")
    print(f"Saving selected seqs to {model_flg}_selected.csv")
    df.to_csv(f"{model_flg}_rank.csv", index=False)

Processing init
	Number of generated sequences: 1530
	Number of non-dominated sequences: 112
	Saving file to ./tmp/tmp.csv
	Executing UCLUST:
	/home/keisuke-yamada/tools/usearch11.0.667_i86linux32 -cluster_fast ./tmp/tmp.fasta -id 0.6 -clusters ./tmp/cluster60/c_

usearch v11.0.667_i86linux32, 4.0Gb RAM (330Gb total), 16 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only


	assigning a cluster to each seqeunce


00:00 41Mb    100.0% Reading ./tmp/tmp.fasta
00:00 7.1Mb  CPU has 16 cores, defaulting to 10 threads
00:00 315Mb   100.0% DF
00:00 319Mb  112 seqs, 112 uniques, 112 singletons (100.0%)
00:00 319Mb  Min size 1, median 1, max 1, avg 1.00
00:00 322Mb   100.0% DB
00:00 328Mb   100.0% 112 clusters, max size 1, avg 1.0
00:00 328Mb   100.0% Writing clusters                 
                                     
      Seqs  112
  Clusters  112
  Max size  1
  Avg size  1.0
  Min size  1
Singletons  112, 100.0% of seqs, 100.0% of clusters
   Max mem  328Mb
      Time  1.00s
Throughput  112.0 seqs/sec.



	saving file to ./tmp/tmp.csv
Saving selected seqs to init_selected.csv
Processing DNABERT
	Number of generated sequences: 7533
	Number of non-dominated sequences: 394
	Saving file to ./tmp/tmp.csv
	Executing UCLUST:
	/home/keisuke-yamada/tools/usearch11.0.667_i86linux32 -cluster_fast ./tmp/tmp.fasta -id 0.6 -clusters ./tmp/cluster60/c_

usearch v11.0.667_i86linux32, 4.0Gb RAM (330Gb total), 16 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only


	assigning a cluster to each seqeunce


00:00 41Mb    100.0% Reading ./tmp/tmp.fasta
00:00 7.1Mb  CPU has 16 cores, defaulting to 10 threads
00:00 316Mb   100.0% DF
00:00 319Mb  394 seqs, 394 uniques, 394 singletons (100.0%)
00:00 319Mb  Min size 1, median 1, max 1, avg 1.00
00:00 322Mb   100.0% DB
00:00 328Mb   100.0% 47 clusters, max size 45, avg 8.4
00:00 328Mb   100.0% Writing clusters                 
                                     
      Seqs  394
  Clusters  47
  Max size  45
  Avg size  8.4
  Min size  1
Singletons  6, 1.5% of seqs, 12.8% of clusters
   Max mem  328Mb
      Time  1.00s
Throughput  394.0 seqs/sec.



	saving file to ./tmp/tmp.csv
Saving selected seqs to DNABERT_selected.csv
Processing BERT
	Number of generated sequences: 13664
	Number of non-dominated sequences: 873
	Saving file to ./tmp/tmp.csv
	Executing UCLUST:
	/home/keisuke-yamada/tools/usearch11.0.667_i86linux32 -cluster_fast ./tmp/tmp.fasta -id 0.6 -clusters ./tmp/cluster60/c_

usearch v11.0.667_i86linux32, 4.0Gb RAM (330Gb total), 16 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only


	assigning a cluster to each seqeunce


00:00 41Mb    100.0% Reading ./tmp/tmp.fasta
00:00 7.1Mb  CPU has 16 cores, defaulting to 10 threads
00:00 320Mb   100.0% DF
00:01 319Mb  873 seqs, 873 uniques, 873 singletons (100.0%)
00:01 319Mb  Min size 1, median 1, max 1, avg 1.00
00:01 322Mb   100.0% DB
00:01 328Mb   100.0% 103 clusters, max size 70, avg 8.5
00:01 328Mb   100.0% Writing clusters                  
                                     
      Seqs  873
  Clusters  103
  Max size  70
  Avg size  8.5
  Min size  1
Singletons  10, 1.1% of seqs, 9.7% of clusters
   Max mem  328Mb
      Time  1.00s
Throughput  873.0 seqs/sec.



	saving file to ./tmp/tmp.csv
Saving selected seqs to BERT_selected.csv
Processing CNN
	Number of generated sequences: 7528
	Number of non-dominated sequences: 524
	Saving file to ./tmp/tmp.csv
	Executing UCLUST:
	/home/keisuke-yamada/tools/usearch11.0.667_i86linux32 -cluster_fast ./tmp/tmp.fasta -id 0.6 -clusters ./tmp/cluster60/c_

usearch v11.0.667_i86linux32, 4.0Gb RAM (330Gb total), 16 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only


	assigning a cluster to each seqeunce


00:00 41Mb    100.0% Reading ./tmp/tmp.fasta
00:00 7.1Mb  CPU has 16 cores, defaulting to 10 threads
00:00 316Mb   100.0% DF
00:00 319Mb  524 seqs, 524 uniques, 524 singletons (100.0%)
00:00 319Mb  Min size 1, median 1, max 1, avg 1.00
00:00 322Mb   100.0% DB
00:00 328Mb   100.0% 36 clusters, max size 73, avg 14.6
00:00 328Mb   100.0% Writing clusters                  
                                     
      Seqs  524
  Clusters  36
  Max size  73
  Avg size  14.6
  Min size  1
Singletons  3, 0.6% of seqs, 8.3% of clusters
   Max mem  328Mb
      Time  1.00s
Throughput  524.0 seqs/sec.



	saving file to ./tmp/tmp.csv
Saving selected seqs to CNN_selected.csv


### 3. Selecting candidates for wet-lab experiments

In [52]:
models = ["init", "CNN", "BERT", "DNABERT", ]
df_cand = pd.DataFrame()
for i in models:
    df = pd.read_csv("{}_rank.csv".format(i))
    df = df.sort_values(by="score_pareto_ranking", ascending=False)
    df = df.drop_duplicates(subset=["cluster"], keep="first")
    df["model"] = i
    if i=="init":
        df_cand = pd.concat([df_cand, df.iloc[:3,:]])
    else:
        df_cand = pd.concat([df_cand, df.iloc[:5,:]])
df_cand.to_csv("candidates.csv", index=False)
df_cand

Unnamed: 0,sequence,initset,AGC content [%],MRL,G4 score,in vitro stability,score_pareto_ranking,cluster,model
0,AGUUCACCAAAGAAGGAAAUUCACAGCUGCAAAUGAGCACGGCAAA...,initset4,88.0,7.441092,-0.2,-0.425877,1244,c_43,init
1,CCCAAAGCCAAUACGAAGCAACAGUUGGAUCCUAAGUCCUCUGAAA...,initset2,84.0,7.58064,-0.3,-0.422809,1117,c_0,init
2,CCCACCAUUUCGACCAAACGAACAUCAACCAAACAAGACACUGUGU...,initset4,86.0,7.158506,-0.3,-0.424855,1021,c_1,init
0,CACUAGGAAGAGAAGAGAAAAGAAAAUACACAAAUACAAAGGAGCA...,initset3,94.0,7.895419,-0.1,-0.417397,6525,c_0,CNN
1,CAGAGAGAAAGCACACAGGAAGACACAAGGACAAGACGCAGAGGAU...,initset3,98.0,7.654273,-0.2,-0.42211,4087,c_3,CNN
2,CAGACAGCAGUAGAGAGACAGAUCCUAACCCAGAAACACAAACAUC...,initset4,88.0,7.803958,-0.2,-0.42111,3840,c_1,CNN
3,CAAAAAAAGAAAAAGAACCAACAGUUGGACCCCAAAUCCUCUGCAA...,initset2,88.0,7.715036,-0.1,-0.421037,3786,c_5,CNN
4,CACAAGGAGUAGAACAAAUUCACACAAGCGAAAGAGUACACUUCAC...,initset3,84.0,7.946205,-0.2,-0.418353,3639,c_11,CNN
0,AAGAGGAAAAAAAAAAACAACGAAGAAAAAAAAGAGAGAAAGAGAA...,initset3,98.0,7.906004,-0.2,-0.412688,10967,c_0,BERT
1,AAGAAAGAAAAAAAAAACAAAGAAGAAAACAAAGAGACAAAGAAAA...,initset3,100.0,7.819129,-0.2,-0.410769,10337,c_2,BERT
