In [2]:
import argparse
import os
from datetime import datetime
import pytz
import json
from collections import Counter
import sys

os.environ["CUDA_VISIBLE_DEVICES"] = '2'
import torch.nn.functional as F
import torch
# from transformers import T5ForConditionalGeneration

from src.utils.helpers import * 
from src.utils.tokenizer import *

ModuleNotFoundError: No module named 'tokenizers'

In [2]:
# Default `max_token` of gen_rna_batch: decoded RNAs are kept only when their
# character length lies within `tolerance` of this value.
MAX_LEN = 32

In [None]:
import argparse
def parse_opt():
    """Assemble the generation settings from their built-in defaults.

    No command-line tokens are parsed (an empty argument list is supplied), so
    every option takes its default value. A ``device`` attribute is attached
    afterwards and the CUDA devices visible to torch are printed.

    Returns:
        argparse.Namespace: the options plus ``device`` ('cuda' or 'cpu').
    """
    cli = argparse.ArgumentParser(description='Sampling Hyperparameters')

    option_specs = (
        # Training configuration
        ('--train-hyp', dict(default="/data6/sobhan/RLLM/hyps/train.yaml", type=str, help='Training hyperparameters')),
        ('--model-hyp', dict(default="/data6/sobhan/RLLM/hyps/t5.yaml", type=str, help='Model hyperparameters')),
        # Generation configuration
        ('--checkpoints', dict(default='/data6/sobhan/rllm/results/train/t5/run3_20240822-152114/checkpoints/checkpoint-349800', type=str, help='Load Model')),
        ('--eval-dir', dict(default="/data6/sobhan/RLLM/results/validation/pool", type=str, help='Output dir of the evaluation')),
        ('--proteins', dict(nargs='+', default=['hnrpnc', 'ago2', 'elavl1', 'rbm5'], type=str, help='List of protein names or IDs')),
        ('--rna_num', dict(default=128, type=int, help='Number of RNAs to generate per setting')),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    # An empty list parses exactly like the empty string: argparse sees no tokens.
    opts = cli.parse_args([])

    if torch.cuda.is_available():
        opts.device = 'cuda'
    else:
        opts.device = 'cpu'

    gpu_total = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_total}")
    for gpu_index in range(gpu_total):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")
        print(f"GPU {gpu_index} ID: cuda:{gpu_index}")
    return opts


# Notebook-wide settings object read by the cells that follow.
args = parse_opt()

NameError: name 'argparse' is not defined

In [4]:
def gen_rna_batch(model, prot_ids, dec_tok, num_candidates, tolerance=5, max_token=MAX_LEN, strategy='beam_search', temperature=1.0, num_beams=5, top_k=None, top_p=None, max_empty_passes=20):
    """Generate up to ``num_candidates`` RNA strings for one tokenized protein.

    The model is called repeatedly; every decoded sequence whose character
    length lies in ``[max_token - tolerance, max_token + tolerance]`` is kept
    until ``num_candidates`` sequences have been collected.

    Args:
        model: object exposing ``device`` and ``generate(inputs, **kwargs)``.
        prot_ids: token ids of the protein; wrapped into a batch of size one.
        dec_tok: tokenizer whose ``decode`` turns a list of ids into text.
        num_candidates: number of RNA strings wanted.
        tolerance: allowed deviation of the RNA length from ``max_token``.
        max_token: target RNA length in characters.
        strategy: 'beam_search', 'top_k', 'top_p'; anything else is plain sampling.
        temperature: sampling temperature (ignored by beam search).
        num_beams: beam count; beam search uses ``max(num_beams, num_candidates)``.
        top_k: top-k cutoff for the 'top_k' strategy (50 when None).
        top_p: nucleus mass for the 'top_p' strategy (0.92 when None).
        max_empty_passes: give up after this many consecutive generation passes
            that contribute no sequence of acceptable length.

    Returns:
        list[str]: at most ``num_candidates`` RNAs. The list can be shorter
        (even empty) when generation keeps producing sequences outside the
        accepted length window; previously that case looped forever.
    """
    inputs = torch.tensor(prot_ids, dtype=torch.long).unsqueeze(0).to(model.device)

    # The generation arguments do not change between passes, so build them once.
    gen_args = {
        # NOTE(review): hard-coded and independent of `max_token`; presumably counted
        # in tokenizer tokens rather than nucleotides - confirm.
        'max_length': 15,
        'repetition_penalty': 1.5,
        'encoder_repetition_penalty': 1.3,
        'num_return_sequences': num_candidates,
    }
    deterministic = strategy == 'beam_search'
    if deterministic:
        # For beam search, ensure that num_return_sequences <= num_beams.
        effective_beams = max(num_beams, num_candidates)
        gen_args.update({
            'do_sample': False,
            'num_beams': effective_beams,
            'num_return_sequences': effective_beams,
        })
    else:
        gen_args.update({
            'do_sample': True,
            'temperature': temperature,
            'num_beams': num_beams,
        })
        if strategy == 'top_k':
            gen_args['top_k'] = top_k if top_k is not None else 50
        elif strategy == 'top_p':
            gen_args['top_p'] = top_p if top_p is not None else 0.92

    lower_bound = max_token - tolerance
    upper_bound = max_token + tolerance

    candidate_rnas = []
    empty_passes = 0
    while len(candidate_rnas) < num_candidates:
        with torch.no_grad():
            seqs = model.generate(inputs, **gen_args)

        decoded_rnas = [
            postprocess_rna(dec_tok.decode(seq.cpu().numpy().tolist()))
            for seq in seqs
        ]
        new_candidates = [
            rna for rna in decoded_rnas
            if lower_bound <= len(rna) <= upper_bound
        ]

        if new_candidates:
            empty_passes = 0
        else:
            empty_passes += 1
            # Beam search without sampling returns the same sequences on every pass,
            # so one empty pass already proves that retrying cannot help.
            if deterministic or empty_passes >= max_empty_passes:
                print(
                    f"Warning: no RNA of length {lower_bound}-{upper_bound} produced "
                    f"(strategy={strategy}); returning {len(candidate_rnas)} of "
                    f"{num_candidates} requested candidates."
                )
                break

        candidate_rnas.extend(new_candidates)
        candidate_rnas = candidate_rnas[:num_candidates]
    return candidate_rnas

In [5]:
def pool_to_fasta(protein_name, pool, eval_dir):
    """Write a candidate pool to ``<eval_dir>/<protein_name>_pool.fasta``.

    Each entry of ``pool`` is a dict with the keys ``"rna"``, ``"strategy"`` and
    ``"hyperparams"``. The record header is
    ``>RNA_<index>_<strategy>_<key>_<value>_...`` built from the hyperparameters
    in dict order, followed by the RNA on its own line.

    Returns:
        str: path of the FASTA file that was written.
    """
    target_path = os.path.join(eval_dir, f"{protein_name}_pool.fasta")

    with open(target_path, "w") as fasta_out:
        for position, entry in enumerate(pool):
            settings = entry["hyperparams"]
            setting_tags = "_".join(f"{name}_{value}" for name, value in settings.items())
            title_line = f">RNA_{position}_{entry['strategy']}_{setting_tags}"
            fasta_out.write(title_line + "\n")
            fasta_out.write(entry["rna"] + "\n")

    print(f"Pool file saved: {target_path}")
    return target_path

In [6]:
def _default_grid_config():
    """Return the built-in decoding grid: strategy name -> list of hyperparameter dicts."""
    return {
        'beam_search': [
            {'num_beams': 1},
            {'num_beams': 5},
            {'num_beams': 25}
        ],
        'top_k': [
            {'top_k': 30, 'temperature': 0.7, 'num_beams': 1},
            {'top_k': 30, 'temperature': 1.0, 'num_beams': 1},
            {'top_k': 30, 'temperature': 1.5, 'num_beams': 1},
            {'top_k': 100, 'temperature': 0.7, 'num_beams': 1},
            {'top_k': 100, 'temperature': 1.0, 'num_beams': 1},
            {'top_k': 100, 'temperature': 1.5, 'num_beams': 1},
        ],
        'top_p': [
            {'top_p': 0.7, 'temperature': 0.7, 'num_beams': 1},
            {'top_p': 0.7, 'temperature': 1.0, 'num_beams': 1},
            {'top_p': 0.7, 'temperature': 1.5, 'num_beams': 1},
            {'top_p': 0.9, 'temperature': 0.7, 'num_beams': 1},
            {'top_p': 0.9, 'temperature': 1.0, 'num_beams': 1},
            {'top_p': 0.9, 'temperature': 1.5, 'num_beams': 1},
        ],
        'sample': [
            {'temperature': 0.7, 'num_beams': 1},
            {'temperature': 1.0, 'num_beams': 1},
            {'temperature': 1.5, 'num_beams': 1},
        ]
    }


def create_pool(args, model, source_tokenizer, rna_tokenizer, grid_config=None,
                protein_csv="/data6/sobhan/dataset/proteins/protein_seqs.csv"):
    """Generate one candidate-RNA pool per protein and save each as a FASTA file.

    For every protein in ``args.proteins`` the sequence is looked up in
    ``protein_csv``, tokenized with ``source_tokenizer`` and passed through
    ``gen_rna_batch`` once per (strategy, hyperparameters) entry of the grid,
    asking for ``args.rna_num`` RNAs each time. The combined pool is written by
    ``pool_to_fasta`` into ``args.eval_dir`` (created if missing). Proteins that
    are not found in the CSV are reported and skipped.

    Args:
        args: namespace providing ``eval_dir``, ``proteins`` and ``rna_num``.
        model: generation model handed to ``gen_rna_batch``.
        source_tokenizer: protein tokenizer; ``tokenize(seq).ids`` is used.
        rna_tokenizer: tokenizer used by ``gen_rna_batch`` to decode RNAs.
        grid_config: optional mapping strategy -> list of hyperparameter dicts;
            the built-in grid is used when None.
        protein_csv: CSV file handed to ``read_protein_from_csv``; defaults to
            the path that used to be hard-coded here.
    """
    if grid_config is None:
        grid_config = _default_grid_config()

    os.makedirs(args.eval_dir, exist_ok=True)
    for protein_name in args.proteins:
        protein_seq = read_protein_from_csv(protein_name, file_path=protein_csv)
        if protein_seq is None:
            print(f"Warning: Protein {protein_name} not found.")
            continue
        print("Tokenizing Protein:", protein_name)
        prot_ids = source_tokenizer.tokenize(protein_seq).ids

        pool = []
        for strategy, hyper_list in grid_config.items():
            for hyperparams in hyper_list:
                print(f"Generating for Protein: {protein_name}, Strategy: {strategy}, Hyperparameters: {hyperparams}")
                candidate_rnas = gen_rna_batch(
                    model,
                    prot_ids,
                    rna_tokenizer,
                    args.rna_num,
                    strategy=strategy,
                    temperature=hyperparams.get('temperature', 1.0),
                    num_beams=hyperparams.get('num_beams', 1),
                    top_k=hyperparams.get('top_k', None),
                    top_p=hyperparams.get('top_p', None)
                )
                # The id suffix depends only on the grid entry, so build it once per setting.
                param_str = '_'.join(f'{k}_{v}' for k, v in hyperparams.items())
                for idx, rna in enumerate(candidate_rnas):
                    pool.append({
                        "rna": rna,
                        "strategy": strategy,
                        "hyperparams": hyperparams,
                        "id": f"RNA_{idx}_{strategy}_{param_str}"
                    })

        pool_to_fasta(protein_name, pool, args.eval_dir)

Number of GPUs available: 1
GPU 0: NVIDIA GeForce RTX 2080 Ti
GPU 0 ID: cuda:0


In [8]:
# Restore the checkpoint named in args, move it to the selected device and
# switch it to evaluation mode.
model = T5ForConditionalGeneration.from_pretrained(args.checkpoints)
model = model.to(args.device)
model.eval()

# Both tokenizers share every setting except the file they are loaded from.
# NOTE(review): args.tokenizer, args.vocab_size, args.seq_size, args.source_tokenizer
# and args.rna_tokenizer are not defined by the parse_opt shown in this notebook -
# presumably merged from the YAML hyperparameter files in a cell that is not visible; confirm.
source_tokenizer, rna_tokenizer = [
    get_tokenizer(
        tokenizer_name=args.tokenizer,
        vocab_size=args.vocab_size,
        seq_size=args.seq_size,
        tokenizer_path=tokenizer_path,
    )
    for tokenizer_path in (args.source_tokenizer, args.rna_tokenizer)
]

enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead
enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead


In [9]:
# Generate the candidate pools: one `<protein>_pool.fasta` per entry of args.proteins,
# written into args.eval_dir.
create_pool(args, model, source_tokenizer, rna_tokenizer)

Tokenizing Protein: hnrpnc
Generating for Protein: hnrpnc, Strategy: beam_search, Hyperparameters: {'num_beams': 1}
Generating for Protein: hnrpnc, Strategy: beam_search, Hyperparameters: {'num_beams': 5}
Generating for Protein: hnrpnc, Strategy: beam_search, Hyperparameters: {'num_beams': 25}
Generating for Protein: hnrpnc, Strategy: top_k, Hyperparameters: {'top_k': 30, 'temperature': 0.7, 'num_beams': 1}
Generating for Protein: hnrpnc, Strategy: top_k, Hyperparameters: {'top_k': 30, 'temperature': 1.0, 'num_beams': 1}
Generating for Protein: hnrpnc, Strategy: top_k, Hyperparameters: {'top_k': 30, 'temperature': 1.5, 'num_beams': 1}
Generating for Protein: hnrpnc, Strategy: top_k, Hyperparameters: {'top_k': 100, 'temperature': 0.7, 'num_beams': 1}
Generating for Protein: hnrpnc, Strategy: top_k, Hyperparameters: {'top_k': 100, 'temperature': 1.0, 'num_beams': 1}
Generating for Protein: hnrpnc, Strategy: top_k, Hyperparameters: {'top_k': 100, 'temperature': 1.5, 'num_beams': 1}
Genera

KeyboardInterrupt: 

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import json
import copy 
from collections import Counter
import numpy as np

from src.utils.validations import calculate_mfe_many
from src.utils.helpers import read_protein_from_csv


project1_path = '/data6/sobhan/deepclip'
sys.path.append(project1_path)
import constants
import network
from data_gen import onehot_encode
from data_gen import onehot_binary

def encode_input_data(seqs, max_length):
    """Pad every sequence in ``seqs`` to ``max_length`` and return that same list.

    The padding is done in place by ``pad_sequences_with_N``, which hands back
    the list it was given, so the caller's list object is modified and returned.
    """
    return pad_sequences_with_N(seqs, max_length)

def pad_sequences_with_N(added_seqs, length):
    """Centre-pad each sequence with lowercase 'n' up to ``length``, in place.

    Sequences shorter than ``length`` receive the missing characters split
    between both ends, with the extra one (if any) going to the right-hand side.
    If a sequence does not end up exactly ``length`` long, its length and index
    are printed and processing stops there, leaving later entries untouched.

    Returns:
        The same list object that was passed in.
    """
    for position, sequence in enumerate(added_seqs):
        left_pad = right_pad = 0
        if len(sequence) < length:
            shortfall = int(length - len(sequence))
            left_pad = shortfall // 2
            right_pad = shortfall - left_pad

        padded = ('n' * left_pad) + sequence + ('n' * right_pad)
        added_seqs[position] = padded

        if len(padded) != length:
            # Could not reach the requested length: report and stop.
            print(str(len(padded)))
            print(str(position))
            break
    return added_seqs


# Root directory walked by calc_deep_clip when looking for a model file whose
# name contains the protein name.
clip_directory = "/data6/sobhan/RLLM_OPT/deepclip_models" 

In [2]:
def calc_deep_clip(rna_sequences, protein_name) -> list:
    """Score RNA sequences with the DeepCLIP model that matches ``protein_name``.

    The first file under ``clip_directory`` whose name contains ``protein_name``
    (case-insensitive) is loaded as the network. The sequences are padded to the
    network input size, one-hot encoded and passed to the prediction function.

    Args:
        rna_sequences: sequence strings to score. The caller's list is left
            unmodified; padding is applied to a copy.
        protein_name: substring used to select the model file.

    Returns:
        The ``"predictions"`` entry returned by
        ``network.predict_without_network``; an empty list for empty input.

    Raises:
        ValueError: if no model file matches, the network cannot be loaded, or a
            sequence is longer than the network accepts.
    """
    # Nothing to score; this also avoids max() failing on an empty sequence below.
    if len(rna_sequences) == 0:
        return []

    # NOTE(review): a plain substring match returns the first hit in os.walk order;
    # confirm no protein name is a substring of another model's file name.
    clip_file = next(
        (os.path.join(root1, file1) for root1, _, files1 in os.walk(clip_directory)
            for file1 in files1 if protein_name.lower() in file1.lower()),
        None
    )
    print("DeepCLIP file: ", clip_file)
    if clip_file is None:
        raise ValueError(
            f"Error loading network: no DeepCLIP model file matching '{protein_name}' "
            f"found under {clip_directory}"
        )
    try:
        net, freq = network.load_network(clip_file)
        options = net.options
        predict_fn, outpar = net.compile_prediction_function()
        output_shape = net.network['l_in'].output_shape
    except Exception as e:
        raise ValueError(f"Error loading network: {e}") from e

    # Integer division keeps every derived length an int. True division produced
    # floats here (same value when the filter sizes are multiples of the vocab size).
    # NOTE(review): assumes FILTER_SIZES are stored as multiples of len(VOCAB) - confirm.
    max_filter_size = max(options["FILTER_SIZES"]) // len(constants.VOCAB)
    flank = 2 * (max_filter_size - 1)
    max_network_length = int(options["SEQ_SIZE"] - flank)
    max_input_length = max(map(len, rna_sequences))

    if max_input_length > max_network_length:
        raise ValueError(f"Input sequences exceed the maximum network length ({max_network_length}).")

    # Encode and predict. Padding works in place, so hand it a copy of the list.
    seq_list = encode_input_data(list(rna_sequences), max_network_length + flank)
    X_test = onehot_encode(seq_list, freq, vocab=constants.VOCAB)
    results = network.predict_without_network(predict_fn, options, output_shape, X_test, outpar)
    predictions = results["predictions"]
    return predictions

In [3]:
def read_fasta(fasta_file):
    """Parse a FASTA file into a list of ``{"header": ..., "rna": ...}`` dicts.

    Lines are stripped and blank lines ignored. A line starting with ``>`` opens
    a new record whose header is the text after the ``>``; the following lines
    are concatenated into its sequence. Sequence lines that appear before the
    first header are discarded.
    """
    records = []
    with open(fasta_file, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        for text in filter(None, stripped_lines):
            if text.startswith(">"):
                # Open a new record; its sequence parts are collected below.
                records.append((text[1:], []))
            elif records:
                records[-1][1].append(text)

    return [
        {"header": title, "rna": "".join(parts)}
        for title, parts in records
    ]

In [21]:
import argparse
def parse_opt():
    """Assemble the pool-scoring settings from their built-in defaults.

    No command-line tokens are parsed (an empty argument list is supplied), so
    every option takes its default value.

    Returns:
        argparse.Namespace: the parsed options, including ``top_num``.
    """
    cli = argparse.ArgumentParser(description='Sampling Hyperparameters')

    option_specs = (
        # Training configuration
        ('--train-hyp', dict(default="/data6/sobhan/RLLM/hyps/train.yaml", type=str, help='Training hyperparameters')),
        ('--model-hyp', dict(default="/data6/sobhan/RLLM/hyps/t5.yaml", type=str, help='Model hyperparameters')),
        # Generation configuration
        ('--checkpoints', dict(default='/data6/sobhan/rllm/results/train/t5/run3_20240822-152114/checkpoints/checkpoint-349800', type=str, help='Load Model')),
        ('--eval-dir', dict(default="/data6/sobhan/RLLM/results/validation/pool", type=str, help='Output dir of the evaluation')),
        ('--proteins', dict(nargs='+', default=['hnrpnc', 'ago2', 'elavl1', 'rbm5'], type=str, help='List of protein names or IDs')),
        ('--rna_num', dict(default=128, type=int, help='Number of RNAs to generate per setting')),
        ('--top_num', dict(default=10, type=int, help='Number of TOP n RNAs to be selected from the Pool of RNAs')),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    # An empty list parses exactly like the empty string: argparse sees no tokens.
    return cli.parse_args([])


# Notebook-wide settings object read by the cells that follow.
args = parse_opt()

In [16]:

def calc_scores(candidates, phi_cons=0.3, phi2_mfe=0.1, phi3_bind=1, protein_name=None):
    """Annotate each candidate with binding, folding and consistency scores.

    The following keys are added to every candidate dict in place:

    * ``deepclip_score`` - first element of the candidate's DeepCLIP prediction.
    * ``MFE`` - value returned for the RNA by ``calculate_mfe_many``.
    * ``MFE_score`` - min-max normalised MFE, inverted so the lowest MFE in the
      pool scores 1 and the highest scores 0 (1.0 for all when every MFE is equal).
    * ``vote_score`` - occurrences of the RNA in the pool divided by the count of
      the most frequent RNA.
    * ``final_score`` - ``phi3_bind*deepclip_score + phi2_mfe*MFE_score +
      phi_cons*vote_score``.

    Args:
        candidates: list of dicts, each with an ``"rna"`` string.
        phi_cons: weight of the consistency (vote) score.
        phi2_mfe: weight of the normalised MFE score.
        phi3_bind: weight of the DeepCLIP binding score.
        protein_name: protein whose DeepCLIP model is used. When None, the
            notebook-level variable ``protein_name`` is used, as before.

    Returns:
        The same ``candidates`` list, with the score keys filled in.

    Raises:
        NameError: if ``protein_name`` is neither passed nor defined globally.
        ValueError: if the number of predictions or MFEs differs from the
            number of candidates.
    """
    # An empty pool has nothing to score (and min/max below would fail on it).
    if not candidates:
        return candidates

    if protein_name is None:
        # Backward compatibility: existing callers rely on the loop variable
        # `protein_name` defined at notebook level.
        protein_name = globals().get("protein_name")
        if protein_name is None:
            raise NameError("name 'protein_name' is not defined; pass protein_name=... to calc_scores")

    rnas = [cand['rna'] for cand in candidates]

    # Binding affinity. A copy is passed so `rnas` stays intact for the vote count.
    predictions = calc_deep_clip(list(rnas), protein_name)

    # Foldability
    mfes = calculate_mfe_many(list(rnas))
    if not (len(predictions) == len(mfes) == len(candidates)):
        raise ValueError(
            f"Expected {len(candidates)} predictions and MFEs, got "
            f"{len(predictions)} predictions and {len(mfes)} MFEs."
        )

    np_mfes = np.asarray(mfes, dtype=float)
    mfe_min = np_mfes.min()
    mfe_range = np_mfes.max() - mfe_min
    if mfe_range > 0:
        normalized_mfes = 1 - (np_mfes - mfe_min) / mfe_range
    else:
        # All MFEs are equal: every candidate ties with the minimum, which scores 1.
        # Dividing by the zero range would fill the scores with NaN.
        normalized_mfes = np.ones_like(np_mfes)

    # Consistency score
    vote_counts = Counter(rnas)
    max_vote = max(vote_counts.values())

    for cand, prediction, mfe, mfe_score in zip(candidates, predictions, mfes, normalized_mfes):
        cand["deepclip_score"] = prediction[0]
        cand["MFE"] = mfe
        cand["MFE_score"] = float(mfe_score)
        cand["vote_score"] = vote_counts[cand["rna"]] / max_vote
        cand["final_score"] = phi3_bind*cand["deepclip_score"] + phi2_mfe*cand["MFE_score"] + phi_cons*cand["vote_score"]

    return candidates



In [None]:
# Score every saved pool and keep the best `args.top_num` distinct RNAs per protein.
# `protein_name` is deliberately a notebook-level loop variable: calc_scores reads it
# as a global when calling calc_deep_clip.
for protein_name in args.proteins:
    # The sequence itself is not used below; the lookup only filters out proteins
    # that are missing from the CSV.
    protein_seq = read_protein_from_csv(protein_name, file_path="/data6/sobhan/dataset/proteins/protein_seqs.csv")
    print(protein_name)
    if protein_seq is None:
        continue
    pool_file = os.path.join(args.eval_dir, f"{protein_name}_pool.fasta")
    if os.path.exists(pool_file):
        candidates = read_fasta(pool_file) # read the pool file
        candidates = calc_scores(candidates) # add the score keys to every candidate
        sorted_candidates = sorted(candidates, key=lambda x: x["final_score"], reverse=True) # best final_score first

        # Remove the duplicated RNAs: walk the ranking, keep the first occurrence of
        # each RNA, and stop once top_num distinct RNAs are collected.
        seen = set()
        unique_candidates = []
        for cand in sorted_candidates:
            if cand["rna"] not in seen:
                unique_candidates.append(cand)
                seen.add(cand["rna"])
            if len(unique_candidates) >= args.top_num:
                break

        # Write the selection next to the pool file as `<protein>_filtered.fasta`;
        # each header carries the new rank, the original pool header and the final score.
        filtered_filename = os.path.join(os.path.dirname(pool_file), f"{protein_name}_filtered.fasta")
        with open(filtered_filename, "w") as f:
            for idx, cand in enumerate(unique_candidates):
                header = f">RNA_{idx}_{cand.get('header', 'NA')}_score_{cand['final_score']:.3f}"
                f.write(header + "\n")
                f.write(cand["rna"] + "\n")
    else:
        print(f"Pool file for {protein_name} not found.")



hnrpnc
DeepCLIP file:  /data6/sobhan/RLLM_OPT/deepclip_models/hnRPNC_GP.pkl
 Making prediction function ...

In [18]:
# `candidates` is reassigned inside the per-protein scoring loop, so this displays
# whichever pool was scored most recently.
candidates

[{'header': 'RNA_0_beam_search_num_beams_1',
  'rna': 'AUGGCCCUCCCGACACCUUCGGACAGC',
  'deepclip_score': 0.022768741437720605,
  'MFE': -4.099999904632568,
  'vote_score': 0.631578947368421,
  'final_score': 0.23122390635298748,
  'MFE_score': 0.18981480704740583},
 {'header': 'RNA_1_beam_search_num_beams_1',
  'rna': 'AUGAGCGCUAGCGGCCCCGAGGCUCCUGGGGACAUU',
  'deepclip_score': 0.0011872412820267903,
  'MFE': -11.699999809265137,
  'vote_score': 0.7894736842105263,
  'final_score': 0.29219601137220186,
  'MFE_score': 0.5416666482701717},
 {'header': 'RNA_2_beam_search_num_beams_1',
  'rna': 'UGGGGCGGGGCAGGGGCGCUCCCAGCC',
  'deepclip_score': 0.05052844881468488,
  'MFE': -10.899999618530273,
  'vote_score': 0.5263157894736842,
  'final_score': 0.2588861459624816,
  'MFE_score': 0.5046296030569146},
 {'header': 'RNA_3_beam_search_num_beams_1',
  'rna': 'CGGGGACGGGGGCACAAUCCCGGGGCCCCC',
  'deepclip_score': 0.09460351340439217,
  'MFE': -14.800000190734863,
  'vote_score': 0.473684210526315