In [None]:

import os
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
# Sequence length requested from the benchmark loader.
max_length = 2048

# Fetch the CAGE-prediction task; downloaded files are cached under cache_dir.
dataset = load_dataset(
    "InstaDeepAI/genomics-long-range-benchmark",
    task_name='cage_prediction',
    sequence_length=max_length,
    cache_dir='data/genomic_long_range',
    trust_remote_code=True,
)

# Pull out the three splits in one go (generator variables do not leak into
# the notebook namespace).
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Columns of the test split that the evaluation cells iterate over.
all_seqs, all_labels, all_Chromosome = (
    test_dataset[column] for column in ('sequence', 'labels', 'chromosome')
)


In [None]:
# Checkpoint file for the HyenaDNA evaluation; a later cell reads its
# 'state_dict' entry via torch.load(hyena_path).
hyena_path='outputs/2024-04-28/07-05-57-202372/checkpoints/val/pearsonr_cage.ckpt'

# NOTE(review): batch_size is not referenced by any visible cell; the
# evaluation loops feed one sequence at a time. Confirm whether it is still needed.
batch_size=1


import torch
from sklearn.metrics import f1_score, roc_auc_score
from scipy import stats
def pearsonr_cage(outs, y, len_batch=None):
    """Mean Pearson correlation between predictions and targets.

    One Pearson r is computed for every (sample, channel) pair, correlating
    the values along the middle axis, i.e. ``y[j, :, i]`` against
    ``outs[j, :, i]``. The result is the mean of all those coefficients with
    NaN entries ignored (scipy returns NaN when one of the two vectors is
    constant).

    Args:
        outs: torch tensor of predictions indexed as [sample, position, channel].
        y: torch tensor of targets with the same layout as ``outs``.
        len_batch: unused; kept so existing callers that pass it keep working.

    Returns:
        The NaN-ignoring mean of the per-(sample, channel) Pearson r values.
        It is NaN if every individual coefficient is NaN.
    """
    # Detach and copy to host memory once. The previous version did a
    # device-to-host transfer per (sample, channel) pair inside the loops and
    # only detached `outs`, so a `y` that required grad crashed on `.numpy()`.
    outs_np = outs.detach().cpu().numpy()
    y_np = y.detach().cpu().numpy()

    metrics = []
    # Channel-major order, same traversal as before.
    for i in range(outs_np.shape[-1]):
        for j in range(outs_np.shape[0]):
            r = stats.pearsonr(y_np[j, :, i], outs_np[j, :, i])[0]
            metrics.append(r)

    # Constant vectors give NaN correlations; leave them out of the average.
    return np.nanmean(metrics)

In [None]:
from transformers import AutoTokenizer,AutoModel
from torch import nn
from einops.layers.torch import Rearrange

# Location handed to AutoModel.from_pretrained below; despite the name this is
# a path string, not a state dict.
state_dict='/weight/hyenadna/hyenadna-large-1m-seqlen'
# Input width of the nn.Linear head in hyena_model.
# Presumably the backbone's hidden size — TODO confirm against the model config.
d_model=256
# Same value as the one used when the dataset was loaded; the HyenaDNA
# evaluation cell reads it as the tokenizer padding/truncation length.
max_length=2048

class hyena_model(nn.Module):
    """Pretrained backbone followed by mean pooling and a linear head.

    The backbone is loaded from the module-level ``state_dict`` path and the
    head maps ``d_model`` features to 218 outputs.
    """

    def __init__(self):
        super().__init__()
        # Backbone is placed on the GPU at construction time.
        self.backbone = AutoModel.from_pretrained(
            state_dict, trust_remote_code=True
        ).to('cuda')
        self.output_transform = nn.Linear(d_model, 218)
        # Registered but not used by forward().
        self.linear = nn.Linear(2048, 1)
        # Registered but not used by forward().
        self.activation = nn.Softplus()

    def forward(self, input_ids, mask=None):
        """Pool the backbone's last hidden state and project it.

        Args:
            input_ids: token ids passed straight to the backbone.
            mask: optional tensor; when given, its sum over the last axis
                minus one is used as the per-sample position at which the
                running mean is read. Without it the final position is used.

        Returns:
            The head's output with the pooled axis (dim 1) squeezed away.
        """
        features = self.backbone(input_ids).last_hidden_state

        # Running mean along the position axis: cumulative sum divided by
        # the 1-based position index.
        step_count = torch.arange(
            1, 1 + features.size(-2), device=features.device, dtype=features.dtype
        ).unsqueeze(-1)
        running_mean = torch.cumsum(features, dim=-2) / step_count

        if mask is None:
            # Mean over the whole sequence = running mean at the last position.
            pooled = running_mean[..., -1:, :]
        else:
            # 0-based index of the last counted position for each sample.
            last_index = (torch.sum(mask, dim=-1).squeeze() - 1).type(torch.int64)
            sample_index = torch.arange(features.size(0))
            # unsqueeze restores the singleton position axis.
            pooled = running_mean[sample_index, last_index, :].unsqueeze(1)

        projected = self.output_transform(pooled)
        return projected.squeeze(1)



# Read the 'state_dict' entry of a saved checkpoint.
# NOTE(review): this is an absolute '/outputs/...' path to a 'spearmanr.ckpt',
# whereas hyena_path above is a relative 'outputs/...' path to a different run — confirm.
checkpoint=torch.load('/outputs/2024-05-09/04-54-29-787887/checkpoints/val/spearmanr.ckpt')['state_dict']
# Strip wrapper prefixes in place. Keys such as "model.backbone.*" and
# "decoder.0.output_transform.*" would then line up with the `backbone` and
# `output_transform` attributes of hyena_model — presumably that is the
# checkpoint layout; verify against the actual keys.
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
            checkpoint, "model."
        )
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
            checkpoint, "decoder.0."
        )
# hyena_model.__init__ already puts the backbone on the GPU; this moves the head as well.
hyena=hyena_model().to('cuda')
# strict=False: missing or unexpected keys do not raise; they are reported in
# the returned value, which is what this cell displays.
hyena.load_state_dict(checkpoint,strict=False)



In [None]:
from transformers import AutoTokenizer,AutoModel
import torch
from torch import nn


# Run the HyenaDNA backbone plus a linear head over the test sequences and
# print a running Pearson score every 1000 sequences.
# NOTE(review): this cell rebinds the name `hyena_model` from the nn.Module
# class defined earlier to a backbone instance, so re-running the earlier
# `hyena_model()` cell afterwards will fail.
with torch.no_grad():
    state_dict = '/weight/hyenadna/hyenadna-large-1m-seqlen'
    hyena_tokenizer = AutoTokenizer.from_pretrained(state_dict, trust_remote_code=True)
    hyena_model = AutoModel.from_pretrained(state_dict, trust_remote_code=True).to('cuda')
    full_sequence = []

    checkpoint = torch.load(hyena_path)['state_dict']
    # Strip the wrapper prefixes in place so backbone keys and head keys can
    # be picked up by the two load_state_dict calls below.
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "model.backbone."
    )
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "decoder.0.output_transform."
    )

    hyena_decoder = nn.Linear(256, 3).to('cuda')

    # strict=False: both modules load from the same dict and do not raise on
    # keys they do not recognise or are missing.
    hyena_model.load_state_dict(checkpoint, strict=False)
    hyena_decoder.load_state_dict(checkpoint, strict=False)
    hyena_model.eval()
    hyena_decoder.eval()

    target_list = []
    seq_list = []
    for i in range(len(all_seqs)):
        # max_length is whatever an earlier cell last assigned.
        sequence_encoded = hyena_tokenizer(
            all_seqs[i],
            add_special_tokens=False,  # this is what controls adding eos
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        seq_ids = torch.LongTensor(sequence_encoded['input_ids'])
        target = all_labels[i][:1000]

        seqs = torch.reshape(seq_ids, (1, max_length)).to('cuda')
        target_list.append(target)
        hidden_states = hyena_model(input_ids=seqs).last_hidden_state
        # Keep at most the first 1000 positions, mirroring the target slice.
        hidden_states = hidden_states[..., :1000, :]
        out1 = hyena_decoder(hidden_states)
        out1_hyena = out1.squeeze(1).squeeze(0).cpu().detach().numpy()
        seq_list.append(out1_hyena)

        # Report every 1000 sequences. The stacked arrays are only built here,
        # not on every iteration, which would be quadratic in the dataset size.
        if i >= 1 and i % 1000 == 0:
            seq_list_numpy = np.array(seq_list)
            target_list_numpy = np.array(target_list)
            seq_list_tensor = torch.FloatTensor(seq_list_numpy)
            target_list_tensor = torch.FloatTensor(target_list_numpy)
            print(i)
            # `pr_auc` was never defined in this notebook; pearsonr_cage is
            # the metric defined above and takes (outs, y) in this order.
            pearsonr = pearsonr_cage(seq_list_tensor, target_list_tensor)
            print(pearsonr)

    # Leave the full stacked predictions/targets available to later cells,
    # as the per-iteration conversion used to.
    seq_list_numpy = np.array(seq_list)
    target_list_numpy = np.array(target_list)
    seq_list_tensor = torch.FloatTensor(seq_list_numpy)
    target_list_tensor = torch.FloatTensor(target_list_numpy)
            

#plot the bar plot of the pearsonr


In [None]:
from transformers import AutoTokenizer,AutoModel
import torch
from torch import nn


# Run the DNABERT-2 backbone plus a linear head over the test sequences and
# print a running Pearson score every 1000 sequences.
max_length = 3000
with torch.no_grad():
    state_dict = '/weight/dnabert2'
    bert2_tokenizer = AutoTokenizer.from_pretrained(state_dict, trust_remote_code=True)
    bert2_model = AutoModel.from_pretrained(state_dict, trust_remote_code=True).to('cuda')
    full_sequence = []

    # NOTE(review): `bert2_path` is not assigned in any visible cell — it must
    # be defined before this cell runs.
    checkpoint = torch.load(bert2_path)['state_dict']
    # Strip the wrapper prefixes in place so backbone keys and head keys can
    # be picked up by the two load_state_dict calls below.
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "model.backbone."
    )
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "decoder.0.output_transform."
    )

    bert2_decoder = nn.Linear(768, 3).to('cuda')

    # strict=False: both modules load from the same dict and do not raise on
    # keys they do not recognise or are missing.
    bert2_model.load_state_dict(checkpoint, strict=False)
    bert2_decoder.load_state_dict(checkpoint, strict=False)
    bert2_model.eval()
    bert2_decoder.eval()

    target_list = []
    seq_list = []
    for i in range(len(all_seqs)):
        sequence_encoded = bert2_tokenizer(
            all_seqs[i],
            add_special_tokens=False,  # this is what controls adding eos
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        seq_ids = torch.LongTensor(sequence_encoded['input_ids'])
        target = all_labels[i][:1000]

        seqs = torch.reshape(seq_ids, (1, max_length)).to('cuda')
        target_list.append(target)
        # NOTE(review): the keyword is spelled `export_hidden_states`; the
        # other cells use `output_hidden_states`. Left as-is because the first
        # element of the return value is what is consumed — confirm the remote
        # model code accepts this argument.
        hidden_states = bert2_model(input_ids=seqs, export_hidden_states=True)[0]
        # Keep at most the first 1000 positions, mirroring the target slice.
        hidden_states = hidden_states[..., :1000, :]
        out1 = bert2_decoder(hidden_states)
        out1_bert2 = out1.squeeze(1).squeeze(0).cpu().detach().numpy()
        seq_list.append(out1_bert2)

        # Report every 1000 sequences. The stacked arrays are only built here,
        # not on every iteration, which would be quadratic in the dataset size.
        if i >= 1 and i % 1000 == 0:
            seq_list_numpy = np.array(seq_list)
            target_list_numpy = np.array(target_list)
            seq_list_tensor = torch.FloatTensor(seq_list_numpy)
            target_list_tensor = torch.FloatTensor(target_list_numpy)
            print(i)
            # `pr_auc` was never defined in this notebook; pearsonr_cage is
            # the metric defined above and takes (outs, y) in this order.
            pearsonr = pearsonr_cage(seq_list_tensor, target_list_tensor)
            print(pearsonr)

    # Leave the full stacked predictions/targets available to later cells,
    # as the per-iteration conversion used to.
    seq_list_numpy = np.array(seq_list)
    target_list_numpy = np.array(target_list)
    seq_list_tensor = torch.FloatTensor(seq_list_numpy)
    target_list_tensor = torch.FloatTensor(target_list_numpy)
            

#plot the bar plot of the pearsonr


In [None]:
from transformers import AutoTokenizer,AutoModel
import torch
from torch import nn


# Run the GENA-LM backbone plus a linear head over the test sequences and
# print a running Pearson score every 1000 sequences.
max_length = 3000
with torch.no_grad():
    state_dict = '/weight/genalm/gena-lm-bigbird-base-t2t'
    genalm_tokenizer = AutoTokenizer.from_pretrained(state_dict, trust_remote_code=True)
    genalm_model = AutoModel.from_pretrained(state_dict, trust_remote_code=True).to('cuda')
    full_sequence = []

    # NOTE(review): `genalm_path` is not assigned in any visible cell — it
    # must be defined before this cell runs.
    checkpoint = torch.load(genalm_path)['state_dict']
    # Strip the wrapper prefixes in place so backbone keys and head keys can
    # be picked up by the two load_state_dict calls below.
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "model.backbone."
    )
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "decoder.0.output_transform."
    )

    genalm_decoder = nn.Linear(768, 3).to('cuda')

    # strict=False: both modules load from the same dict and do not raise on
    # keys they do not recognise or are missing.
    genalm_model.load_state_dict(checkpoint, strict=False)
    genalm_decoder.load_state_dict(checkpoint, strict=False)
    genalm_model.eval()
    genalm_decoder.eval()

    target_list = []
    seq_list = []
    for i in range(len(all_seqs)):
        sequence_encoded = genalm_tokenizer(
            all_seqs[i],
            add_special_tokens=False,  # this is what controls adding eos
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        seq_ids = torch.LongTensor(sequence_encoded['input_ids'])
        target = all_labels[i][:1000]

        seqs = torch.reshape(seq_ids, (1, max_length)).to('cuda')
        target_list.append(target)
        # Last entry of the returned hidden_states sequence.
        hidden_states = genalm_model(input_ids=seqs, output_hidden_states=True).hidden_states[-1]
        # Keep at most the first 1000 positions, mirroring the target slice.
        hidden_states = hidden_states[..., :1000, :]
        out1 = genalm_decoder(hidden_states)
        out1_genalm = out1.squeeze(1).squeeze(0).cpu().detach().numpy()
        seq_list.append(out1_genalm)

        # Report every 1000 sequences. The stacked arrays are only built here,
        # not on every iteration, which would be quadratic in the dataset size.
        if i >= 1 and i % 1000 == 0:
            seq_list_numpy = np.array(seq_list)
            target_list_numpy = np.array(target_list)
            seq_list_tensor = torch.FloatTensor(seq_list_numpy)
            target_list_tensor = torch.FloatTensor(target_list_numpy)
            print(i)
            # `pr_auc` was never defined in this notebook; pearsonr_cage is
            # the metric defined above and takes (outs, y) in this order.
            pearsonr = pearsonr_cage(seq_list_tensor, target_list_tensor)
            print(pearsonr)

    # Leave the full stacked predictions/targets available to later cells,
    # as the per-iteration conversion used to.
    seq_list_numpy = np.array(seq_list)
    target_list_numpy = np.array(target_list)
    seq_list_tensor = torch.FloatTensor(seq_list_numpy)
    target_list_tensor = torch.FloatTensor(target_list_numpy)
            

#plot the bar plot of the pearsonr


In [None]:
from transformers import AutoTokenizer,AutoModel,AutoModelForMaskedLM
import torch
from torch import nn


# Run the Nucleotide Transformer plus a linear head over the test sequences
# and print a running Pearson score every 1000 sequences.
max_length = 3000
with torch.no_grad():
    state_dict = 'weight/nt/nucleotide-transformer-v2-500m-multi-species'
    nt_tokenizer = AutoTokenizer.from_pretrained(state_dict, trust_remote_code=True)
    nt_model = AutoModelForMaskedLM.from_pretrained(state_dict, trust_remote_code=True).to('cuda')
    full_sequence = []

    # NOTE(review): `NT_path` is not assigned in any visible cell — it must be
    # defined before this cell runs.
    checkpoint = torch.load(NT_path)['state_dict']
    # Strip the wrapper prefixes in place so backbone keys and head keys can
    # be picked up by the two load_state_dict calls below.
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "model.backbone."
    )
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "decoder.0.output_transform."
    )

    nt_decoder = nn.Linear(1024, 3).to('cuda')

    # strict=False: both modules load from the same dict and do not raise on
    # keys they do not recognise or are missing.
    nt_model.load_state_dict(checkpoint, strict=False)
    nt_decoder.load_state_dict(checkpoint, strict=False)
    nt_model.eval()
    nt_decoder.eval()

    target_list = []
    seq_list = []
    for i in range(len(all_seqs)):
        sequence_encoded = nt_tokenizer(
            all_seqs[i],
            add_special_tokens=False,  # this is what controls adding eos
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        seq_ids = torch.LongTensor(sequence_encoded['input_ids'])
        target = all_labels[i][:1000]

        seqs = torch.reshape(seq_ids, (1, max_length)).to('cuda')
        target_list.append(target)
        # Last entry of the returned 'hidden_states' sequence.
        hidden_states = nt_model(input_ids=seqs, output_hidden_states=True)['hidden_states'][-1]
        # Keep at most the first 1000 positions, mirroring the target slice.
        hidden_states = hidden_states[..., :1000, :]
        out1 = nt_decoder(hidden_states)
        out1_nt = out1.squeeze(1).squeeze(0).cpu().detach().numpy()
        seq_list.append(out1_nt)

        # Report every 1000 sequences. The stacked arrays are only built here,
        # not on every iteration, which would be quadratic in the dataset size.
        if i >= 1 and i % 1000 == 0:
            seq_list_numpy = np.array(seq_list)
            target_list_numpy = np.array(target_list)
            seq_list_tensor = torch.FloatTensor(seq_list_numpy)
            target_list_tensor = torch.FloatTensor(target_list_numpy)
            print(i)
            # `pr_auc` was never defined in this notebook; pearsonr_cage is
            # the metric defined above and takes (outs, y) in this order.
            pearsonr = pearsonr_cage(seq_list_tensor, target_list_tensor)
            print(pearsonr)

    # Leave the full stacked predictions/targets available to later cells,
    # as the per-iteration conversion used to.
    seq_list_numpy = np.array(seq_list)
    target_list_numpy = np.array(target_list)
    seq_list_tensor = torch.FloatTensor(seq_list_numpy)
    target_list_tensor = torch.FloatTensor(target_list_numpy)
            

#plot the bar plot of the pearsonr


In [None]:
from transformers import AutoTokenizer,AutoModel
import torch
from torch import nn


# Run the Caduceus backbone plus a linear head over the test sequences and
# print a running Pearson score every 1000 sequences.
max_length = 3000
with torch.no_grad():
    state_dict = 'weight/mamba/caduceus-ph_seqlen-131k_d_model-256_n_layer-16'
    mamba_tokenizer = AutoTokenizer.from_pretrained(state_dict, trust_remote_code=True)
    mamba_model = AutoModel.from_pretrained(state_dict, trust_remote_code=True).to('cuda')
    full_sequence = []

    # NOTE(review): `mamba_path` is not assigned in any visible cell — it must
    # be defined before this cell runs.
    checkpoint = torch.load(mamba_path)['state_dict']
    # Strip the wrapper prefixes in place so backbone keys and head keys can
    # be picked up by the two load_state_dict calls below.
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "model.backbone."
    )
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        checkpoint, "decoder.0.output_transform."
    )

    mamba_decoder = nn.Linear(256, 3).to('cuda')

    # strict=False: both modules load from the same dict and do not raise on
    # keys they do not recognise or are missing.
    mamba_model.load_state_dict(checkpoint, strict=False)
    mamba_decoder.load_state_dict(checkpoint, strict=False)
    mamba_model.eval()
    mamba_decoder.eval()

    target_list = []
    seq_list = []
    for i in range(len(all_seqs)):
        sequence_encoded = mamba_tokenizer(
            all_seqs[i],
            add_special_tokens=False,  # this is what controls adding eos
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        seq_ids = torch.LongTensor(sequence_encoded['input_ids'])
        target = all_labels[i][:1000]

        seqs = torch.reshape(seq_ids, (1, max_length)).to('cuda')
        target_list.append(target)
        hidden_states = mamba_model(seqs, output_hidden_states=True).last_hidden_state
        # Keep at most the first 1000 positions, mirroring the target slice.
        hidden_states = hidden_states[..., :1000, :]
        out1 = mamba_decoder(hidden_states)
        out1_mamba = out1.squeeze(1).squeeze(0).cpu().detach().numpy()
        seq_list.append(out1_mamba)

        # Report every 1000 sequences. The stacked arrays are only built here,
        # not on every iteration, which would be quadratic in the dataset size.
        if i >= 1 and i % 1000 == 0:
            seq_list_numpy = np.array(seq_list)
            target_list_numpy = np.array(target_list)
            seq_list_tensor = torch.FloatTensor(seq_list_numpy)
            target_list_tensor = torch.FloatTensor(target_list_numpy)
            print(i)
            # `pr_auc` was never defined in this notebook; pearsonr_cage is
            # the metric defined above and takes (outs, y) in this order.
            pearsonr = pearsonr_cage(seq_list_tensor, target_list_tensor)
            print(pearsonr)

    # Leave the full stacked predictions/targets available to later cells,
    # as the per-iteration conversion used to.
    seq_list_numpy = np.array(seq_list)
    target_list_numpy = np.array(target_list)
    seq_list_tensor = torch.FloatTensor(seq_list_numpy)
    target_list_tensor = torch.FloatTensor(target_list_numpy)
            

#plot the bar plot of the pearsonr
