In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import PreTrainedTokenizerFast

# Toy 0/1 mask with two rows of length 10.
tensor = torch.tensor(
    [
        [1, 1, 1, 0, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 1, 1, 0, 0, 0, 0],
    ],
    dtype=torch.int64,
)
print(tensor)

# Min-pooling trick: max-pooling the negated values and negating the result
# yields the minimum of every window of two, so a pooled position stays 1 only
# when both source positions were 1.
negated = -tensor.to(torch.float32)
pooled = F.max_pool1d(negated, kernel_size=2, stride=2, padding=0)
tensor = -pooled.to(torch.int64)
print(tensor)

tensor([[1, 1, 1, 0, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 1, 1, 0, 0, 0, 0]])
tensor([[1, 0, 1, 1, 0],
        [1, 0, 1, 0, 0]])


In [2]:
# Toy 0/1 mask with two rows of length 6.
tensor = torch.tensor(
    [
        [1, 0, 0, 1, 0, 0],
        [1, 0, 1, 0, 0, 1],
    ],
    dtype=torch.int64,
)
print(tensor)

# Nearest-neighbour upsampling by 2 repeats every entry twice. A singleton
# dimension is inserted at index 1 before interpolating and dropped afterwards.
expanded = tensor.to(torch.float32)[:, None, :]
upsampled = F.interpolate(expanded, scale_factor=2, mode="nearest")
tensor = upsampled.squeeze(1).to(torch.int64)
print(tensor)

tensor([[1, 0, 0, 1, 0, 0],
        [1, 0, 1, 0, 0, 1]])
tensor([[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
        [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1]])


In [3]:
# NOTE: absolute, machine-specific path. The recorded run of this cell failed
# with an OSError because the folder was not found, so every later cell that
# uses `tokenizer` needs this path to exist on the current machine.
tokenizer_dir = '/home/kkj/ProtDiffusion/ProtDiffusion/tokenizer/tokenizer_v4.1'
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)

OSError: Incorrect path_or_model_id: '/home/kkj/ProtDiffusion/ProtDiffusion/tokenizer/tokenizer_v4.1'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
# Tokenize a single hand-written example that already carries the bos/eos
# brackets and '-' padding characters.
tokenizer_options = dict(
    padding=True,
    truncation=False,  # sequences are truncated beforehand
    return_token_type_ids=False,
    return_attention_mask=True,  # the attention mask is returned with the ids
    return_tensors="pt",
)
tokenized = tokenizer('-[ACDFGDIGDE]---', **tokenizer_options)
print(tokenized)

In [None]:
# Inspect which dtype the tokenizer uses for the attention mask.
attention_mask_dtype = tokenized['attention_mask'].dtype
print(attention_mask_dtype)

In [None]:
import random

# randint is inclusive at both ends, so a (0, 0) range is valid and yields 0.
random.randint(0, 0)

In [21]:
import numpy as np

def round_length(length: int, pad: int = 2, rounding: int = 16) -> int:
    '''
    Round ``length + pad`` up to the next multiple of ``rounding``.

    Args:
        length: Length of the raw sequence.
        pad: Extra positions added to ``length`` before rounding (default 2).
        rounding: The result is a multiple of this value (default 16).

    Returns:
        For a positive ``rounding``, the smallest multiple of ``rounding``
        that is greater than or equal to ``length + pad``.

    Raises:
        ZeroDivisionError: If ``rounding`` is 0.
    '''
    total = length + pad
    # Ceiling division without leaving integer arithmetic:
    # -(-a // b) == ceil(a / b). This avoids the float round-trip of
    # np.ceil(a / b), which loses precision for very large integers.
    # int() keeps the return type an int even if a float slips in.
    return int(-(-total // rounding) * rounding)

def process_sequence(sequence: str,
                     bos_token: str = "[",
                     eos_token: str = "]",
                     pad_token: str = "-",
) -> str:
    '''
    Wrap the sequence in bos/eos tokens and pad it at a random offset.

    The target length is ``round_length(len(sequence))``, i.e. it is computed
    from the raw sequence with ``round_length``'s default arguments. The
    padding needed to reach that length is split at random between the left
    and the right side of the wrapped sequence.

    Args:
        sequence: The raw sequence, without bos/eos tokens.
        bos_token: String prepended to the sequence.
        eos_token: String appended to the sequence.
        pad_token: String repeated on both sides to fill up to the target length.

    Returns:
        The padded string only; the length is not returned.

    Raises:
        ValueError: If the wrapped sequence is longer than the target length
            (``random.randint`` is then called on an empty range).
    '''
    # Imported locally so the function does not depend on another notebook
    # cell having imported `random` first.
    import random

    seq_len = round_length(len(sequence))
    sequence = bos_token + sequence + eos_token
    len_diff = seq_len - len(sequence)
    # Number of pad tokens placed on the left; the remainder goes on the right.
    rand_int = random.randint(0, len_diff)
    sequence = pad_token * rand_int + sequence + pad_token * (len_diff - rand_int)

    return sequence

In [None]:
# Try the padding helper on a 13-character example; the result is displayed.
process_sequence(sequence='ACDFGDIGDEIGH')

In [None]:
from ProtDiffusion.models.dit_transformer_1d import DiTTransformer1DModel
from ProtDiffusion.training_utils import count_parameters

# Hyperparameters of the 1D DiT transformer, gathered in one place.
dit_config = dict(
    num_attention_heads=8,
    attention_head_dim=72,
    in_channels=64,
    num_layers=8,
    attention_bias=True,
    activation_fn="gelu-approximate",
    num_classes=2,
    upcast_attention=False,
    norm_type="ada_norm_zero",
    norm_elementwise_affine=False,
    norm_eps=1e-5,
    pos_embed_type="sinusoidal",
    num_positional_embeddings=1024,
    # RoPE: https://github.com/lucidrains/rotary-embedding-torch
    use_rope_embed=True,
)

# Build the model on the GPU, report its parameter count and switch it to
# training mode. `model.train()` is the last expression, so the notebook also
# displays its return value.
model = DiTTransformer1DModel(**dit_config).to('cuda')
count_parameters(model)
model.train()

In [None]:
# Dummy inputs for one forward pass. The tensors are created on the CPU and
# then moved, in the same order as before, so the random streams are unchanged.
n_batch, n_channels, n_positions = 16, 64, 1008  # max length is 1024

x = torch.randn(n_batch, n_channels, n_positions).to('cuda')  # batch, in channels, sequence length
m = torch.randint(0, 2, (n_batch, n_positions), dtype=torch.bool).to('cuda')  # random boolean mask
t = torch.randint(0, 1000, (n_batch,), dtype=torch.int64).to('cuda')  # timesteps; open question: is any int valid?
cl = torch.randint(0, 2, (n_batch,), dtype=torch.int64).to('cuda')  # class labels: only 0 and 1 are valid, 2 is a dropped label

out = model(x, m, t, cl)
print(out.sample.shape)

In [None]:
from ProtDiffusion.models.pipeline_protein import ProtDiffusionPipeline
from ProtDiffusion.models.autoencoder_kl_1d import AutoencoderKL1D
from diffusers.schedulers import DDPMScheduler

# Load the two pretrained VAE checkpoints (absolute, machine-specific paths).
vae_ce = AutoencoderKL1D.from_pretrained('/home/kkj/ProtDiffusion/output/protein-VAE-UniRef50_v18.1/pretrained/CE')
vae_ema = AutoencoderKL1D.from_pretrained('/home/kkj/ProtDiffusion/output/protein-VAE-UniRef50_v18.1/pretrained/EMA')

scheduler = DDPMScheduler()

# Assemble the sampling pipeline around the EMA VAE and move it to the GPU.
pipeline = ProtDiffusionPipeline(
    transformer=model,
    vae=vae_ema,
    scheduler=scheduler,
    tokenizer=tokenizer,
)
pipeline = pipeline.to('cuda')

# First run: two sequences of equal requested length, 2 inference steps.
# Each run gets its own generator seeded with 42.
first_run = pipeline(
    seq_len=[64, 64],
    class_labels=[0, 0],
    guidance_scale=4.0,
    num_inference_steps=2,
    generator=torch.Generator().manual_seed(42),
    output_type='aa_seq',
)
out1 = first_run.seqs

# Second run: two different requested lengths, 10 inference steps.
second_run = pipeline(
    seq_len=[64, 256],
    class_labels=[0, 0],
    guidance_scale=4.0,
    num_inference_steps=10,
    generator=torch.Generator().manual_seed(42),
    output_type='aa_seq',
)
out2 = second_run.seqs


In [None]:
# Print every generated sequence, first run first, one per line.
for generated in (out1, out2):
    for seq in generated:
        print(seq)

In [None]:
# Random latents of shape (2, 64, 8) plus a 0/1 mask of shape (2, 8); the
# second row masks out its last four positions.
latents = torch.randn(2, 64, 8).to('cuda')
attention_mask = torch.tensor(
    [
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0],
    ]
).to('cuda')

# Broadcast the mask over dimension 1 so masked positions become zero.
latents = latents * attention_mask[:, None, :]
print(latents[:, 0])

# Decode the masked latents with the EMA VAE on the GPU.
vae_ema = vae_ema.to('cuda')
decoded = vae_ema.decode(latents, attention_mask)

In [None]:
from ProtDiffusion.training_utils import logits_to_token_ids

# Convert the decoder output to token ids, decode them back to strings and
# print one sequence per line. `output` and `seqs` name the same object.
token_ids = logits_to_token_ids(decoded.sample, tokenizer)
output = seqs = tokenizer.batch_decode(token_ids)
for seq in seqs:
    print(seq)

In [10]:
import torch
# `one`: random values of shape (2, 64, 8).
# `two`: boolean mask of shape (2, 8); the second row has its last two entries False.
one = torch.randn(2, 64, 8)
two = torch.tensor(
    [
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0],
    ],
    dtype=torch.bool,
)

In [11]:
# Insert a singleton dimension at index 1 and expand the mask to the shape of
# `one`, then use it as a boolean index; the notebook displays the shape of the
# flattened selection.
# NOTE: `two` is rebound here, so this cell is not idempotent. Re-run the cell
# that defines `two` before executing this one a second time.
two = two[:, None, :].expand_as(one)
one[two].shape

torch.Size([896])

In [4]:
# Floor division discards the remainder: 10 // 6 evaluates to 1.
10 // 6

1