# Prepping for real run. 

Todo: 
* Define `max_source_length` and `max_target_length` for the model (otherwise truncated).
padding token should be replaced with -100, which is the 'ignore_index' of `CrossEntorpyLoss` in PT and TF. For Flax, use `decoder_attention_mask`. 
Attention_mask. ensures madding tokens of inputs are ignored. 

* Install apex. "model will automatically use apex.normalization.FusedRMSNorm instead of T5LayerNorm." The former uses an optimized fused kernel which is several times faster than the latter.

A note on model sizes: 
T5-11B (original, not v1.1) weights in float32 are 45.2GB. 
See this post for using huggingface endpoints on SINGLE GPU for cheap inference: https://www.philschmid.de/deploy-t5-11b
Uses mixed precision and sharding, and LLM.int8(). 

In [6]:
import torch
import numpy as np
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model, T5Config, AutoModelWithLMHead
import accelerate
import wandb
import math
import lovely_tensors as lt
lt.monkey_patch()
# !wandb login  -- reactivate later
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
from termcolor import colored, cprint
print(colored(f"👉 Starting to encode these text-captions: 69", "cyan", attrs=["reverse", "bold"]))

cprint("Attention!", "red", attrs=["bold"])

👉 Starting to encode these text-captions: 69
Attention!


In [2]:
'''
MODEL SELECTION

T5 V1.1 --  https://huggingface.co/docs/transformers/model_doc/t5v1.1 && https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511
small - base - large - 3b/xl - 11b/xxl

OG: t5-small

'google/t5-base-lm-adapt' # largest on my server (without float16)
'google/t5-xl-lm-adapt'

google/t5-v1_1-large
'''

# MODEL_SIZE = "t5-base"
MODEL_NAME = "google/t5-small-lm-adapt"
# config = T5Config.from_pretrained(MODEL_NAME)
t5 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, return_special_tokens_mask=True)
# low_cpu_mem_usage(bool, optional) — Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. experimental.

In [3]:
''' MODEL QUANTIZATION: fp16 '''

# XL worked on my server (tested both CPU and GPU).
# MODEL_NAME = "google/t5-base-lm-adapt"
# t5 = AutoModelWithLMHead.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
# tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, return_special_tokens_mask=True)
# low_cpu_mem_usage(bool, optional) — Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. experimental.

''' int8 -- my GPU doesn't support it.. '''
# t5 = AutoModelWithLMHead.from_pretrained(MODEL_NAME, load_in_8bit=True, device_map='auto', low_cpu_mem_usage=True)

" int8 -- my GPU doesn't support it.. "

# Moc Dataset

In [7]:
''' PREP EMBEDDING INPUTS '''
# shape = (batch_size, 'words', embedding_dim) -- here 'words' == each of our embeddings, like clip and language.
# one_input_shape = [6, 512, 512]
one_input_shape = [1, 768, 768]
att_mask_shape = [1, 768]

decoder_input_embeds_arr = np.random.rand( *one_input_shape ).astype(np.float16) # need fp32
decoder_input_embeds_arr = torch.from_numpy(decoder_input_embeds_arr).to(device)
input_embeds_arr = np.random.rand( *one_input_shape ).astype(np.float16)
input_embeds_arr = torch.from_numpy(input_embeds_arr).to(device)
attn_mask_arr = np.ones( att_mask_shape )
attn_mask_arr = torch.from_numpy(attn_mask_arr).to(device)

print(decoder_input_embeds_arr)
print(input_embeds_arr)
print(attn_mask_arr)

''' Decoder gets the tokenized caption. Shape is (batch_size, max_caption_length). Use padding to make it fit. '''
# WORKING example, but easier with numpy.
# import torch.nn.functional as F
# decoder_input_ids = tokenizer("This is the target output sentence, aka the video caption. I like tacos because they are so delicious.", return_tensors="pt").input_ids.to(device)
# decoder_input_ids = F.pad(decoder_input_ids, (0, (512-decoder_input_ids.shape[1])), value=tokenizer.pad_token_id)
# print(decoder_input_ids.shape)
# decoder_input_ids

labels = tokenizer("The cute dog did the things", return_tensors="pt").input_ids.to(device)
# labels = torch.from_numpy(np.random.randint(1, 10_000, size=(one_input_shape[0], one_input_shape[2]))).to(device)
print(labels)
# labels = torch.cat((labels, torch.ones((1, 512-7), dtype=int).to(device)), dim=1)
# print(labels)

# labels = torch.from_numpy(np.random.randint(1, 10_000, size=(one_input_shape[0], one_input_shape[2]))).to(device)

tensor[1, 512, 512] f16 n=262144 x∈[5.901e-06, 1.000] μ=0.499 σ=inf cuda:0
tensor[1, 512, 512] f16 n=262144 x∈[1.252e-06, 1.000] μ=0.500 σ=inf cuda:0
tensor[1, 512, 512] f64 n=262144 x∈[1.000, 1.000] μ=1.000 σ=0. cuda:0
tensor[1, 7] i64 x∈[1, 5295] μ=1.130e+03 σ=1.940e+03 cuda:0 [[37, 5295, 1782, 410, 8, 378, 1]]


In [6]:

# Some shapes
# In transformer stack layer: 
# input embeds:  torch.Size([1, 512, 512])
# batch size, seq length:  1 512
# mask seq length:  512
# attnetion mask:  tensor[1, 512] x∈[1.000, 1.000] μ=1.000 σ=0. cuda:0
# past key values:  [None, None, None, None, None, None, None, None]
# extended attention mask:  tensor[1, 1, 1, 512] f16 [38;2;127;127;127mall_zeros[0m cuda:0
# head_mask:  [None, None, None, None, None, None, None, None]
# cross_attn_head_mask:  [None, None, None, None, None, None, None, None]
# present key value states:  None
# all hidden states:  None
# all attentions:  None
# all cross attentions:  None
# hidden states:  tensor[1, 512, 512] f16 n=262144 x∈[0., 1.111] μ=0.499 σ=inf cuda:0
# final hidden states:  tensor[1, 512, 512] f16 n=262144 x∈[-0.878, 0.658] μ=3.207e-05 σ=0.147 grad NativeDropoutBackward0 cuda:0
# encoder outputs:  BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor[1, 512, 512] f16 n=262144 x∈[-0.878, 0.658] μ=3.207e-05 σ=0.147 grad NativeDropoutBackward0 cuda:0, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)
# hidden_states:  tensor[1, 512, 512] f16 n=262144 x∈[-0.878, 0.658] μ=3.207e-05 σ=0.147 grad NativeDropoutBackward0 cuda:0


# In transformer stack layer: 
# input embeds:  torch.Size([1, 7, 512])
# batch size, seq length:  1 7
# mask seq length:  7
# attnetion mask:  tensor[1, 7] x∈[1.000, 1.000] μ=1.000 σ=0. cuda:0 [[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000]]
# encoder_seq_length:  512
# encoder attention mask:  tensor[1, 512] i64 x∈[1, 1] μ=1.000 σ=0. cuda:0
# past key values:  [None, None, None, None, None, None, None, None]
# extended attention mask:  tensor[1, 1, 7, 7] f16 n=49 x∈[-6.550e+04, 0.] μ=-2.808e+04 σ=inf cuda:0
# poggers
# head_mask:  [None, None, None, None, None, None, None, None]
# cross_attn_head_mask:  [None, None, None, None, None, None, None, None]
# present key value states:  ()
# all hidden states:  None
# all attentions:  None
# all cross attentions:  None
# hidden states:  tensor[1, 7, 512] f16 n=3584 x∈[-160.000, 80.562] μ=-0.429 σ=inf grad NativeDropoutBackward0 cuda:0
# final hidden states:  tensor[1, 7, 512] f16 n=3584 x∈[-28.641, 17.469] μ=0.020 σ=0.911 grad NativeDropoutBackward0 cuda:0

# sequence outputs:  tensor[1, 7, 512] f16 n=3584 x∈[-28.641, 17.469] μ=0.020 σ=0.911 grad NativeDropoutBackward0 cuda:0
# lm logits:  tensor[1, 7, 32128] f16 n=224896 x∈[-37.438, 6.746] μ=-13.016 σ=inf grad UnsafeViewBackward0 cuda:0
# tensor f16 grad NllLossBackward0 cuda:0 7.281  

## Train function

T5 forward() docs: https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration.forward

Todo: investigate difference between decoder `decoder_input_ids` and `lm_labels`.
For example: 
```
outputs = t5(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
```

I think `loss.sum()` is for multi-iteration loss. I was inadverdently using it 6 batches.
https://discuss.pytorch.org/t/loss-backward-raises-error-grad-can-be-implicitly-created-only-for-scalar-outputs/12152 
loss.backward() # T5 RuntimeError: grad can be implicitly created only for scalar outputs

Variables and their purpose:

`extended_attention_mask` - just the original attention mask provided, broadcast to a new shape (decoder uses causal mask, encoder uses regular mask)

`attention_mask` - attention mask used in the regular attention layer 

`encoder_attention_mask` - attention mask used in the cross-attention layer

`position_bias` - always None, position bias used in attention block

`encoder_decoder_position_bias` - always None, position bias used in the cross-attention layer

`layer_head_mask` - always None, masks certain heads in the attention block (T5Attention)

`cross_attn_layer_head_mask` - always None, masks certain heads in the attention block (T5Attention) and is used specifically for the cross-attention module in the decoder

`past_key_value` - always None, if specified then uses the previous key value states from the previous attention blocks in the query, key, value projection layers

`output_attentions` - True/False, and outputs the attention weights in addition to the rest of the outputs in the result tuple

In [8]:
t5.train()


# outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, decoder_inputs_embeds=input_embeds_arr)
outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, labels=labels)
# outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
loss = outputs[0]
loss.shape
print(loss)

----------------------------------------------------------------------------------------------------
In transformer stack layer: 
input embeds:  torch.Size([1, 512, 512])
batch size, seq length:  1 512
mask seq length:  512
past key values:  [None, None, None, None, None, None, None, None]
extended attention mask:  tensor[1, 1, 512, 512] f16 [38;2;127;127;127mall_zeros[0m cuda:0
head_mask:  [None, None, None, None, None, None, None, None]
cross_attn_head_mask:  [None, None, None, None, None, None, None, None]
present key value states:  None
all hidden states:  None
all attentions:  None
all cross attentions:  None
hidden states:  tensor[1, 512, 512] f16 n=262144 x∈[0., 1.111] μ=0.500 σ=inf cuda:0
layer moduel:  T5Block(
  (layer): ModuleList(
    (0): T5LayerSelfAttention(
      (SelfAttention): T5Attention(
        (q): Linear(in_features=512, out_features=384, bias=False)
        (k): Linear(in_features=512, out_features=384, bias=False)
        (v): Linear(in_features=512, out_fea

RuntimeError: The size of tensor a (7) must match the size of tensor b (512) at non-singleton dimension 2

In [15]:
''' backwards pass '''
optimizer = torch.optim.Adam(params =  t5.parameters(), lr=1e-4)
optimizer.zero_grad()
loss.sum().backward()
optimizer.step()
print("✅ Successful training iteration")

✅ Successful training iteration


## Loss Objectives

Incorporate loss objectives: MLM, caption-image matching

In [20]:
# t5.train()

# input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids.to(device)
# labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids.to(device)

# loss = t5(input_ids=input_ids, labels=labels).loss
# loss.item()

# from transformers import T5Tokenizer, T5ForConditionalGeneration

# input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids.to(device)
# labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids.to(device)

input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids

print(input_ids)
print(labels)

print(input_ids[0][1].item())

print(tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").attention_mask)



# the forward function automatically creates the correct decoder_input_ids
loss = t5(input_ids=input_ids, labels=labels).loss
loss.item()

# outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, decoder_inputs_embeds=input_embeds_arr)
# # outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, labels=labels)
# loss = outputs[0]
# loss.shape
# print(loss)

tensor[1, 11] i64 x∈[1, 13959] μ=1.894e+03 σ=4.120e+03
tensor[1, 6] i64 x∈[1, 19250] μ=4.121e+03 σ=7.619e+03 [[644, 4598, 229, 19250, 5, 1]]
1566
tensor[1, 11] i64 x∈[1, 1] μ=1.000 σ=0.


ZeroDivisionError: division by zero

In [None]:
''' minimal train '''
t5.train()

loss = outputs[0]


optimizer = torch.optim.Adam(params =  model.parameters(), lr=1e-4)
optimizer.zero_grad()
loss.backward()
optimizer.step()

## Example from 

In [None]:
# Creating the training function. This will be called in the main function. It is run depending on the epoch value.
# The model is put into train mode and then we enumerate over the training loader and passed to the defined network 

def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)  # FOR TPU
        # xm.mark_step()                # FOR TPU

In [None]:
def main():
    # WandB – Initialize a new run
    wandb.init(project="transformers_tutorials_summarization")

    # WandB – Config is a variable that holds and saves hyperparameters and inputs
    # Defining some key variables that will be used later on in the training  
    config = wandb.config          # Initialize config
    config.TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
    config.VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
    config.TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
    config.VAL_EPOCHS = 1 
    config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
    config.SEED = 42               # random seed (default: 42)
    config.MAX_LEN = 512
    config.SUMMARY_LEN = 150 

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(config.SEED) # pytorch random seed
    np.random.seed(config.SEED) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # tokenzier for encoding the text
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    

    # Importing and Pre-Processing the domain data
    # Selecting the needed columns only. 
    # Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
    df = pd.read_csv('./data/news_summary.csv',encoding='latin-1')
    df = df[['text','ctext']]
    df.ctext = 'summarize: ' + df.ctext
    print(df.head())

    
    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
    train_size = 0.8
    train_dataset=df.sample(frac=train_size,random_state = config.SEED)
    val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))


    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': config.TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    val_params = {
        'batch_size': config.VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)


    
    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model = model.to(device)

    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

    # Log metrics with wandb
    wandb.watch(model, log="all")
    # Training loop
    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)


    # Validation loop and saving the resulting file with predictions and acutals in a dataframe.
    # Saving the dataframe as predictions.csv
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv('./models/predictions.csv')
        print('Output Files generated for review')

if __name__ == '__main__':
    main()

In [12]:
shuffle_prob = 0.4
shuffle_offset = 16
num_chunks_in_group = 4
batch_size = 1024
num_chunks = 16
B = batch_size * (num_chunks // num_chunks_in_group)

In [8]:

def calculate_temporal_loss(shuffled_idx_img, B, batch_size=6, num_chunks_in_group=23, hidden_size = 768):
  def gelu(input_tensor):
      """Gaussian Error Linear Unit.
      This is a smoother version of the RELU.
      Original paper: https://arxiv.org/abs/1606.08415
      Args:
        input_tensor: float Tensor to perform activation.
      Returns:
        `input_tensor` with the GELU activation applied.
      """
      # math.sqrt needed for bfloat16 compatibility
      cdf = 0.5 * (1.0 + tf.compat.v1.erf(input_tensor / math.sqrt(2.0)))
      return input_tensor * cdf


  def layer_norm(input_tensor, name=None, epsilon=1e-5):
    """Run layer normalization on the last dimension of the tensor."""
    name2use = f'LayerNorm_{name}' if name is not None else name
    with tf.compat.v1.variable_scope(name2use, default_name='LayerNorm'):
        dim = input_tensor.shape[-1]
        gamma = tf.compat.v1.get_variable('gamma', [dim], initializer=tf.constant_initializer(1))
        beta = tf.compat.v1.get_variable('beta', [dim], initializer=tf.constant_initializer(0))

        cast_up_to_float32 = input_tensor.dtype == tf.bfloat16
        if cast_up_to_float32:
            input_tensor = tf.cast(input_tensor, dtype=tf.float32)

        mean, variance = tf.compat.v1.nn.moments(input_tensor, -1, keep_dims=True)
        scale_factor = tf.compat.v1.rsqrt(variance + epsilon) * gamma
        input_tensor = input_tensor * scale_factor - mean * scale_factor + beta
        if cast_up_to_float32:
            input_tensor = tf.cast(input_tensor, dtype=tf.bfloat16)
    return input_tensor
  
  is_easy_viz = tf.reshape(tf.less(shuffled_idx_img, 64), [B, num_chunks_in_group])

  is_easy = tf.logical_and(is_easy_viz[:, :, None], is_easy_viz[:, None])
  label_w = tf.cast(tf.logical_not(is_easy), dtype=tf.float32) * 0.99 + 0.01

  label_w = tf.reshape(label_w, [-1])

  xa = tf.random.uniform(shape=[batch_size, num_chunks_in_group, hidden_size])

  print("xa shape: ", xa.shape)

  xb = tf.random.uniform(shape=[batch_size, num_chunks_in_group, hidden_size])

  print("xb shape: ", xb.shape)

  print("xa none shape: ", xa[:, :, None].shape)
  print("xb none shape: ", xb[:, None].shape)

  xa_tile = tf.tile(xa[:, :, None], [1, 1, num_chunks_in_group, 1])

  print("xa tile shape: ", xa_tile.shape)

  xa_tile = tf.reshape(xa_tile, [batch_size, num_chunks_in_group ** 2, hidden_size])

  print("xa tile shape 2: ", xa_tile.shape)


  xb_tile = tf.tile(xb[:, None], [1, num_chunks_in_group, 1, 1])

  print("xb tile shape: ", xb_tile.shape)

  xb_tile = tf.reshape(xb_tile, [batch_size, num_chunks_in_group ** 2, hidden_size])

  print("xb tile shape 2: ", xb_tile.shape)

  h_joint = tf.concat([xa_tile, xb_tile], 2)

  print("h_joint shape: ", h_joint.shape)

  h_joint = tf.reshape(h_joint, [batch_size * (num_chunks_in_group ** 2), hidden_size * 2])

  print("h_joint shape 2: ", h_joint.shape)


  # Now do the MLP
  h0 = tf.compat.v1.layers.dense(
      h_joint,
      hidden_size,
      kernel_initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.02),
      name='intermediate',
      activation=gelu,
  )

  print("h0 shape: ", h0.shape)

  h0_ln = layer_norm(h0, 'ln0')
  logits = tf.compat.v1.layers.dense(
      h0_ln,
      4,
      kernel_initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.02),
      name='logits',
  )

  print("logits shape: ", logits.shape)



  num_chunks_in_group = 23
  video_src_ids = tf.random.uniform(shape=[batch_size, num_chunks_in_group])

  xa_idx = tf.tile(tf.range(num_chunks_in_group)[:, None], [1, num_chunks_in_group])
  xb_idx = tf.tile(tf.range(num_chunks_in_group)[None], [num_chunks_in_group, 1])

  # print("xa_idx: ", xa_idx)
  # print("xb_idx: ", xb_idx)

  # 1 if identical
  is_identical = tf.cast(tf.equal(xa_idx, xb_idx), dtype=tf.int32)
  # 2 if less
  is_less = 2 * tf.cast(tf.less(xa_idx, xb_idx), dtype=tf.int32)
  # 3 if greater
  is_greater = 3 * tf.cast(tf.greater(xa_idx, xb_idx), dtype=tf.int32)


  # print("is_identical: ", is_identical)
  # print("is_less: ", is_less)
  # print("is_greater: ", is_greater)

  # print("sum: ", is_identical + is_less + is_greater)

  video_src_ids = tf.reshape(video_src_ids, [batch_size, num_chunks_in_group])

  print("video src ids: ", video_src_ids.shape)
  print(video_src_ids[:, None].shape)
  print(video_src_ids[:, :, None].shape)

  is_same_video = tf.equal(video_src_ids[:, None], video_src_ids[:, :, None])

  print("is_same_video: ", is_same_video.shape)

  # # 0 if not the same video
  labels = tf.compat.v1.where_v2(
    is_same_video,
    is_identical + is_less + is_greater,
    tf.zeros_like(is_identical),
  )
  labels = tf.reshape(labels, [(batch_size*num_chunks_in_group ** 2)])
  # print("labels: ", labels)


  num_classes = logits.shape[-1]

  one_hot_labels = tf.one_hot(labels, depth=num_classes, dtype=logits.dtype)
  print("one-hot labels: ", one_hot_labels.shape)

  cls_logprobs = tf.nn.log_softmax(logits, axis=-1)

  raw_loss = -tf.reduce_sum(cls_logprobs * one_hot_labels, axis=-1) *label_w
  temporal_loss = tf.reduce_mean(raw_loss)
  print("temporal loss: ", temporal_loss)

  return temporal_loss

calculate_temporal_loss()

xa shape:  (6, 23, 768)
xb shape:  (6, 23, 768)
xa none shape:  (6, 23, 1, 768)
xb none shape:  (6, 1, 23, 768)
xa tile shape:  (6, 23, 23, 768)
xa tile shape 2:  (6, 529, 768)
xb tile shape:  (6, 23, 23, 768)
xb tile shape 2:  (6, 529, 768)
h_joint shape:  (6, 529, 1536)
h_joint shape 2:  (3174, 1536)
h0 shape:  (3174, 768)
logits shape:  (3174, 4)
video src ids:  (6, 23)
(6, 1, 23)
(6, 23, 1)
is_same_video:  (6, 23, 23)
one-hot labels:  (3174, 4)
temporal loss:  tf.Tensor(1.6404991, shape=(), dtype=float32)


  h0 = tf.compat.v1.layers.dense(
  logits = tf.compat.v1.layers.dense(


<tf.Tensor: shape=(), dtype=float32, numpy=1.6404991>

In [24]:
shuffle_prob = 0.4
shuffle_offset = 16
num_chunks_in_group = 4
batch_size = 1024
num_chunks = 16
B = batch_size * (num_chunks // num_chunks_in_group)

def generate_shuffled_idx_img(B, shuffle_prob=0.4, shuffle_offset=16, num_chunks_in_group=4):
    # first index in num shuffle per group corresponds to 60% of the time, no shuffling done
    num_shuffle_per_group_probs = [1.0 - shuffle_prob, 1e-6] + [
        shuffle_prob / (num_chunks_in_group - 1) for i in range(num_chunks_in_group - 1)] 

    print("num shuffle per group: ", num_shuffle_per_group_probs)

    ev = sum([i * p for i, p in enumerate(num_shuffle_per_group_probs)])

    print("ev: ", ev)
    # tf.logging.info(
    #     "probs: {}\nExpected # of {}s out of place: {:.3f}".format(num_shuffle_per_group_probs, k, ev))
    nspg_logprob = tf.math.log(num_shuffle_per_group_probs)[None]

    print("nspg log prob: ", nspg_logprob)

    # scramble i frames at random
    num_shuffle_img = tf.squeeze(tf.random.categorical(nspg_logprob, dtype=tf.int32, num_samples=B), 0)
    print("num_shuffle_img: ", num_shuffle_img)

    # boolean tensor of shape [B, num_chunks_in_group] where each elt in B is of the form [True, False, True, False]
    # describing the frames that should be scrambled
    do_shuffle_img = tf.less(tf.argsort(tf.compat.v1.random_uniform([B, num_chunks_in_group]), 1),
                            num_shuffle_img[:, None])

    print("do_shuffle_img: ", do_shuffle_img)

    shuffled_idx_img = tf.where(
        do_shuffle_img,
        shuffle_offset + tf.argsort(tf.compat.v1.random_uniform([B, num_chunks_in_group]), 1),
        tf.tile(tf.range(num_chunks_in_group)[None], [B, 1]),
    )

    print("shuffled_idx_img: ", shuffled_idx_img)

    features_shuffled_idx_img = tf.reshape(shuffled_idx_img, [-1])
    print("features_shuffle_idx_img: ", features_shuffled_idx_img.shape)
    print(features_shuffled_idx_img)

    return features_shuffled_idx_img

features_shuffled_idx_img= generate_shuffled_idx_img(B)

num shuffle per group:  [0.6, 1e-06, 0.13333333333333333, 0.13333333333333333, 0.13333333333333333]
ev:  1.2000009999999999
nspg log prob:  tf.Tensor([[ -0.5108256 -13.815511   -2.014903   -2.014903   -2.014903 ]], shape=(1, 5), dtype=float32)
num_shuffle_img:  tf.Tensor([0 0 0 ... 0 3 0], shape=(4096,), dtype=int32)
do_shuffle_img:  tf.Tensor(
[[False False False False]
 [False False False False]
 [False False False False]
 ...
 [False False False False]
 [False  True  True  True]
 [False False False False]], shape=(4096, 4), dtype=bool)
shuffled_idx_img:  tf.Tensor(
[[ 0  1  2  3]
 [ 0  1  2  3]
 [ 0  1  2  3]
 ...
 [ 0  1  2  3]
 [ 0 19 18 17]
 [ 0  1  2  3]], shape=(4096, 4), dtype=int32)
features_shuffle_idx_img:  (16384,)
tf.Tensor([0 1 2 ... 1 2 3], shape=(16384,), dtype=int32)


In [29]:
def get_shape_list(tensor, expected_rank=None, name=None):
    """Returns a list of the shape of tensor, preferring static dimensions.
    Args:
      tensor: A tf.Tensor object to find the shape of.
      expected_rank: (optional) int. The expected rank of `tensor`. If this is
        specified and the `tensor` has a different rank, and exception will be
        thrown.
      name: Optional name of the tensor for the error message.
    Returns:
      A list of dimensions of the shape of tensor. All static dimensions will
      be returned as python integers, and dynamic dimensions will be returned
      as tf.Tensor scalars.
    """
    if name is None and not tf.executing_eagerly():
        name = tensor.name

    # if expected_rank is not None:
    #     assert_rank(tensor, expected_rank, name)

    shape = tensor.shape.as_list()

    non_static_indexes = []
    for (index, dim) in enumerate(shape):
        if dim is None:
            non_static_indexes.append(index)

    if not non_static_indexes:
        return shape

    dyn_shape = tf.shape(tensor)
    for index in non_static_indexes:
        shape[index] = dyn_shape[index]
    return shape

    
def one_hot_gather(x, idx):
    """
    Does a one-hot gather on a single axis, 0
    :param x: [N, H] tensor with a float dtype
    :param idx: 1 dimensional int32 with indices 0...N
    :return:
    """
    N, H = get_shape_list(x, 2)
    get_shape_list(idx, 1)
    idx_oh = tf.one_hot(idx, depth=N, dtype=tf.bfloat16 if x.dtype == tf.bfloat16 else tf.float32)
    return tf.matmul(idx_oh, x)

def position_embedder2d(num_h, num_w, embedding_size, name='pos_embs', num_img=1, max_position_embeddings=64,
                        max_nimg=4, num_cls_emb=1, initializer_range=0.02):
    """
    This is the same as a 2D pos emb BUT easier to change?
    :param num_h:
    :param num_w:
    :param embedding_size:
    :param name:
    :param num_img:
    :param max_position_embeddings:
    :param max_nimg:
    :param initializer_range:
    :return: [num_img * (1 + num_h * num_w), embedding_size] pos emb
    """
    with tf.compat.v1.variable_scope(name):
        pos_embs_3d = tf.compat.v1.get_variable(
            name='pos_embs',
            shape=[max_nimg, max_position_embeddings, max_position_embeddings, embedding_size],
            initializer=tf.compat.v1.truncated_normal_initializer(stddev=initializer_range),
        )
        cls_embs = tf.compat.v1.get_variable(
            name='cls_emb',
            shape=[max_nimg, num_cls_emb, embedding_size],
            initializer=tf.compat.v1.truncated_normal_initializer(stddev=initializer_range),
        ) if num_cls_emb > 0 else None

        full_pe = tf.reshape(pos_embs_3d[:num_img, :num_h, :num_w], [num_img, num_h * num_w, embedding_size])
        if num_cls_emb > 0:
            full_pe = tf.concat([cls_embs[:num_img], full_pe], 1)
        return tf.reshape(full_pe, [num_img * (num_cls_emb + num_h * num_w), embedding_size])


def position_embedder(seq_length, name, max_position_embeddings, embedding_size, offset=0,
                      initializer_range=0.02):
    """
    :param seq_length: Length of the sequence to position embed. Must be less than max_position_embeddings.
    :param name: Name of the embedding
    :param max_position_embeddings: Highest it'll go
    :param embedding_size: dimension to map to
    :param offset: Currently this isn't supported but it's so you can deal with caching. In that case
                   we don't want to run all the old sequences through the transformer
    :param initializer_range: for truncated normal initializer
    :return:
    """
    # Do something special for position embeddings
    assert_op = tf.compat.v1.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
        full_position_embeddings = tf.compat.v1.get_variable(
            name=name,
            shape=[max_position_embeddings, embedding_size],
            initializer=tf.compat.v1.truncated_normal_initializer(stddev=initializer_range),
        )

        # Since the position embedding table is a learned variable, we create it
        # using a (long) sequence length `max_position_embeddings`. The actual
        # sequence length might be shorter than this, for faster training of
        # tasks that do not have long sequences.
        #
        # So `full_position_embeddings` is effectively an embedding table
        # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
        # perform a slice.
        if offset == 0:
            position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1])[None]
        else:
            # Tensorflow is too stupid to allow slicing
            flat_pos_ids = (tf.range(seq_length, dtype=tf.int32) + offset)
            one_hot_pos_ids = tf.one_hot(flat_pos_ids, depth=max_position_embeddings)

            # [seq_length, full_position_embeddings], [full_position_embeddings, dim]
            seq_embeds = tf.matmul(one_hot_pos_ids, full_position_embeddings)
            position_embeddings = seq_embeds[None]

    return position_embeddings, full_position_embeddings


def vision_pos_emb(shuffled_idx_img=None, B=4096, num_chunks_in_group=4, hidden_size=768, num_h=16, num_w=16, viz_chunk_length=257, P=16, num_imgs=1, max_vision_pos_embeddings=32):
    """
    Add only image level position embeddings
    :param shuffled_idx_img: If not None, then we will do shuffle the input
    :return:
    """
    # num_chunks_in_group = 4
    # hidden_size = 768
    my_pe, img_pe_table = position_embedder(
        num_chunks_in_group * num_imgs,
        name='img_idx_pe',
        max_position_embeddings=max_vision_pos_embeddings,
        embedding_size=hidden_size,
        initializer_range=0.02,
    )
    if shuffled_idx_img is None:
        # tf.logging.info("NOT shuffling the vision input! this is probably what you want for downstream")
        my_pe = tf.tile(my_pe[:, :, None], [1, 1, viz_chunk_length, 1])
        my_pe = tf.reshape(my_pe, [1, P * num_imgs, hidden_size])
    else:
        # tf.logging.info("!!!shuffling the vision input!!!!")
        # Idk how to handle these things
        assert num_imgs == 1
        # assert self.num_texts == 1
        my_pe = one_hot_gather(img_pe_table, tf.reshape(shuffled_idx_img, [-1]))
        my_pe = tf.tile(my_pe[:, None], [1, viz_chunk_length, 1])
        my_pe = tf.reshape(my_pe, [B, P, hidden_size])

    # add extra position embeddings, since even though the vision transformer had position
    # embeddings we did an avgpool so they might have gotten washed out
    image_pe2d = position_embedder2d(num_h=num_h,
                                        num_w=num_w,
                                        embedding_size=hidden_size,
                                        num_img=1,
                                        num_cls_emb=1,
                                        max_nimg=1,
                                        initializer_range=0.02,
                                        name='final_pe',
                                        )
    my_pe += tf.tile(image_pe2d, [num_chunks_in_group * num_imgs, 1])[None]
    return my_pe


P = 16
h0 = 256
w0 = 256
h1 = h0 // P
w1 = w0 // P
vision_pos_emb(shuffled_idx_img=features_shuffled_idx_img)

2022-12-04 18:18:30.168002: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 12935233536 exceeds 10% of free system memory.


: 

: 