In [89]:
import setGPU
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# Check if CUDA is available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def generate_data(size, ranges):
    data = []
    for _range in ranges:
        data.append(np.random.uniform(_range[0], _range[1], size // len(ranges)))
    return torch.tensor(np.concatenate(data), dtype=torch.float32).to(device)

class CustomAutoencoder(nn.Module):
    def __init__(self, M, N, L, P, K):
        super(CustomAutoencoder, self).__init__()
        self.matrix_MN = nn.Parameter(torch.randn(M, N))
        self.coefficient_net = nn.Sequential(
            nn.Linear(1, M//4),
            nn.ReLU(),
            nn.Linear(M//4, M//2),
            nn.ReLU(),
            nn.Linear(M//2, M)
        )

        self.encoder = nn.Sequential(
            nn.Linear(N, N),
            nn.ReLU(),
            nn.Linear(N, N),
            nn.ReLU(),
            nn.Linear(N, K)
        )
        
        self.matrix_LP = nn.Parameter(torch.randn(L, P))
        self.selector = nn.Sequential(
            nn.Linear(K, K),  
            nn.ReLU(),
            nn.Linear(K, K),
            nn.ReLU(),
            nn.Linear(K, L),
            )
        
        
        self.dynamic_mlp = nn.Sequential(
            nn.Linear(P + K, 100),  
            nn.ReLU(),
            nn.Linear(100, 50),  
            nn.ReLU(),
            nn.Linear(50, 1)
        )
        self.temperature = 5.0


    def forward(self, x, temperature=0.5):
        # Ensure x is batched properly
        x = x.view(-1, 1)  # Correcting shape for batch processing
        coeffs = self.coefficient_net(x)
        coeffs = F.softmax(coeffs, dim=1)
        # Ensure matrix multiplication compatibility
        x_encoded = torch.matmul(coeffs, self.matrix_MN)  # coeffs [batch_size, M], matrix_MN [M, N]

        x_bottleneck = self.encoder(x_encoded)
        
        logits = self.selector(x_bottleneck)
        if self.training:
            # Use Gumbel-Softmax during training for differentiability
            probs = F.gumbel_softmax(logits, tau=self.temperature, hard=True, dim=-1)
        else:
            # Use argmax during evaluation for deterministic results
            _, max_indices = torch.max(logits, dim=1)
            probs = torch.zeros_like(logits).scatter_(1, max_indices.unsqueeze(1), 1)
        selected_row = torch.matmul(probs, self.matrix_LP)  # probs [batch_size, L], matrix_LP [L, P]
        mlp_input = torch.cat((selected_row, x_bottleneck), dim=1)
        x_reconstructed = self.dynamic_mlp(mlp_input)
        
        return x_reconstructed, probs
    
    def update_temperature(self, epoch, total_epochs, minimum_temperature=0.01):
        # Linear annealing
        self.temperature = max(minimum_temperature, self.temperature - (self.temperature - minimum_temperature) * (epoch / total_epochs))

# Hyperparameters
M = 100
N = 100
K = 100
L = 1
P = 100

# Model, Optimizer, Loss
model = CustomAutoencoder(M, N, L, P, K).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training data preparation
train_size = 10000
input_ranges = [(-100, -10), (-0.5, 0.5), (10, 20), (10, 100)]
inputs = generate_data(train_size, input_ranges).unsqueeze(1)  # Reshape for batch processing
targets = inputs.clone()  # Targets are the same as inputs

# Training loop
num_epochs = 1000
batch_size = 1000
for epoch in range(num_epochs):
    inputs = generate_data(batch_size, input_ranges).unsqueeze(1)
    targets = inputs.clone()
    batch_inputs = inputs
    batch_targets = targets

    optimizer.zero_grad()
    outputs, gumbel_probs = model(batch_inputs)
    model.update_temperature(epoch, num_epochs)
    loss = 100 * criterion(outputs, batch_targets)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

test_size = 1000
test_inputs = generate_data(test_size, input_ranges).unsqueeze(1)

# Testing
with torch.no_grad():
    model.eval()
    test_outputs, gumbel_probs= model(test_inputs)
    test_loss = criterion(test_outputs, test_inputs)
#     print("Test outputs:", test_outputs)
    print("Test Loss:", test_loss.item())


Using device: cuda
Epoch 1, Loss: 188017.1875
Epoch 11, Loss: 192212.6875
Epoch 21, Loss: 166953.234375
Epoch 31, Loss: 121098.765625
Epoch 41, Loss: 91882.65625
Epoch 51, Loss: 34399.6015625
Epoch 61, Loss: 20037.244140625
Epoch 71, Loss: 2647.69677734375
Epoch 81, Loss: 847.4603881835938
Epoch 91, Loss: 245.80917358398438
Epoch 101, Loss: 120.67447662353516
Epoch 111, Loss: 76.60591125488281
Epoch 121, Loss: 75.31448364257812
Epoch 131, Loss: 58.4202995300293
Epoch 141, Loss: 36.890445709228516
Epoch 151, Loss: 27.06822967529297
Epoch 161, Loss: 26.44301986694336
Epoch 171, Loss: 25.467485427856445
Epoch 181, Loss: 17.445430755615234
Epoch 191, Loss: 16.05217933654785
Epoch 201, Loss: 12.730245590209961
Epoch 211, Loss: 13.048492431640625
Epoch 221, Loss: 10.823512077331543
Epoch 231, Loss: 12.70936393737793
Epoch 241, Loss: 7.634136199951172
Epoch 251, Loss: 7.192852973937988
Epoch 261, Loss: 6.437993049621582
Epoch 271, Loss: 6.107530117034912
Epoch 281, Loss: 5.287620544433594
Epo

In [124]:
import torch.nn.functional as F
x = F.gumbel_softmax(torch.tensor([1,1,0.5],device = 'cuda', dtype=torch.half), tau=1, hard=True)

In [125]:
x.dtype

torch.float16

In [90]:
import random
index = random.sample(range(len(test_inputs)), 5)

In [91]:
test_inputs[index]

tensor([[12.9407],
        [43.5708],
        [65.2328],
        [81.1902],
        [16.3516]], device='cuda:0')

In [92]:
test_outputs[index]

tensor([[12.8965],
        [43.5963],
        [65.3737],
        [81.5181],
        [16.2908]], device='cuda:0')

In [88]:
gumbel_probs[index]

tensor([[0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.]], device='cuda:0')

In [33]:
gumbel_probs[-2:]

tensor([[0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.]], device='cuda:0')

In [144]:
x.gather(index = indices, dim = 1)

tensor([[ 1],
        [ 6],
        [11],
        [14],
        [17]])

In [142]:
x

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])

In [89]:
list(x.parameters())[0][-10:].requires_grad

True

In [75]:
x.weight[-10:].requires_grad

False

In [48]:
# Define the tensor
tensor = torch.LongTensor([3236502, 3236501])

# Compute the integer parts
integer_parts = tensor // 100  # Integer division

# Compute the fractional parts
fractional_parts = (tensor % 100) / 100  # Modulus operation followed by division


In [49]:
fractional_parts

tensor([0.0200, 0.0100])

In [10]:
import sys
sys.path.append('../')
import transformers
from videollama2 import conversation as conversation_lib
from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN
from videollama2.videollama2_trainer import VideoLLaMA2Trainer
from videollama2.model import *
import torch
config = transformers.AutoConfig.from_pretrained('../checkpoints/VideoLLaMA2-7B', trust_remote_code=True)
config._attn_implementation = None
config.mm_use_time_token = True
config.float_token_id_start = 0
config.float_token_id_end = 1


            
model = Videollama2MistralForCausalLM.from_pretrained(
    '../checkpoints/VideoLLaMA2-7B',
    config=config,
    torch_dtype=torch.bfloat16,
    do_sample=True,
)


2024-07-02 08:56:39.105694: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 08:56:39.105769: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 08:56:39.105791: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-02 08:56:39.111782: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of the model checkpoint at ../checkpoints/VideoLLaMA2-7B were not used when initializing Videollama2MistralForCausalLM: ['model.vision_tower.vision_tower.vision_model.embeddings.class_embedding', 'model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.la

Some weights of Videollama2MistralForCausalLM were not initialized from the model checkpoint at ../checkpoints/VideoLLaMA2-7B and are newly initialized: ['time_mlp.linear1.bias', 'time_mlp.linear1.weight', 'time_mlp.linear2.bias', 'time_mlp.linear2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model.resize_token_embeddings(40000)

Embedding(40000, 4096)

In [14]:
model.get_model().config.vocab_size

40000

In [145]:
x = torch.tensor([1.3000e+01, 1.0160e+03, 1.8720e+03, 3.7920e+03, 7.8080e+03, 7.8080e+03,
        1.3376e+04, 2.8800e+04, 2.8672e+04, 1.3000e+01, 1.0368e+04, 9.3440e+03,
        2.8200e+02, 2.0000e+03, 1.8720e+03, 1.7664e+04, 2.2656e+04, 6.3600e+02,
        3.4800e+02, 2.8800e+04, 1.9584e+04, 2.8800e+04, 2.8672e+04, 3.2000e+04,
        3.2256e+04, 3.2256e+04, 3.2256e+04, 3.2256e+04, 3.2256e+04, 3.2256e+04,
        3.2256e+04, 3.2384e+04, 1.3000e+01, 6.1120e+03, 2.8672e+04, 1.3760e+03,
        2.8800e+04, 2.8672e+04, 3.2000e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04,
        3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2384e+04, 1.3000e+01,
        4.0320e+03, 7.7120e+03, 3.5200e+02, 2.8800e+04, 2.8672e+04, 3.2000e+04,
        3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04,
        3.2128e+04, 3.2384e+04, 1.3000e+01, 1.9712e+04, 2.8800e+04, 2.8672e+04,
        3.2000e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04,
        3.2256e+04, 3.2256e+04, 3.2384e+04, 1.3000e+01, 3.2000e+03, 3.4800e+02,
        2.7200e+02, 2.9920e+03, 3.0200e+02, 2.4576e+04, 1.2560e+03, 2.8800e+04,
        7.3200e+02, 2.8800e+04, 1.6320e+04, 2.8800e+04, 4.1600e+02, 1.2560e+03,
        1.3056e+04, 1.0000e+03, 2.8672e+04, 2.0000e+00, 7.3200e+02, 1.6320e+04,
        2.8800e+04, 4.3200e+03, 1.2320e+03, 2.7200e+02, 2.4576e+04, 1.2560e+03,
        2.5440e+03, 4.5600e+02, 2.8800e+04, 7.3200e+02, 2.8800e+04, 1.6320e+04,
        2.8800e+04, 1.0960e+03, 3.7800e+02, 2.8800e+04, 2.8672e+04, 2.4960e+03,
        2.6400e+02, 1.1040e+03, 1.5280e+03, 2.8672e+04, 2.8672e+04, 2.0000e+00,
        7.3200e+02, 1.6320e+04, 2.8800e+04, 1.9072e+04, 8.4800e+02, 2.7200e+02,
        2.6080e+03, 7.5200e+03, 3.5400e+02, 1.6800e+03, 4.0960e+03, 2.8672e+04,
        7.3200e+02, 2.8800e+04, 1.6320e+04, 2.8800e+04, 1.9584e+04, 2.8800e+04,
        2.8672e+04, 3.2000e+04, 3.2256e+04, 3.2384e+04, 1.9712e+04, 2.8800e+04,
        2.8672e+04, 3.2000e+04, 3.2256e+04, 3.2384e+04, 2.8672e+04, 2.0000e+00,
        0.0000e+00, 0.0000e+00], device='cuda:0', dtype=torch.bfloat16)

In [1]:
import torch
x1 = torch.tensor([32334.2], dtype = torch.float32)

In [9]:
torch.tensor([31990]).half()

tensor([31984.], dtype=torch.float16)

In [142]:
(32001 <= x) & (x <= 32384)

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False, False, False, False, False, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True, False, False, False,
        False, False, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False, False, False, False,  True,  True,  True,  True,
         True,  True,  True,  True,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [136]:
32001<=x

tensor([1.3000e+01, 1.0160e+03, 1.8720e+03, 3.7920e+03, 7.8080e+03, 7.8080e+03,
        1.3376e+04, 2.8800e+04, 2.8672e+04, 1.3000e+01, 1.0368e+04, 9.3440e+03,
        2.8200e+02, 2.0000e+03, 1.8720e+03, 1.7664e+04, 2.2656e+04, 6.3600e+02,
        3.4800e+02, 2.8800e+04, 1.9584e+04, 2.8800e+04, 2.8672e+04, 3.2000e+04,
        3.2256e+04, 3.2256e+04, 3.2256e+04, 3.2256e+04, 3.2256e+04, 3.2256e+04,
        3.2256e+04, 3.2384e+04, 1.3000e+01, 6.1120e+03, 2.8672e+04, 1.3760e+03,
        2.8800e+04, 2.8672e+04, 3.2000e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04,
        3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2384e+04, 1.3000e+01,
        4.0320e+03, 7.7120e+03, 3.5200e+02, 2.8800e+04, 2.8672e+04, 3.2000e+04,
        3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04,
        3.2128e+04, 3.2384e+04, 1.3000e+01, 1.9712e+04, 2.8800e+04, 2.8672e+04,
        3.2000e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04, 3.2128e+04,
        3.2256e+04, 3.2256e+04, 3.2384e+

In [8]:
class FlattenHead(nn.Module):
    def __init__(self, input_dim = 4096, hidden_dim = 4096, dropout_rate=0.1):
        super(FlattenHead, self).__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        nn.init.xavier_uniform_(self.linear1.weight, gain=nn.init.calculate_gain('relu'))
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, 1) 
        nn.init.xavier_uniform_(self.linear2.weight, gain=nn.init.calculate_gain('sigmoid'))
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        x = self.sigmoid(x)
        return x

In [9]:
model = FlattenHead()

In [10]:
inputs = torch.ones(5, 10, 4096)
model(inputs).shape

torch.Size([5, 10, 1])

In [1]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(
    '../checkpoints/VideoLLaMA2-7B',
    padding_side="right",
    use_fast=True,
)

In [2]:
tokenizer.tokenize('10.0, -8.5')

['▁', '1', '0', '.', '0', ',', '▁-', '8', '.', '5']

In [2]:
from transformers import AutoTokenizer
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(
    '../checkpoints/VideoLLaMA2-7B',
    padding_side="right",
    use_fast=True,
)
# Print the initial length of the tokenizer
print("Initial tokenizer length:", len(tokenizer))

# Add the special token
special_tokens_dict = {'additional_special_tokens': ['<t_start>', '<t_end>']}
tokenizer.add_special_tokens(special_tokens_dict)

# Print the length of the tokenizer after adding the special token
print("New tokenizer length:", len(tokenizer))


Initial tokenizer length: 32000
New tokenizer length: 32002


In [24]:
original_tokenizer = AutoTokenizer.from_pretrained(
    '../checkpoints/VideoLLaMA2-7B',
    padding_side="right",
    use_fast=False
)

# # Now wrap it with your custom tokenizer
# time_tokenizer = TimeTokenizer(original_tokenizer)

In [65]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast
import numpy as np
import bisect
import re
import torch

class TimeTokenizer:
    def __init__(self, tokenizer, range_min=-50, range_max=50, bins=25):
        
        # Store the original tokenizer, the legacy has to be set to False for correct decoding
        tokenizer.legacy = False
        self._tokenizer = tokenizer
        
        # Generate and add new tokens based on the given range and bin size
        self.range_min = range_min
        self.range_max = range_max
        self.bins = bins
        self.range_tokens = [f"{x:.2f}" for x in np.linspace(range_min, range_max, bins+1)]
        
        vocab = tokenizer.get_vocab()
        if '<t_start>' not in vocab:
            tokenizer.add_tokens(['<t_start>'])
            tokenizer.add_tokens(self.range_tokens)
            tokenizer.add_tokens(['<t_end>'])
            self.num_new_tokens = len(self.range_tokens) + 2
        else:
            self.num_new_tokens = 0
            
        # Cache necessary token ids
        self.time_id_start = tokenizer.convert_tokens_to_ids('<t_start>')
        self.float_token_id_start = tokenizer.convert_tokens_to_ids(self.range_tokens[0])
        self.float_token_id_end = tokenizer.convert_tokens_to_ids(self.range_tokens[-1])
        self.time_id_end = tokenizer.convert_tokens_to_ids('<t_end>')
        
        self.sorted_tokens = torch.tensor(sorted(float(val) for val in self.range_tokens), dtype=torch.float32)
        self.float_to_token = {float(val): val for val in self.range_tokens}
        self.bos_token_id = tokenizer.bos_token_id
        self.pad_token_id = tokenizer.pad_token_id
        self.unk_token = tokenizer.unk_token
        self.model_max_length = tokenizer.model_max_length
        self.padding_side = tokenizer.padding_side
        
    def __call__(self, text, **kwargs):
        
        return_tensors = kwargs.pop('return_tensors', None)  # Default to False if not specified
            
        # Encoding the text
        new_input_ids = self.encode(text, **kwargs)
        
        # Generating an attention mask that matches the new input IDs
        attention_mask =[1] * len(new_input_ids)
        if return_tensors == 'pt':
            new_input_ids = torch.tensor(new_input_ids, dtype=torch.long)
            
        return SimpleNamespace(**{'input_ids': new_input_ids, 'attention_mask': None})

    def __len__(self):
        return len(self._tokenizer)
    
    def encode(self, text, **kwargs):

        # Split text into parts to handle both special tags and general text
        parts = re.split(r'(<t_start>|<t_end>)', text)
        new_input_ids = []

        add_special_tokens = kwargs.get('add_special_tokens', True)  # Default to False if not specified
        
        i = 0
        while i < len(parts):
            part = parts[i]
            if part == '<t_start>':
                new_input_ids.append(self.time_id_start)
                i += 1 
                if i < len(parts):
                    new_indices = self.process_inner_text(parts[i])
                    new_input_ids.extend(new_indices)
                    i += 1
            elif part == '<t_end>':
                new_input_ids.append(self.time_id_end)
                i += 1
            else:
                ### tmp fix for extrac space problem ###
                token_ids = self._tokenizer.encode('\n' + parts[i], add_special_tokens=False)[2:]
                new_input_ids.extend(token_ids)
                i += 1

        if add_special_tokens:
            
            # Retrieve special tokens ids based on configuration in tokenizer
            special_tokens = []
            if hasattr(self._tokenizer, 'add_bos_token') and self._tokenizer.add_bos_token:
                bos_token_id = self._tokenizer.bos_token_id if hasattr(self._tokenizer, 'bos_token_id') else None
                if bos_token_id is not None:
                    special_tokens.append(bos_token_id)

            if hasattr(self._tokenizer, 'add_eos_token') and self._tokenizer.add_eos_token:
                eos_token_id = self._tokenizer.eos_token_id if hasattr(self._tokenizer, 'eos_token_id') else None
                if eos_token_id is not None:
                    special_tokens.append(eos_token_id)

            # Insert BOS token at the beginning if present
            if special_tokens and self._tokenizer.add_bos_token:
                new_input_ids.insert(0, special_tokens[0])

            # Append EOS token at the end if present
            if special_tokens and self._tokenizer.add_eos_token:
                new_input_ids.append(special_tokens[-1])

        return torch.tensor(new_input_ids, dtype = torch.float32)

    def process_inner_text(self, text):
        # Extract all numbers from the text as a PyTorch tensor for vectorized operations
        floats = torch.tensor([float(num) for num in re.split(r'\s*,\s*', text.strip('[]')) if num], dtype = torch.float32)
        # Clip the input values to be within the specified range
        floats_clipped = torch.clamp(floats, min=self.range_min, max=self.range_max)

        # Convert range_tokens to a tensor if not already
        range_tokens_floats = [float(token) for token in self.range_tokens]
        range_tokens_tensor = torch.tensor(range_tokens_floats, dtype=torch.float32)

        # Find the indices for each number using searchsorted
        positions = torch.searchsorted(range_tokens_tensor, floats_clipped, right=True) - 1
        positions = torch.clamp(positions, 0, len(self.range_tokens) - 2)  # Ensure indices are within the bounds

        # Lower and upper token values
        lower_tokens = range_tokens_tensor[positions]
        upper_tokens = range_tokens_tensor[positions + 1]

        # Calculate the interpolated indices
        lower_indices = self.float_token_id_start + positions

        # Compute the interpolated token indices
        fractional_part = (floats_clipped - lower_tokens) / (upper_tokens - lower_tokens)
        interpolated_indices = lower_indices + fractional_part

        return interpolated_indices


    def batch_decode(self, sequences, **kwargs):
        return [
            self.decode(
                seq,
                **kwargs,
            )
            for seq in sequences
        ]

    def decode(self, token_ids, **kwargs):
        if not len(token_ids):
            return ""
        
        # Ensure input is in a consistent format (using PyTorch for potential GPU support)
        if isinstance(token_ids, list) or isinstance(token_ids, np.ndarray):
            token_ids = torch.tensor(token_ids, dtype=torch.float32)
        # No need to convert if already a torch tensor
        # No need to convert if already a torch tensor
        
        if token_ids.device.type == 'cuda':
            device = token_ids.device
        else:
            device = torch.device('cpu')
        
        # Create a mask for float tokens using torch operations
        is_float = (self.float_token_id_start <= token_ids) & (token_ids <= self.float_token_id_end)

        # Find boundaries of chunks by changes in the mask
        diff = torch.diff(is_float.to(torch.int), dim=0)
        boundaries = torch.nonzero(diff, as_tuple=False).squeeze() + 1
        chunk_indices = torch.cat([torch.tensor([0], device=device), boundaries, torch.tensor([token_ids.size(0)], device=device)])

        # Process each chunk based on its type
        decoded_parts = []
        for i in range(chunk_indices.size(0) - 1):
            chunk = token_ids[chunk_indices[i]:chunk_indices[i+1]]
            if is_float[chunk_indices[i]]:

                # Interpolate float values
                base_ids = chunk.long() - self.float_token_id_start
                lower_bounds = torch.tensor([float(self.range_tokens[idx]) for idx in base_ids], device=device)
                upper_bounds = torch.tensor([float(self.range_tokens[min(idx + 1, len(self.range_tokens) - 1)]) for idx in base_ids], device=device)
                fractional_parts = chunk.frac()
                interpolated_floats = lower_bounds + fractional_parts * (upper_bounds - lower_bounds)
                float_strs = [f"{num.item():.2f}" for num in interpolated_floats]
                decoded_parts.append(", ".join(float_strs))
            else:
                if isinstance(chunk, list):
                    chunk = list(map(int, chunk))
                else:
                    chunk = chunk.long()
                # Decode non-float tokens using the tokenizer (assumes moving to CPU for compatibility)
                decoded_text = self._tokenizer.decode(chunk, **kwargs)
                decoded_parts.append(decoded_text)
        return "".join(decoded_parts)
    
    def token_to_float(self, token_ids):
        device = token_ids.device
        dtype = torch.float32
        
        # Ensure token_ids is a 2D tensor even if it's not batched
        if token_ids.ndim == 1:
            token_ids = token_ids.unsqueeze(0)  # Reshape to [1, n]

        # Calculate base indices for each token
        base_ids = (token_ids // 100).long() - self.float_token_id_start

        # Collect lower and upper bounds from pre-defined ranges
        lower_bounds = torch.tensor([float(self.range_tokens[idx]) for idx in base_ids.flatten()], dtype=torch.bfloat16, device=device).view_as(base_ids)
        upper_bounds = torch.tensor([float(self.range_tokens[min(idx + 1, len(self.range_tokens) - 1)]) for idx in base_ids.flatten()], dtype=torch.bfloat16, device=device).view_as(base_ids)

        # Calculate fractional parts for interpolation
        fractional_parts = (token_ids % 100).float() / 100

        # Perform interpolation
        interpolated_floats = lower_bounds + fractional_parts * (upper_bounds - lower_bounds)

        # Match the original shape of token_ids
        if interpolated_floats.ndim == 2 and interpolated_floats.shape[0] == 1:
            interpolated_floats = interpolated_floats.squeeze(0)  # Remove batch dimension if it was originally unbatched

        return interpolated_floats
    
    
    def float_to_tokenid(self, floats):
        # Clip the input values to be within the specified range
        floats_clipped = torch.clamp(floats, min=-180, max=180)

        # Convert range_tokens to a tensor if not already
        range_tokens_floats = [float(token) for token in self.range_tokens]
        range_tokens_tensor = torch.tensor(range_tokens_floats, device=floats.device, dtype=torch.float32)

        # Find the indices for each number using searchsorted
        positions = torch.searchsorted(range_tokens_tensor, floats_clipped, right=True) - 1
        positions = torch.clamp(positions, 0, len(self.range_tokens) - 2)  # Ensure indices are within the bounds

        # Lower and upper token values
        lower_tokens = range_tokens_tensor[positions]
        upper_tokens = range_tokens_tensor[positions + 1]

        # Calculate the interpolated indices
        lower_indices = self.float_token_id_start + positions

        # Compute the interpolated token indices
        fractional_part = (floats_clipped - lower_tokens) / (upper_tokens - lower_tokens)
        interpolated_indices = (100 * (lower_indices + fractional_part)).long()

        return interpolated_indices

# Example usage
original_tokenizer = AutoTokenizer.from_pretrained('../checkpoints/VideoLLaMA2-7B', use_fast=False)
time_tokenizer = TimeTokenizer(original_tokenizer)


In [29]:
# Extract all numbers from the text as a numpy array for vectorized operations
text = '14.79'
numbers = np.array([float(num) for num in re.split(r'\s*,\s*', text.strip('[]')) if num])

# Find the indices for each number using vectorized search
positions = np.searchsorted(time_tokenizer.sorted_tokens, numbers, side='right') - 1
positions = np.clip(positions, 0, len(time_tokenizer.sorted_tokens) - 2)  # Ensure indices are within the bounds

# Lower and upper token values
lower_tokens = np.array(time_tokenizer.range_tokens)[positions]
upper_tokens = np.array(time_tokenizer.range_tokens)[positions + 1]

# Calculate the interpolated indices
lower_indices = time_tokenizer.float_token_id_start + positions
upper_indices = time_tokenizer.float_token_id_start + positions + 1

# Compute the interpolated token indices
interpolated_indices = lower_indices + ((numbers - lower_tokens.astype(float)) / 
                                        (upper_tokens.astype(float) - lower_tokens.astype(float))) * (upper_indices - lower_indices)
#         return  interpolated_indices.tolist()
return [int(round(num * 100)) for num in interpolated_indices.tolist()]

In [30]:
interpolated_indices'


array([32011.7395])

In [5]:
time_tokenizer.float_token_id_end - time_tokenizer.float_token_id_start

360

In [106]:
torch.tensor([3236300])

tensor([3236300])

In [88]:
506*64

32384

In [21]:
token_ids = torch.ones(5).cuda().float()

is_float=(5 <= token_ids) & (token_ids <= 1)
is_float.any()

tensor(False, device='cuda:0')

In [18]:
time_tokenizer('time,123').input_ids

[1536, 28725, 28740, 28750, 28770]

In [9]:
token_exists

False

In [4]:
len(time_tokenizer)

32363

In [106]:
original_tokenizer.legacy = False

In [30]:
from transformers.generation.utils  import *

In [31]:
InfNanRemoveLogitsProcessor

transformers.generation.logits_process.InfNanRemoveLogitsProcessor

In [21]:
from transformers/generation
/utils.py

In [148]:
x.long()

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])

In [66]:
text = '''he current video records driving scenario: <video>\n Control Signal until current Frame Sequence is: Speed: <t_start>[14.79, 12.25, 8.65, 6.35, 5.28, 4.92, 4.825]<t_end>\n Curvature: <t_start>[-0.0, -0.0, 0.0, -0.0, 0.0, 0.0, -0.0]<t_end>\n Acceleration: <t_start>[-0.74, -2.74, -2.45, -1.42, -0.56, -0.18, -0.08]<t_end>\n Course: <t_start>[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]<t_end>\nWhat is the action of ego car?'''

In [67]:
print(text)

he current video records driving scenario: <video>
 Control Signal until current Frame Sequence is: Speed: <t_start>[14.79, 12.25, 8.65, 6.35, 5.28, 4.92, 4.825]<t_end>
 Curvature: <t_start>[-0.0, -0.0, 0.0, -0.0, 0.0, 0.0, -0.0]<t_end>
 Acceleration: <t_start>[-0.74, -2.74, -2.45, -1.42, -0.56, -0.18, -0.08]<t_end>
 Course: <t_start>[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]<t_end>
What is the action of ego car?


In [68]:
(torch.tensor([14.79]) -12)/4

tensor([0.6975])

In [69]:
t1 = time_tokenizer.encode(text, add_special_tokens = False)

In [70]:
t1

tensor([2.6500e+02, 1.8680e+03, 3.7980e+03, 7.8060e+03, 7.8100e+03, 1.3406e+04,
        2.8747e+04, 5.2300e+02, 9.8310e+03, 2.8767e+04, 1.3000e+01, 1.0336e+04,
        9.3150e+03, 2.8200e+02, 1.9960e+03, 1.8680e+03, 1.7624e+04, 2.2716e+04,
        6.3600e+02, 3.4900e+02, 2.8747e+04, 1.9586e+04, 2.8747e+04, 2.8705e+04,
        3.2000e+04, 3.2017e+04, 3.2017e+04, 3.2016e+04, 3.2015e+04, 3.2015e+04,
        3.2015e+04, 3.2015e+04, 3.2027e+04, 1.3000e+01, 6.1190e+03, 2.8728e+04,
        1.3730e+03, 2.8747e+04, 2.8705e+04, 3.2000e+04, 3.2014e+04, 3.2014e+04,
        3.2014e+04, 3.2014e+04, 3.2014e+04, 3.2014e+04, 3.2014e+04, 3.2027e+04,
        1.3000e+01, 4.0350e+03, 7.7080e+03, 3.5200e+02, 2.8747e+04, 2.8705e+04,
        3.2000e+04, 3.2013e+04, 3.2013e+04, 3.2013e+04, 3.2013e+04, 3.2013e+04,
        3.2013e+04, 3.2013e+04, 3.2027e+04, 1.3000e+01, 1.9688e+04, 2.8747e+04,
        2.8705e+04, 3.2000e+04, 3.2014e+04, 3.2014e+04, 3.2014e+04, 3.2014e+04,
        3.2014e+04, 3.2014e+04, 3.2014e+

In [71]:
t2 = time_tokenizer.decode(t1)
print(t2)

he current video records driving scenario: <video>
 Control Signal until current Frame Sequence is: Speed:  <t_start>14.79, 12.25, 8.65, 6.35, 5.28, 4.92, 4.83<t_end> 
 Curvature:  <t_start>0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00<t_end> 
 Acceleration:  <t_start>-0.74, -2.74, -2.45, -1.42, -0.56, -0.18, -0.08<t_end> 
 Course:  <t_start>0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00<t_end> 
What is the action of ego car?


In [23]:
print(t2)

he current video records driving scenario: <video>
 Control Signal until current Frame Sequence is: Speed:  <t_start>14.80, 12.20, 8.60, 6.40, 5.20, 5.00, 4.80<t_end> 
 Curvature:  <t_start>0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00<t_end> 
 Acceleration:  <t_start>-0.80, -2.80, -2.40, -1.40, -0.60, -0.20, 0.00<t_end> 
 Course:  <t_start>0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00<t_end> 
What is the action of ego car?


In [66]:
time_tokenizer.float_to_tokenid(torch.tensor([178.4700,  18.4500,  18.4000,  18.3500,  18.3100,  18.3500,  18.3900], dtype=torch.bfloat16))

tensor([3235900, 3219950, 3219937, 3219937, 3219925, 3219937, 3219937])