In [1]:
import os
os.chdir("D:/DL_TRY/src")
print(os.getcwd())

D:\DL_TRY\src


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import transforms
import torch.optim as optim
from base.constants import *
#from ..base.helpers import *
from tqdm import tqdm
from transformers import pipeline, file_utils
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import get_linear_schedule_with_warmup
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import json

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to D:/DL_TRY/misc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [4]:
# Load datasets 
train_texts = []
with open(TRAIN_WITHOUT_LABELS_FILE, 'r', encoding='utf-8') as file:
    for line in file:
        train_texts.append(line.strip())

train_texts_with_labels = []
with open(TRAIN_WITH_LABELS_FILE, 'r', encoding='utf-8') as file:
    for line in file:
        train_texts_with_labels.append(line.strip())

val_texts = []
with open(VAL_WITHOUT_LABELS_FILE, 'r', encoding='utf-8') as file:
    for line in file:
        val_texts.append(line.strip())

val_texts_with_labels = []
with open(VAL_WITH_LABELS_FILE, 'r', encoding='utf-8') as file:
    for line in file:
        val_texts_with_labels.append(line.strip())

In [5]:
CACHE_DIR = os.environ.get("TRANSFORMERS_CACHE")

try:
    tokenizer = AutoTokenizer.from_pretrained("igorsterner/xlmr-multilingual-sentence-segmentation", 
                                              cache_dir=CACHE_DIR)

except Exception as e:
    print("You need to pick a pre-trained model from HuggingFace.")
    print("Exception: ", e)

In [6]:
# Have a knowledge of the tokenizer.
# Print the original sentence.
print(' Original: ', train_texts[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(train_texts[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_texts[0])))

# Print special marks and their IDs.
print("CLS token:", tokenizer.cls_token, tokenizer.cls_token_id)
print("SEP token:", tokenizer.sep_token, tokenizer.sep_token_id)

 Original:  Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.
Tokenized:  ['▁Thank', '▁you', '▁so', '▁much', ',', '▁Chris', '.', '▁And', '▁it', "'", 's', '▁truly', '▁a', '▁great', '▁honor', '▁to', '▁have', '▁the', '▁opportunity', '▁to', '▁come', '▁to', '▁this', '▁stage', '▁twice', ';', '▁I', "'", 'm', '▁extremely', '▁grateful', '.']
Token IDs:  [25689, 398, 221, 5045, 4, 31745, 5, 3493, 442, 25, 7, 87607, 10, 6782, 20338, 47, 765, 70, 54591, 47, 1380, 47, 903, 36541, 186351, 74, 87, 25, 39, 111531, 225876, 5]
CLS token: <s> 0
SEP token: </s> 2


In [7]:
# Define functions to convert texts with <eol> and <eob> tags to output format
def mark_breakpoints(input_ids):
    # Initialize a list of zeros for marks, same length as input_ids
    marks = [0] * len(input_ids)
    
    i = 0
    while i < len(input_ids):
        # tokenizer("") == {'input_ids': [0, 2], 'attention_mask': [1, 1]}
        # tokenizer("<eob>") == {'input_ids': [0, 4426, 13, 3522, 2740, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}
        # tokenizer("<eol>") == {'input_ids': [0, 4426, 13, 929, 2740, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}
        # Check for the patterns corresponding to <eob> or <eol>
        if input_ids[i:i+4] == [4426, 13, 3522, 2740] or input_ids[i:i+4] == [4426, 13, 929, 2740]:
            # Determine the marker value, 2 for <eob> and 1 for <eol>
            mark_value = 2 if input_ids[i+2] == 3522 else 1
            if i > 0:  # Ensure it is not at the start of the sequence
                marks[i-1] = mark_value  # Mark the previous token
            # Remove the tokens associated with <eob> or <eol>
            del marks[i:i+4]
            del input_ids[i:i+4]
        else:
            i += 1
    
    return marks

def texts2labels(texts_with_labels):
    labels = []
    for i in texts_with_labels:
        label = mark_breakpoints(tokenizer(i)['input_ids'])
        labels.append(label)
    return labels


In [8]:
train_labels = texts2labels(train_texts_with_labels)
val_labels = texts2labels(val_texts_with_labels)

In [9]:
max_len = 0

# For every sentence...
for sent in train_texts:
    input_ids = tokenizer(sent)['input_ids']
    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  300


In [10]:
def prepare_inputs_outputs(texts, labels, max_length=330, print_samples = False):
    input_ids = []
    attention_masks = []
    padded_labels = []

    # For every sentence...
    for sent in texts:
        encoded_dict = tokenizer(
                            sent,                          # Sentence to encode.
                            max_length = 330,              # Pad & truncate all sentences.
                            padding='max_length',
                            return_attention_mask = True,  # Construct attn. masks.
                            return_tensors = 'pt',         # Return pytorch tensors.
                    )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Pad labels
    padded_labels = torch.tensor([label + [-1] * (max_length - len(label)) for label in labels])

    # print samples
    if print_samples == True:
        print('Original: ', texts[0])
        print('Token IDs:', input_ids[0])
        print('Attention Masks:', attention_masks[0])
        print('labels', padded_labels[0])

    return input_ids, attention_masks, padded_labels


In [11]:
def prepare_inputs_outputs(texts, labels, max_length=330, print_samples = False):
    input_ids = []
    attention_masks = []
    padded_labels = []

    # For every sentence...
    for sent in texts:
        encoded_dict = tokenizer(
                            sent,                          # Sentence to encode.
                            max_length = 330,              # Pad & truncate all sentences.
                            padding='max_length',
                            return_attention_mask = True,  # Construct attn. masks.
                            return_tensors = 'pt',         # Return pytorch tensors.
                    )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    #input_ids = torch.cat(input_ids, dim=0)
    #attention_masks = torch.cat(attention_masks, dim=0)

    # Pad labels
    padded_labels = torch.tensor([label + [0] * (max_length - len(label)) for label in labels])

    # print samples
    if print_samples == True:
        print('Original: ', texts[0])
        print('Token IDs:', input_ids[0])
        print('Attention Masks:', attention_masks[0])
        print('labels', padded_labels[0])

    return input_ids, attention_masks, padded_labels

In [12]:
train_input_ids, train_attention_masks, train_padded_labels = prepare_inputs_outputs(train_texts, train_labels, 330, True)
val_input_ids, val_attention_masks, val_padded_labels = prepare_inputs_outputs(val_texts, val_labels, 330)

Original:  Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.
Token IDs: tensor([[     0,  25689,    398,    221,   5045,      4,  31745,      5,   3493,
            442,     25,      7,  87607,     10,   6782,  20338,     47,    765,
             70,  54591,     47,   1380,     47,    903,  36541, 186351,     74,
             87,     25,     39, 111531, 225876,      5,      2,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,    

In [13]:
class SubtitleDataset(Dataset):
    def __init__(self, input_ids, attention_masks, padded_labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.padded_labels = padded_labels
    
    def __len__(self):
        return len(self.input_ids)       

    def __getitem__(self, idx):
        """ input_id = torch.tensor(self.input_ids[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.attention_masks[idx], dtype=torch.long)
        padded_label = torch.tensor(self.padded_labels[idx], dtype=torch.long) """
        
        # 返回处理后的单个样本
        #return input_id, attention_mask, padded_label
        return self.input_ids[idx], self.attention_masks[idx], self.padded_labels[idx]

In [17]:
train_dataset = SubtitleDataset(train_input_ids, train_attention_masks, train_padded_labels)
val_dataset = SubtitleDataset(val_input_ids, val_attention_masks, val_padded_labels)

print('{:>5,} training samples'.format(len(train_dataset)))
print('{:>5,} validation samples'.format(len(val_dataset)))

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size, shuffle=False)

# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_mask, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size())# batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample mask: \n', sample_mask)
print('Sample output: \n', sample_y)

220,068 training samples
27,509 validation samples
Sample input size:  torch.Size([32, 1, 330])
Sample input: 
 tensor([[[    0, 25689,   398,  ...,     1,     1,     1]],

        [[    0,    87,   765,  ...,     1,     1,     1]],

        [[    0,  3493,    87,  ...,     1,     1,     1]],

        ...,

        [[    0,  4966,  2367,  ...,     1,     1,     1]],

        [[    0,    15,  2729,  ...,     1,     1,     1]],

        [[    0,    15,  2729,  ...,     1,     1,     1]]])
Sample mask: 
 tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]])
Sample output: 
 tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])


## Model Selection

In [14]:
model = AutoModelForTokenClassification.from_pretrained("igorsterner/xlmr-multilingual-sentence-segmentation")
model.classifier = torch.nn.Linear(model.classifier.in_features, 3)
model.num_labels = 3

model.to(device)
model.cuda()

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [134]:
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr=2e-5, 
                              betas=(0.9, 0.999),
                              eps = 1e-8, 
                              weight_decay=0.0005,
                              )

epochs = 5

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Training

In [150]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [151]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [152]:
torch.cuda.empty_cache()

In [20]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained("igorsterner/xlmr-multilingual-sentence-segmentation")
model = AutoModelForTokenClassification.from_pretrained("igorsterner/xlmr-multilingual-sentence-segmentation")
model.to(device)
#model.classifier = torch.nn.Linear(model.classifier.in_features, 2)
#model.num_labels = 3
#model.output_attentions = False
mask = sample_mask[0]

# 准备输入文本
text = train_texts[2]
inputs = tokenizer(text, return_tensors="pt")  # 确保返回 PyTorch 张量

# 使用模型进行预测
with torch.no_grad():  # 确保不计算梯度
    outputs = model(sample_x[0].to(device),sample_mask[0].to(device))

# 计算概率
probabilities = torch.softmax(outputs.logits, dim=-1)

# 获取每个标记的最大概率标签
predictions = torch.argmax(probabilities, dim=-1)

# 输出结果
print(predictions)


tensor([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0

In [21]:
with torch.no_grad():  # 确保不计算梯度
    outputs = model(sample_x[0].to(device), sample_mask[0].to(device))#, sample_y[0].to(device))

In [None]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
accumulation_steps = 2

training_stats = []

total_t0 = time.time()


for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_loader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))


        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()


        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss / accumulation_steps  # Scale loss
        loss.backward()

        if (step + 1) % accumulation_steps == 0:
            # Only perform optimization step every 'accumulation_steps'
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()
            model.zero_grad()  # Zero the gradients after optimization

        total_train_loss += loss.item() * accumulation_steps  # Unscale the loss
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in val_loader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
def train(loger, train_dataloader, model, optimizer, device, processor):
    model.train()

    for idx, batch in progress_bar:
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        # debug
        # print("Pixel_values",pixel_values)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids, pixel_values=pixel_values, labels=input_ids
        )

        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        loss.backward()

        optimizer.step()

        # Update progress bar with loss info
        progress_bar.set_postfix({"loss": loss.item()})

    return loss.item()

In [None]:
def train(loger, train_dataloader, model, optimizer, device, processor):
    model.train()

    for idx, batch in progress_bar:
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        # debug
        # print("Pixel_values",pixel_values)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids, pixel_values=pixel_values, labels=input_ids
        )

        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        loss.backward()

        optimizer.step()

        # Update progress bar with loss info
        progress_bar.set_postfix({"loss": loss.item()})

    return loss.item()

In [None]:
def evaluate(
    logger, epoch, save_path, best_score, val_dataloader,model, processor, device
):
    # model_evaluate = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=f"{DEMO_SAVE_PATH}/model_{epoch}")
    # model_evaluate.to(device)
    model.eval()
    caption_val = []
    plot_captions_dict = {}
    for idx, batch in enumerate(val_dataloader):
        image_ids = batch.pop("image_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        # debug
        # print("pixel_values",pixel_values)
        
        with torch.no_grad():
            outputs = model.generate(pixel_values=pixel_values, max_length=50)
            #outputs = model.generate(pixel_values=pixel_values)
            # debug
            #print("Raw Output:", outputs)

        # Decode the generated ids to text
        generated_captions = processor.batch_decode(outputs, skip_special_tokens=True)
        # debug
        #print("Decoded Output:", generated_captions)

        # Store the generated captions
        for img_id, caption in zip(image_ids, generated_captions):
            caption_val.append(
                {"image_id": img_id.item(), "caption": caption}
            )  # Used for VizWizEvalCap
            plot_captions_dict[img_id.item()] = caption  # Used for plotting

    # Save the generated captions to a json file
    # Change the path
    with open(f"{save_path}/generated_captions_{i}.json", "w") as f:
        json.dump(caption_val, f, indent=4)

    # Change the path
    vizwizRes = val_dataset.dataset.vizwiz.loadRes(
        f"{save_path}/generated_captions_{i}.json"
    )
    vizwizEval = VizWizEvalCap(val_dataset.dataset.vizwiz, vizwizRes)
    vizwizEval.evaluate()

    logger.info(f"Validation scores at epoch: {epoch}")
    for method in vizwizEval.eval:
        logger.info(f"  Method: {method}, Score: {vizwizEval.eval[method]:.4f}")

    return vizwizEval, vizwizRes, plot_captions_dict, model

In [132]:
sample_y[0]

tensor([ 0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
         0,  2,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  2,  0, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, 

In [101]:
mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [93]:
sample_y[0]

tensor([ 0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,
         0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,
         2,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  2,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, 

In [59]:
tokenizer(text, return_tensors="pt")

{'input_ids': tensor([[    0,  3493,    87,  5154,   450, 43699, 21286,     4,  2831,   538,
          6637,    15,  9083,  2594, 19185,    16,    87,  3871,   450,     5,
            15,  2729, 46526,    56,    16,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}

In [100]:
tokenizer(train_texts[2], return_tensors="pt")

{'input_ids': tensor([[    0,  3493,    87,  5154,   450, 43699, 21286,     4,  2831,   538,
          6637,    15,  9083,  2594, 19185,    16,    87,  3871,   450,     5,
            15,  2729, 46526,    56,    16,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}

In [140]:
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[ 3.7575, -2.8115],
         [ 5.2627, -5.0432],
         [ 4.3903, -4.2201],
         [ 5.7408, -5.5647],
         [ 4.3519, -3.8157],
         [ 4.3743, -4.7460],
         [ 3.1060, -2.8904],
         [-1.5279,  1.0472],
         [ 4.3476, -4.1248],
         [ 6.3177, -6.1020],
         [ 5.1890, -4.9129],
         [ 5.4288, -5.2133],
         [ 5.7272, -5.2702],
         [ 4.7480, -4.4442],
         [ 5.4618, -5.0336],
         [ 4.3085, -4.0404],
         [ 5.3212, -5.7122],
         [ 5.9274, -5.9662],
         [ 4.1552, -4.1624],
         [ 5.0201, -4.9719],
         [ 5.4669, -5.7374],
         [ 6.0409, -5.7752],
         [ 5.7806, -6.0583],
         [ 5.4140, -5.2766],
         [ 3.1619, -2.6765],
         [ 3.5358, -2.9870],
         [ 0.8151, -0.7760],
         [ 5.5859, -5.7451],
         [ 5.3556, -5.2577],
         [ 5.2833, -5.1451],
         [ 5.4452, -5.0072],
         [ 3.7298, -3.4053],
         [-2.2294,  1.6910],
   

In [141]:
probabilities

tensor([[[9.9860e-01, 1.4013e-03],
         [9.9997e-01, 3.3434e-05],
         [9.9982e-01, 1.8216e-04],
         [9.9999e-01, 1.2305e-05],
         [9.9972e-01, 2.8362e-04],
         [9.9989e-01, 1.0940e-04],
         [9.9752e-01, 2.4814e-03],
         [7.0762e-02, 9.2924e-01],
         [9.9979e-01, 2.0911e-04],
         [1.0000e+00, 4.0384e-06],
         [9.9996e-01, 4.0998e-05],
         [9.9998e-01, 2.3889e-05],
         [9.9998e-01, 1.6745e-05],
         [9.9990e-01, 1.0182e-04],
         [9.9997e-01, 2.7663e-05],
         [9.9976e-01, 2.3661e-04],
         [9.9998e-01, 1.6153e-05],
         [9.9999e-01, 6.8339e-06],
         [9.9976e-01, 2.4412e-04],
         [9.9995e-01, 4.5759e-05],
         [9.9999e-01, 1.3615e-05],
         [9.9999e-01, 7.3850e-06],
         [9.9999e-01, 7.2181e-06],
         [9.9998e-01, 2.2758e-05],
         [9.9710e-01, 2.9049e-03],
         [9.9853e-01, 1.4673e-03],
         [8.3077e-01, 1.6923e-01],
         [9.9999e-01, 1.1995e-05],
         [9.9998e-01

In [4]:
CACHE_DIR = os.environ.get("TRANSFORMERS_CACHE")
pipe = pipeline("token-classification", model="igorsterner/xlmr-multilingual-sentence-segmentation")

model.safetensors: 100%|██████████| 1.11G/1.11G [01:45<00:00, 10.5MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
tokenizer_config.json: 100%|██████████| 1.18k/1.18k [00:00<00:00, 119kB/s]
tokenizer.json: 100%|██████████| 17.1M/17.1M [00:01<00:00, 15.7MB/s]
special_tokens_map.json: 100%|██████████| 280/280 [00:00<00:00, 21.8kB/s]


In [30]:
pipe(train_texts[0])

[{'entity': '|',
  'score': 0.9292383,
  'index': 7,
  'word': '.',
  'start': 24,
  'end': 25},
 {'entity': '|',
  'score': 0.9805526,
  'index': 32,
  'word': '.',
  'start': 130,
  'end': 131}]