In [23]:
import torch
from torch.utils.data import DataLoader, Dataset, Subset
import torch.optim as optim
import torch.nn as nn
#from base.constants import *
#from ..base.helpers import *
from tqdm import tqdm
from transformers import pipeline, file_utils
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import get_linear_schedule_with_warmup
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random
import os
import json
import re

In [24]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, else the CPU.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Run on the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA L4


In [25]:
# Mount Google Drive at /content/drive; the data and checkpoint paths used in
# the following cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Input subtitle (.vtt) files on the mounted Drive, one per lecture.
DIFFUSION_MODEL_VTT = '/content/drive/My Drive/data/dl4ds_diffusion_models.vtt'
GRAPH_NN_VTT = '/content/drive/My Drive/data/dl4ds_graph_nn.vtt'

In [27]:
# Recognises a cue timing line at the start of a stripped line, in either the
# short (MM:SS.mmm --> MM:SS.mmm) or long (HH:MM:SS.mmm --> HH:MM:SS.mmm) form.
time_pattern = re.compile(
    r'(\d{2}:\d{2}\.\d{3} *--> *\d{2}:\d{2}\.\d{3})|'
    r'(\d{2}:\d{2}:\d{2}\.\d{3} *--> *\d{2}:\d{2}:\d{2}\.\d{3})'
)


def process_and_split_file(file_path):
    """Read a subtitle file and return its text as period-terminated segments.

    The first line of the file is always discarded (presumably the ``WEBVTT``
    header - confirm for other inputs). Blank lines and lines that start with
    a timestamp range are dropped; the remaining lines are stripped and joined
    with single spaces. The joined text is then split on every ``'.'``.

    Args:
        file_path: Path of the UTF-8 encoded subtitle file.

    Returns:
        list[str]: The non-empty segments, each stripped and ending in ``'.'``
        (a period is appended even to a final segment that had none).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Unconditionally drop the first line; None keeps an empty file from raising.
        next(handle, None)
        stripped_lines = (raw.strip() for raw in handle)
        kept = [line for line in stripped_lines
                if line and not time_pattern.match(line)]

    joined = ' '.join(kept)

    segments = []
    for piece in joined.split('.'):
        piece = piece.strip()
        if piece:
            segments.append(piece + '.')
    return segments

# Extract the period-delimited text segments of each lecture transcript.
# Requires DIFFUSION_MODEL_VTT and GRAPH_NN_VTT to point at readable files.
diff_texts = process_and_split_file(DIFFUSION_MODEL_VTT)
graph_texts = process_and_split_file(GRAPH_NN_VTT)


In [28]:
# Optional download cache for Hugging Face files; None when the environment
# variable is unset, which is passed straight through as `cache_dir`.
CACHE_DIR = os.environ.get("TRANSFORMERS_CACHE")

try:
    tokenizer = AutoTokenizer.from_pretrained("igorsterner/xlmr-multilingual-sentence-segmentation",
                                              cache_dir=CACHE_DIR)

except Exception as e:
    print("You need to pick a pre-trained model from HuggingFace.")
    print("Exception: ", e)
    # Fail here rather than continue: every later cell uses `tokenizer`, and
    # swallowing the error would only resurface as an unrelated NameError.
    raise

In [29]:
def prepare_inputs(texts, max_length=330, print_samples = False):
    """Tokenize ``texts`` into fixed-width id and attention-mask tensors.

    Uses the module-level ``tokenizer``. Every text is padded to
    ``max_length`` and, if longer, truncated to it, so all rows have the same
    width. Truncation silently drops the tail of an over-long text.

    Args:
        texts: Iterable of strings to encode.
        max_length: Number of token positions per row.
        print_samples: When truthy, print the first text together with its
            token ids and attention mask.

    Returns:
        tuple: ``(input_ids, attention_masks)``, each with one row per text
        and ``max_length`` columns. For empty ``texts`` both are empty
        ``(0, max_length)`` integer tensors.
    """
    texts = list(texts)

    # Nothing to encode: hand back correctly shaped empty tensors instead of
    # failing inside the tokenizer / tensor concatenation.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, torch.empty((0, max_length), dtype=torch.long)

    # One batched call replaces the per-sentence loop + torch.cat.
    encoded = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',          # Pad short texts up to max_length.
        truncation=True,               # Cut long texts down to max_length; without this
                                       # rows could differ in width and break batching.
        return_attention_mask=True,    # 1 for real tokens, 0 for padding.
        return_tensors='pt',           # Return PyTorch tensors.
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    if print_samples:
        print('Original: ', texts[0])
        print('Token IDs:', input_ids[0])
        print('Attention Masks:', attention_masks[0])

    return input_ids, attention_masks

In [30]:
# Tokenize both transcripts into (input_ids, attention_masks) tensor pairs.
# NOTE: despite the `train_` prefix, these tensors are only wrapped in datasets
# and fed to the model for inference in the cells below.
train_input_ids_1, train_attention_masks_1 = prepare_inputs(diff_texts)
train_input_ids_2, train_attention_masks_2 = prepare_inputs(graph_texts)

In [31]:
class SubtitleDataset(Dataset):
    """Dataset pairing each row of token ids with its attention mask.

    Args:
        input_ids: Indexable collection of token-id rows.
        attention_masks: Indexable collection of mask rows, one per
            ``input_ids`` row.

    Raises:
        ValueError: If the two collections do not have the same length.
    """

    def __init__(self, input_ids, attention_masks):
        # Catch misaligned inputs up front instead of as an IndexError (or
        # silently ignored rows) in the middle of a DataLoader pass.
        if len(input_ids) != len(attention_masks):
            raise ValueError(
                "input_ids and attention_masks must have the same length, "
                "got {} and {}".format(len(input_ids), len(attention_masks))
            )
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.attention_masks[idx]

In [32]:
# Wrap each tokenized transcript so a DataLoader can batch it.
diff_dataset = SubtitleDataset(
    input_ids=train_input_ids_1,
    attention_masks=train_attention_masks_1,
)
graph_dataset = SubtitleDataset(
    input_ids=train_input_ids_2,
    attention_masks=train_attention_masks_2,
)

# Uncomment to run on a small slice while experimenting.
#diff_dataset = Subset(diff_dataset, range(1000))
#graph_dataset = Subset(graph_dataset, range(50))

print('{:>5,} DIFFUSION_MODEL_VTT samples'.format(len(diff_dataset)))
print('{:>5,} GRAPH_NN_VTT samples'.format(len(graph_dataset)))

batch_size = 16

# shuffle=False keeps the segments in transcript order; the labeling step
# concatenates the batches back together to rebuild the text.
diff_loader = DataLoader(diff_dataset, batch_size=batch_size, shuffle=False)
graph_loader = DataLoader(graph_dataset, batch_size=batch_size, shuffle=False)

  395 DIFFUSION_MODEL_VTT samples
  285 GRAPH_NN_VTT samples


In [33]:
# Load the pretrained backbone, then swap in a 3-way classification head so the
# architecture matches the fine-tuned weights loaded below. Predicted labels
# 1 and 2 are rendered as <eol> / <eob> later in this notebook.
best_model = AutoModelForTokenClassification.from_pretrained("igorsterner/xlmr-multilingual-sentence-segmentation")
best_model.classifier = torch.nn.Linear(best_model.classifier.in_features, 3)
best_model.num_labels = 3

path = '/content/drive/My Drive/best_model/state_dict.pt'
# map_location remaps the saved tensors onto the active device, so a checkpoint
# written from a GPU session still loads when only the CPU is available.
best_model.load_state_dict(torch.load(path, map_location=device))
best_model.to(device)

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [34]:
def remove_special_tags(text):
    """Return ``text`` with every ``<s>`` and ``</s>`` marker deleted.

    Nothing else is altered; in particular, surrounding whitespace is kept.
    """
    # Order matters only for pathological overlaps; "<s>" is removed first.
    for tag in ("<s>", "</s>"):
        text = text.replace(tag, "")
    return text

# Switch the model to evaluation mode before it is used for inference below.
best_model.eval()

def add_eol_eob(loader):
    """Label every token in ``loader`` and return the text with markers inserted.

    Runs the module-level ``best_model`` (on ``device``) over each batch,
    keeps only positions where the attention mask is set, and takes the
    highest-scoring class per token. The token stream is then rebuilt in
    order, inserting the token ids of ``"<eol>"`` after each token predicted
    as class 1 and of ``"<eob>"`` after each token predicted as class 2.
    The result is decoded with the module-level ``tokenizer``, cleaned with
    ``remove_special_tags`` and stripped of surrounding whitespace.

    Args:
        loader: Iterable of batches where ``batch[0]`` holds the input ids and
            ``batch[1]`` the attention mask, as tensors of the same shape.

    Returns:
        str: The labeled text, or ``''`` when ``loader`` yields no tokens.
    """
    # The marker encodings never change, so compute them once rather than for
    # every predicted marker. [1:-1] drops the leading/trailing special-token
    # ids the tokenizer wraps around its input.
    marker_ids = {
        1: tokenizer("<eol>")['input_ids'][1:-1],
        2: tokenizer("<eob>")['input_ids'][1:-1],
    }

    token_ids = []
    labels = []

    for batch in loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        with torch.no_grad():
            result = best_model(b_input_ids,
                                attention_mask=b_input_mask,
                                return_dict=True)

        keep = b_input_mask.bool()
        # argmax over the raw logits selects the same class as argmax over
        # their softmax, so the normalisation step is unnecessary.
        batch_labels = result.logits[keep].argmax(dim=-1)

        # Move to plain Python ints once per batch; comparing device tensors
        # one element at a time is slow and leaves tensors in the id list.
        labels.extend(batch_labels.tolist())
        token_ids.extend(b_input_ids[keep].tolist())

    if not token_ids:
        return ''

    # Rebuild the stream, appending the marker (if any) after each token.
    full_input_ids = []
    for token_id, label in zip(token_ids, labels):
        full_input_ids.append(token_id)
        full_input_ids.extend(marker_ids.get(label, ()))

    full_text = tokenizer.decode(full_input_ids)

    # str.strip() returns a new string; its result must be the one returned.
    return remove_special_tags(full_text).strip()


In [35]:
# Run the model over each transcript; each result is a single string with
# <eol> / <eob> markers inserted after the tokens predicted as such.
diff_labeled = add_eol_eob(diff_loader)
graph_labeled = add_eol_eob(graph_loader)

In [39]:
# Output paths for the labeled transcripts, written as plain text on Drive.
DIFFUSION_MODEL_TEXT = '/content/drive/My Drive/data/dl4ds_diffusion_models.txt'
GRAPH_NN_TEXT = '/content/drive/My Drive/data/dl4ds_graph_nn.txt'

def write_text_to_file(text, filename):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content.

    Args:
        text: String to write.
        filename: Destination path; the file is created or truncated.
    """
    with open(filename, mode='w', encoding='utf-8') as handle:
        handle.write(text)

# Persist both labeled transcripts to their output paths.
write_text_to_file(diff_labeled, DIFFUSION_MODEL_TEXT)
write_text_to_file(graph_labeled, GRAPH_NN_TEXT)