In [None]:
# Colab-only dependency setup: installs the two third-party packages imported below.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases.
!pip install stanza
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive
import pandas as pd
import torch
from transformers import BertTokenizer, BertForMaskedLM
import torch.nn as nn
from tqdm import tqdm
import stanza
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AutoModel, AutoTokenizer
import torch

# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read from this mount.
drive.mount('/content/drive')

MessageError: ignored

In [None]:
# Load the masked-triple dataset from the mounted Drive; the 'masked_triple' and 'label'
# columns are consumed later when the training examples are built.
data = pd.read_csv('/content/drive/MyDrive/NLP project/masked_data.csv')

In [None]:
# Drop the stray index column written by a previous ``to_csv``.
# ``errors='ignore'`` keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display the frame to eyeball the columns after the index column was dropped.
data

In [None]:
class DepTreeNode:
    """A single word of a dependency parse, linked to its syntactic head.

    Attributes:
        position: zero-based index derived from the word's one-based ``id``.
        text: surface form of the word.
        lemma: lemma if the parser supplied one, otherwise the surface form.
        upos: universal POS tag, or ``None`` when the parser did not supply one.
        ner: NER tag, or ``None`` when the word is outside any entity
            (tag ``'O'``) or no NER tag was supplied.
        head_node: the ``DepTreeNode`` this word depends on (``None`` for a root).
        head_rel: dependency relation to ``head_node`` (``None`` for a root).
    """

    def __init__(self, map, head_node=None, head_rel=None):
        """Build a node from one word dictionary.

        Args:
            map: word dictionary; must contain ``'id'`` (one-based int) and
                ``'text'``; ``'lemma'``, ``'upos'`` and ``'ner'`` are optional.
                (The parameter name shadows the builtin ``map``; it is kept so
                existing keyword callers continue to work.)
            head_node: optional head node.
            head_rel: optional dependency relation to the head.
        """
        self.position = map['id'] - 1
        self.text = map['text']
        self.lemma = map.get('lemma', map['text'])
        # Optional annotations are read leniently, like the lemma above, so a
        # pipeline run without the POS or NER processor does not raise KeyError.
        self.upos = map.get('upos')
        ner_tag = map.get('ner', 'O')
        self.ner = ner_tag if ner_tag != 'O' else None
        self.head_node = head_node
        self.head_rel = head_rel

    def set_head(self, node, rel):
        """Attach this word to its head ``node`` through relation ``rel``."""
        self.head_node = node
        self.head_rel = rel

    def __str__(self):
        """Return a dict-style summary; a node without a head is reported as ``root``."""
        return str({
            "text": self.text,
            "lemma": self.lemma,
            "upos": self.upos,
            "ner": self.ner,
            "head": self.head_node.text if self.head_node else "root",
            "head_rel": self.head_rel if self.head_rel else "root"
        })

class DepTree:
    """Flat list of ``DepTreeNode`` objects with head links, built from a parsed document.

    ``doc`` must provide ``to_dict()`` (a list of sentences, each a list of word
    dictionaries) and ``sentences`` (each with ``words`` carrying ``id``,
    ``head`` and ``deprel``).
    """

    def __init__(self, doc):
        self.root = None
        self.nodes = []
        self.doc = doc
        self.build_tree()

    def build_tree(self):
        """Create one node per word, then wire every word to its head.

        Word ids and head ids restart at 1 in every sentence, while
        ``self.nodes`` is one flat list over the whole document. The start
        offset of each sentence is therefore recorded and added when resolving
        ids; without it, every sentence after the first would attach its heads
        (and root) to nodes of the first sentence.

        When the document has several sentences, ``self.root`` ends up being
        the root of the last one.
        """
        sentence_offsets = []
        for sent in self.doc.to_dict():
            sentence_offsets.append(len(self.nodes))
            for word in sent:
                self.nodes.append(DepTreeNode(word, None, None))

        for offset, sent in zip(sentence_offsets, self.doc.sentences):
            for word in sent.words:
                node = self.nodes[offset + word.id - 1]
                if word.head == 0:
                    # head id 0 marks the sentence root
                    self.root = node
                else:
                    node.set_head(self.nodes[offset + word.head - 1], word.deprel)

    def print_tree(self):
        """Print every word next to the string form of its head node."""
        for node in self.nodes:
            print(node.text, node.head_node)

    def get_node_by_text(self, text):
        """Return the first node whose surface form equals ``text``, else ``None``."""
        for node in self.nodes:
            if node.text == text:
                return node
        return None

    def get_node_by_lemma(self, lemma):
        """Return the first node whose lemma equals ``lemma``, else ``None``."""
        for node in self.nodes:
            if node.lemma == lemma:
                return node
        return None

    def get_node_by_upos(self, upos):
        """Return the first node whose POS tag equals ``upos``, else ``None``."""
        for node in self.nodes:
            if node.upos == upos:
                return node
        return None

    def get_node_by_ner(self, ner):
        """Return all nodes whose NER tag contains ``ner`` as a substring.

        Unlike the other lookups this returns a (possibly empty) list, because
        the substring match lets one query cover prefixed tag variants.
        """
        return [node for node in self.nodes if node.ner and ner in node.ner]


# Module-level Stanza English pipeline, built once and shared: SoftEmbedding.forward
# reads this global to parse decoded text. The processor list includes 'ner' and
# 'depparse', which DepTree / DepTreeNode consume.
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma,pos,depparse,ner', use_gpu=True)

class SoftEmbedding(nn.Module):
    """Word-embedding layer that mixes a learned transformation into PERSON entities.

    The module wraps a frozen token-embedding table ``wte``. For every input
    sequence it decodes the ids back to text, runs the global ``nlp`` pipeline,
    and for each word tagged as a PERSON entity passes that word's embeddings
    through a small trainable MLP. The sequence output is the mean of the
    original embeddings and a zero tensor holding the MLP output at the entity
    positions; sequences without any PERSON entity are returned unchanged.

    NOTE(review): averaging with a zero-filled tensor also halves the
    embeddings of non-entity tokens in sequences that contain an entity. This
    reproduces the original formula; confirm it is the intended design.
    """

    def __init__(self, tokenizer, wte, device):
        """
        Args:
            tokenizer: Hugging Face tokenizer matching ``wte``'s vocabulary.
            wte: pretrained ``nn.Embedding`` to wrap (kept frozen).
            device: device recorded on the module (kept for callers that read it).
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.wte = wte
        self.device = device
        # Assigning ``module.requires_grad = False`` only creates a plain
        # attribute; ``requires_grad_`` is what actually freezes the parameters.
        self.wte.requires_grad_(False)
        self.id2token = {v: k for k, v in tokenizer.get_vocab().items()}
        # Ids removed by ``decode(skip_special_tokens=True)``; they must also be
        # skipped when grouping word pieces, or word indices drift after [MASK].
        self.special_ids = set(tokenizer.all_special_ids)
        self.learned_embedding = self.initialize_embedding(wte)

    def initialize_embedding(self,
                             wte: nn.Embedding,
                             n_tokens: int = 10,
                             n_prompts: int = 1,
                             random_range: float = 0.5,
                             initialize_from_vocab: bool = True):
        """Build the trainable MLP applied to entity embeddings.

        Args:
            wte: embedding table whose hidden size fixes the MLP input/output width.
            n_tokens, n_prompts, random_range, initialize_from_vocab: unused;
                kept only so existing callers keep working.

        Returns:
            nn.Sequential: ``Linear(hidden, 100) -> ReLU -> Linear(100, hidden)``
            with all parameters trainable.
        """
        hidden_size = wte.weight.size(1)
        embedding = nn.Sequential(
            nn.Linear(hidden_size, 100),
            nn.ReLU(),
            nn.Linear(100, hidden_size)
        )
        for param in embedding.parameters():
            param.requires_grad = True

        return embedding

    def _word_piece_groups(self, token_ids):
        """Group token indices into words: a ``##`` piece joins the previous word."""
        groups = []
        for index, token_id in enumerate(token_ids):
            if token_id in self.special_ids:
                continue
            if groups and self.id2token[token_id].startswith("##"):
                groups[-1].append(index)
            else:
                groups.append([index])
        return groups

    def _embed_sequence(self, tokens):
        """Embed one 1-D sequence of token ids; returns ``(seq_len, hidden)``."""
        input_embedding = self.wte(tokens)

        text = self.tokenizer.decode(tokens, skip_special_tokens=True)
        if not text.strip():
            # only special tokens: nothing to parse
            return input_embedding

        entities = DepTree(nlp(text)).get_node_by_ner("PERSON")
        if not entities:
            return input_embedding

        groups = self._word_piece_groups(tokens.tolist())
        padded_embedding = torch.zeros_like(input_embedding)
        for ent in entities:
            # The parser and the word-piece tokenizer can split words
            # differently, so the parser's word index may not exist here.
            if ent.position >= len(groups):
                continue
            positions = torch.tensor(groups[ent.position], dtype=torch.long, device=tokens.device)
            ent_tokens = self.tokenizer(
                f'{ent.text}', return_tensors="pt", add_special_tokens=False
            ).input_ids.squeeze(0).to(tokens.device)
            # Only inject when the word found at that index really is the
            # entity (same pieces); otherwise the two tokenizations disagree.
            if not torch.equal(tokens[positions], ent_tokens):
                continue
            with torch.no_grad():
                ent_embedding = self.wte(ent_tokens)
            padded_embedding[positions] = self.learned_embedding(ent_embedding)

        # Stack on a NEW axis and average: the result keeps shape
        # (seq_len, hidden). Concatenating along dim 0 and averaging would
        # collapse the whole sequence into a single vector.
        return torch.stack([input_embedding, padded_embedding], dim=0).mean(dim=0)

    def forward(self, tokens):
        """Embed a batch of token ids.

        Args:
            tokens: LongTensor of shape ``(batch, seq_len)``; a 1-D tensor is
                treated as a single sequence.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden)`` (``(seq_len, hidden)``
            for 1-D input). Every sequence is processed on its own. The result
            carries a gradient only when at least one entity was transformed.
        """
        if tokens.dim() == 1:
            return self._embed_sequence(tokens)
        if tokens.size(0) == 0:
            return self.wte(tokens)
        return torch.stack([self._embed_sequence(sequence) for sequence in tokens], dim=0)


In [None]:
import torch
import pandas as pd

batch_size = 16


def collate_fn(data):
    """Turn a list of sample dicts into one dict of parallel lists.

    Args:
        data: iterable of dicts, each with a 'masked_triple' and a 'label' entry.

    Returns:
        dict with keys 'masked_triple' and 'label', each holding the list of the
        corresponding values in sample order. Values are left untouched (no
        tensor conversion), so tokenization can happen inside the training loop.
    """
    samples = list(data)
    return {
        'masked_triple': list(map(lambda sample: sample['masked_triple'], samples)),
        'label': list(map(lambda sample: sample['label'], samples)),
    }

# One record per training example: the masked sentence and its gold label.
masked_column = data['masked_triple']
label_column = data['label']
data_dict = []
for masked_text, gold_label in zip(masked_column, label_column):
    data_dict.append({'masked_triple': masked_text, 'label': gold_label})

# Shuffled mini-batches; collate_fn keeps the raw strings grouped per field.
train_loader = torch.utils.data.DataLoader(
    data_dict,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)


In [None]:
num_epochs = 10
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Freeze every pretrained weight; only the parameters created afterwards by
# SoftEmbedding (its learned MLP) stay trainable.
for param in model.parameters():
    param.requires_grad = False

# Swap BERT's word-embedding layer for the entity-aware wrapper around it.
s_wte = SoftEmbedding(tokenizer, model.get_input_embeddings(), device)
s_wte.to(device)
model.set_input_embeddings(s_wte)
model.to(device)

# Define your loss function
criterion = nn.CrossEntropyLoss().to(device)

# Define your optimizer over the trainable parameters only, so the frozen
# BERT weights are not handed to Adam at all.
trainable_params = [param for param in model.parameters() if param.requires_grad]
if not trainable_params:
    # Fail here rather than train for hours without updating anything.
    raise RuntimeError("No trainable parameters found: nothing would be optimized.")
optimizer = torch.optim.Adam(trainable_params, lr=0.001)

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Release unused cached GPU memory held by PyTorch before the training loop starts.
torch.cuda.empty_cache()

In [None]:
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    total_correct = 0
    total_predicted = 0
    total_true = 0
    num_batches = 0

    # ``{postfix}`` is required in a custom bar_format, otherwise the loss passed
    # to set_postfix() below is never displayed.
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch", ncols=80,
                        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]")

    for batch in progress_bar:
        inputs = tokenizer(batch['masked_triple'], return_tensors="pt", padding=True).to(device)
        # Shape (batch, max_label_pieces). A label split into several word
        # pieces is represented by its first piece, so the target is always one
        # id per example (``squeeze(1)`` only worked for single-piece labels).
        label_ids = tokenizer(batch['label'], return_tensors="pt", padding=True,
                              add_special_tokens=False)['input_ids'].to(device)

        # Pair every example with ITS OWN first [MASK]. Rows without a mask are
        # dropped together with their label, so logits and labels stay aligned
        # even when a row has zero or several masks.
        is_mask = inputs["input_ids"] == tokenizer.mask_token_id
        rows = is_mask.any(dim=1).nonzero(as_tuple=True)[0]
        if rows.numel() == 0:
            continue
        cols = is_mask[rows].float().argmax(dim=1)  # index of the first mask in each row

        optimizer.zero_grad()

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Logits at the [MASK] position of each kept row: (num_rows, vocab_size)
        mask_token_logits = logits[rows, cols]
        labels = label_ids[rows, 0]

        # Compute the loss
        loss = criterion(mask_token_logits, labels)

        # Backward pass and optimization step. With BERT frozen, the loss is
        # attached to the graph only when the embedding layer transformed at
        # least one entity in this batch; otherwise there is nothing to update.
        # (Forcing ``loss.requires_grad = True`` merely hid that situation.)
        if loss.requires_grad:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Compute accuracy and precision
        predicted_labels = mask_token_logits.argmax(dim=1)
        correct_predictions = (predicted_labels == labels).sum().item()
        total_correct += correct_predictions
        total_predicted += predicted_labels.shape[0]
        total_true += labels.shape[0]

        # Update the progress bar
        progress_bar.set_postfix({"Loss": loss.item()})

    # Calculate average loss over the batches that were actually processed
    avg_loss = total_loss / max(num_batches, 1)

    # Calculate accuracy and precision
    # NOTE(review): both denominators count the same examples, so "precision"
    # always equals accuracy here.
    accuracy = total_correct / total_predicted if total_predicted > 0 else 0.0
    precision = total_correct / total_true if total_true > 0 else 0.0

    # Print training progress
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss} - Accuracy: {accuracy} - Precision: {precision}")


Epoch 1/1:   0%|                            | 1/2438 [00:00<10:11,  3.99batch/s]


 mask_positions tensor([[ 0,  4],
        [ 1,  5],
        [ 2,  4],
        [ 3,  5],
        [ 4,  5],
        [ 5,  4],
        [ 6, 10],
        [ 7,  7]], device='cuda:0')

 mask_token_logits tensor([[ -6.3824,  -6.3039,  -6.3134,  ...,  -5.4256,  -5.5260,  -6.3814],
        [-12.7677, -12.4405, -12.3537,  ..., -10.1908, -10.3768,  -7.3917],
        [ -6.4181,  -6.4669,  -6.1966,  ...,  -5.9131,  -5.3796,  -5.0969],
        ...,
        [ -4.7784,  -4.8262,  -4.7203,  ...,  -4.4492,  -4.3024,  -3.5021],
        [-18.3105, -18.5880, -18.6839,  ..., -14.9700, -15.3607,  -8.0037],
        [ -8.0554,  -8.2927,  -8.0615,  ...,  -6.0881,  -6.2808,  -5.2628]],
       device='cuda:0')
labels tensor([ 3883,  2388,  5011,  2269,  2269,  2269, 11062,  2365],
       device='cuda:0')
predicted_labels tensor([3772, 1007, 1000, 1007, 1007, 3883, 1997, 3364], device='cuda:0')
predicted_label  ['acting', ')', '"', ')', ')', 'actress', 'of', 'actor']


Epoch 1/1:   0%|                            | 2/2438 [00:00<10:15,  3.95batch/s]


 mask_positions tensor([[0, 6],
        [1, 4],
        [2, 7],
        [3, 2],
        [4, 5],
        [5, 6],
        [6, 4],
        [7, 6]], device='cuda:0')

 mask_token_logits tensor([[-6.0712, -6.0065, -5.9116,  ..., -4.4230, -6.6993, -3.7195],
        [-6.0951, -6.0660, -6.1100,  ..., -5.8881, -5.4047, -3.9726],
        [-4.9700, -5.1150, -5.0448,  ..., -4.4110, -3.8234, -5.8172],
        ...,
        [-6.0370, -5.7947, -5.8401,  ..., -4.9044, -6.1585, -3.4996],
        [-6.3195, -6.3842, -6.3281,  ..., -5.8451, -5.5342, -3.8215],
        [-6.9128, -6.9694, -6.9227,  ..., -6.7296, -6.5282, -3.9347]],
       device='cuda:0')
labels tensor([2269, 3364, 2269, 2269, 2269, 3364, 2269, 2269], device='cuda:0')
predicted_labels tensor([ 5348,  1012,  7762, 11748,  2015,  1012,  1000,  1000],
       device='cuda:0')
predicted_label  ['juan', '.', 'manuel', 'val', '##s', '.', '"', '"']


Epoch 1/1:   0%|                            | 3/2438 [00:00<10:11,  3.98batch/s]


 mask_positions tensor([[0, 5],
        [1, 9],
        [2, 4],
        [3, 5],
        [4, 4],
        [5, 6],
        [6, 6],
        [7, 5]], device='cuda:0')

 mask_token_logits tensor([[-7.3562, -7.3285, -7.2631,  ..., -7.6567, -8.5106, -4.3380],
        [-9.1891, -9.3458, -9.1652,  ..., -8.5337, -9.3880, -2.3002],
        [-7.2738, -7.1299, -7.2300,  ..., -6.7712, -6.7133, -5.0466],
        ...,
        [-7.3621, -7.2307, -7.3101,  ..., -6.3387, -5.9449, -3.6265],
        [-7.7758, -7.7432, -7.6247,  ..., -7.4682, -8.1875, -2.6293],
        [-7.2823, -7.1956, -7.1474,  ..., -7.5639, -8.0125, -4.8480]],
       device='cuda:0')
labels tensor([2269, 2269, 2269, 2388, 3883, 2388, 2269, 2269], device='cuda:0')
predicted_labels tensor([ 1011,  2696,  1000,  1011,  1000,  1000, 10722,  1011],
       device='cuda:0')
predicted_label  ['-', '##ta', '"', '-', '"', '"', 'tu', '-']


Epoch 1/1:   0%|                            | 4/2438 [00:01<11:14,  3.61batch/s]


 mask_positions tensor([[ 0,  5],
        [ 1,  5],
        [ 2,  6],
        [ 3,  5],
        [ 4, 12],
        [ 5,  8],
        [ 6,  8],
        [ 7,  7]], device='cuda:0')

 mask_token_logits tensor([[-6.1322, -6.2838, -6.2975,  ..., -5.8645, -8.1095, -3.8601],
        [-6.7358, -6.8633, -6.9383,  ..., -7.1597, -8.1958, -4.6753],
        [-2.8892, -2.9802, -2.9664,  ..., -3.3825, -2.8855, -0.8501],
        ...,
        [-4.5719, -5.0994, -5.0724,  ..., -4.0209, -5.2390,  1.3105],
        [-5.3371, -5.5365, -5.6740,  ..., -4.4111, -5.7814,  1.1618],
        [-2.9426, -3.2866, -3.1500,  ..., -2.9610, -2.2995, -3.5965]],
       device='cuda:0')
labels tensor([3677, 3364, 2269, 3364, 2269, 2269, 2269, 3364], device='cuda:0')
predicted_labels tensor([ 1095,  1010,  8962,  1000, 29653,  2819,  2819, 11124],
       device='cuda:0')
predicted_label  ['×', ',', 'pot', '"', '##ª', '##um', '##um', '##eri']


Epoch 1/1:   0%|                            | 5/2438 [00:01<10:54,  3.72batch/s]


 mask_positions tensor([[0, 3],
        [1, 7],
        [2, 5],
        [3, 9],
        [4, 9],
        [5, 9],
        [6, 7],
        [7, 9]], device='cuda:0')

 mask_token_logits tensor([[-4.4381, -4.4215, -4.4682,  ..., -5.1119, -4.6195, -1.1377],
        [ 5.7838,  5.3000,  5.4964,  ...,  5.0116,  4.3568,  8.9287],
        [-7.1148, -7.1145, -7.4236,  ..., -8.1055, -7.0917, -1.2187],
        ...,
        [ 6.4041,  6.1282,  6.2591,  ...,  5.9109,  5.7375,  9.1724],
        [ 5.2202,  4.7446,  4.7224,  ...,  4.0379,  4.3550,  6.5617],
        [ 3.7039,  3.3402,  3.2352,  ...,  3.1318,  2.1897,  7.8530]],
       device='cuda:0')
labels tensor([2269, 2365, 2269, 2269, 2388, 2388, 2269, 2388], device='cuda:0')
predicted_labels tensor([ 1010,   100,  1012, 29653, 29651, 22931,   100, 29653],
       device='cuda:0')
predicted_label  [',', '[UNK]', '.', '##ª', '##¨', 'antibodies', '[UNK]', '##ª']


Epoch 1/1:   0%|                            | 6/2438 [00:01<10:53,  3.72batch/s]


 mask_positions tensor([[ 0,  6],
        [ 1,  6],
        [ 2,  9],
        [ 3,  7],
        [ 4,  7],
        [ 5,  5],
        [ 6,  4],
        [ 7, 11]], device='cuda:0')

 mask_token_logits tensor([[ -6.0029,  -5.8260,  -5.8320,  ...,  -5.8164,  -6.8427,  -3.8348],
        [ -5.6371,  -5.5067,  -5.5656,  ...,  -5.5728,  -6.7881,  -3.0286],
        [ -7.1331,  -7.0327,  -6.9876,  ...,  -7.5015,  -7.1799,  -6.1062],
        ...,
        [ -7.8852,  -8.0425,  -8.0982,  ...,  -7.7921,  -7.4175,  -5.2233],
        [-12.1812, -12.4561, -12.6897,  ..., -11.9986, -10.4507,  -9.2595],
        [  0.0326,  -0.1649,  -0.2278,  ...,  -0.9706,  -0.5247,   3.1064]],
       device='cuda:0')
labels tensor([2269, 2388, 2269, 2269, 2269, 3364, 2269, 2269], device='cuda:0')
predicted_labels tensor([ 1010,  1010, 15966,  4079,  4079, 15966,  2139,  1149],
       device='cuda:0')
predicted_label  [',', ',', 'wit', 'simon', 'simon', 'wit', 'de', 'ˈ']


Epoch 1/1:   0%|                            | 7/2438 [00:01<10:51,  3.73batch/s]


 mask_positions tensor([[0, 2],
        [1, 4],
        [2, 4],
        [3, 4],
        [4, 4],
        [5, 3],
        [6, 7],
        [7, 5]], device='cuda:0')

 mask_token_logits tensor([[-4.4559, -4.3763, -4.5545,  ..., -3.6323, -3.4793, -1.1110],
        [-5.7046, -6.2758, -5.8890,  ..., -5.7065, -6.3404, -4.9024],
        [-6.9030, -7.2734, -7.0418,  ..., -6.1628, -7.2002, -3.6040],
        ...,
        [-6.5432, -6.6436, -6.4766,  ..., -5.9289, -5.4427, -4.0624],
        [-6.3769, -6.5482, -6.2480,  ..., -4.8471, -6.2093, -3.9494],
        [-8.1663, -8.8895, -8.4464,  ..., -7.5459, -8.3997, -5.3600]],
       device='cuda:0')
labels tensor([2365, 3364, 2269, 3364, 2388, 2269, 2269, 2269], device='cuda:0')
predicted_labels tensor([15897,  2638,  1011,  1024,  1998,  1012,  2023,  2009],
       device='cuda:0')
predicted_label  ['chiefly', '##ne', '-', ':', 'and', '.', 'this', 'it']


Epoch 1/1:   0%|                            | 8/2438 [00:02<10:53,  3.72batch/s]


 mask_positions tensor([[0, 2],
        [1, 3],
        [2, 8],
        [3, 4],
        [4, 6],
        [5, 6],
        [6, 8],
        [7, 4]], device='cuda:0')

 mask_token_logits tensor([[-5.8008, -5.7109, -5.6467,  ..., -6.0016, -6.5881, -4.5866],
        [-6.5217, -6.7125, -6.6963,  ..., -7.3144, -7.2918, -3.9761],
        [ 6.6827,  6.2874,  6.0564,  ...,  7.0578,  4.8281,  7.5941],
        ...,
        [-9.8807, -9.4202, -9.4538,  ..., -7.1984, -8.2479, -7.5629],
        [10.9023, 10.7020, 10.7643,  ..., 10.3482,  9.1432, 11.3458],
        [-4.6874, -4.6249, -4.5423,  ..., -4.0267, -5.1512, -4.2554]],
       device='cuda:0')
labels tensor([2388, 2269, 2269, 3364, 2269, 2269, 2269, 3750], device='cuda:0')
predicted_labels tensor([ 1010, 20377, 30035,  8202,  1012,  1012, 29653,  2669],
       device='cuda:0')
predicted_label  [',', 'bis', '##ᵉ', '##mon', '.', '.', '##ª', '##ton']


Epoch 1/1:   0%|                            | 9/2438 [00:02<11:05,  3.65batch/s]


 mask_positions tensor([[0, 4],
        [1, 7],
        [2, 9],
        [3, 5],
        [4, 6],
        [5, 4],
        [6, 6],
        [7, 7]], device='cuda:0')

 mask_token_logits tensor([[-8.0535, -8.2596, -7.8989,  ..., -7.2782, -7.0633, -4.4812],
        [-5.2107, -4.7282, -5.0854,  ..., -5.3535, -4.2850, -3.4282],
        [-9.2336, -8.8427, -8.8995,  ..., -6.7304, -7.2694, -9.6242],
        ...,
        [-7.1121, -7.3039, -7.1576,  ..., -7.8774, -6.4014, -4.1722],
        [-7.8429, -7.7248, -7.5089,  ..., -5.7206, -4.3461, -7.5254],
        [-5.4844, -5.6013, -5.6854,  ..., -5.0500, -3.3998, -4.3460]],
       device='cuda:0')
labels tensor([ 3364,  2269,  2388,  2269,  2269,  2269,  2269, 10248],
       device='cuda:0')
predicted_labels tensor([ 2011, 22084,  1012,  6754, 12631,  1006, 12631, 22084],
       device='cuda:0')
predicted_label  ['by', '##nea', '.', 'greg', 'kin', '(', 'kin', '##nea']





KeyboardInterrupt: ignored

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score

# Per-example variant of the training loop: the [MASK] position is looked up
# row by row instead of with one batched index.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_predicted = 0
    total_true = 0
    num_batches = 0

    # ``{postfix}`` is required in a custom bar_format, otherwise the loss passed
    # to set_postfix() below is never displayed.
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch", ncols=80,
                        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]")

    for batch in progress_bar:
        inputs = tokenizer(batch['masked_triple'], return_tensors="pt", padding=True).to(device)
        # Shape (batch, max_label_pieces); the first piece of each label is the target.
        label_ids = tokenizer(batch['label'], return_tensors="pt", padding=True,
                              add_special_tokens=False)['input_ids'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        mask_token_logits = []
        kept_labels = []
        # Iterate over the batch
        for i in range(len(batch['masked_triple'])):
            masked_indices = (inputs.input_ids[i] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
            if masked_indices.numel() == 0:
                # ``.nonzero().item()`` raised on rows without exactly one mask;
                # skip such rows (and their labels) instead.
                continue
            # Keep the full logit vector: the loss must see the scores of every
            # vocabulary entry. Taking argmax here and feeding the resulting id
            # to the loss is not differentiable and trains nothing.
            mask_token_logits.append(logits[i, masked_indices[0]])
            kept_labels.append(label_ids[i, 0])

        if not mask_token_logits:
            continue

        mask_token_logits = torch.stack(mask_token_logits)  # (num_rows, vocab_size)
        labels = torch.stack(kept_labels)                   # (num_rows,) class ids

        # Compute the loss
        loss = criterion(mask_token_logits, labels)

        # Backward pass and optimization step. With BERT frozen, the loss is
        # attached to the graph only when the embedding layer transformed at
        # least one entity in this batch; otherwise there is nothing to update.
        if loss.requires_grad:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Compute accuracy and precision
        predicted_labels = mask_token_logits.argmax(dim=1)
        correct_predictions = (predicted_labels == labels).sum().item()
        total_correct += correct_predictions
        total_predicted += predicted_labels.shape[0]
        total_true += labels.shape[0]

        # Update the progress bar
        progress_bar.set_postfix({"Loss": loss.item()})

    # Calculate average loss over the batches that were actually processed
    avg_loss = total_loss / max(num_batches, 1)

    # Calculate accuracy and precision
    # NOTE(review): both denominators count the same examples, so "precision"
    # always equals accuracy here.
    accuracy = total_correct / total_predicted if total_predicted > 0 else 0.0
    precision = total_correct / total_true if total_true > 0 else 0.0

    # Print training progress
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss} - Accuracy: {accuracy} - Precision: {precision}")


Epoch 1/1:   0%|                            | 2/2438 [00:00<06:00,  6.75batch/s]

predicted_label  -
predicted_label  ##thus
predicted_label  ##trip
predicted_label  ⊕
predicted_label  -
predicted_label  .
predicted_label  -
predicted_label  ##ᵉ
8
tensor([ 1011., 19877., 24901.,  1612.,  1011.,  1012.,  1011., 30035.],
       device='cuda:0')
tensor([2269, 2269, 2269, 2269, 2269, 2684, 2388, 3159], device='cuda:0')
predicted_label  /
predicted_label  flora
predicted_label  all
predicted_label  and
predicted_label  all
predicted_label  this
predicted_label  ,
predicted_label  ulysses
8
tensor([ 1013., 10088.,  2035.,  1998.,  2035.,  2023.,  1010., 22784.],
       device='cuda:0')
tensor([3364, 2269, 2269, 2269, 2388, 2269, 3677, 2269], device='cuda:0')


Epoch 1/1:   0%|                            | 4/2438 [00:00<06:52,  5.91batch/s]

predicted_label  coach
predicted_label  ##la
predicted_label  ##la
predicted_label  .
predicted_label  et
predicted_label  ,
predicted_label  "
predicted_label  ##z
8
tensor([2873., 2721., 2721., 1012., 3802., 1010., 1000., 2480.],
       device='cuda:0')
tensor([2269, 2269, 2269, 2269, 3364, 2269, 2269, 2269], device='cuda:0')
predicted_label  actor
predicted_label  ##guchi
predicted_label  ##uy
predicted_label  yu
predicted_label  z
predicted_label  ma
predicted_label  "
predicted_label  in
8
tensor([ 3364., 16918., 26230.,  9805.,  1062.,  5003.,  1000.,  1999.],
       device='cuda:0')
tensor([3364, 2388, 2269, 2269, 2269, 2269, 2269, 3364], device='cuda:0')


Epoch 1/1:   0%|                            | 6/2438 [00:00<06:36,  6.14batch/s]

predicted_label  emperor
predicted_label  ##ª
predicted_label  ##rian
predicted_label  tr
predicted_label  .
predicted_label  ptolemy
predicted_label  [UNK]
predicted_label  replaces
8
tensor([ 3750., 29653.,  6862., 19817.,  1012., 23517.,   100., 20736.],
       device='cuda:0')
tensor([3750, 2269, 2269, 2388, 2269, 2269, 2269, 2269], device='cuda:0')
predicted_label  and
predicted_label  most
predicted_label  this
predicted_label  ##ably
predicted_label  and
predicted_label  and
predicted_label  most
predicted_label  the
8
tensor([1998., 2087., 2023., 8231., 1998., 1998., 2087., 1996.],
       device='cuda:0')
tensor([2269, 2269, 2269, 2269, 2269, 2269, 2269, 2388], device='cuda:0')


Epoch 1/1:   0%|                            | 6/2438 [00:01<07:29,  5.41batch/s]

predicted_label  ##a
predicted_label  ,





RuntimeError: ignored

In [None]:
# Drive is mounted at /content/drive (see drive.mount above), not /content/gdrive;
# the old prefix pointed outside the mount, so nothing would have reached Drive.
# save_pretrained() treats this path as a directory and creates it if needed.
# NOTE(review): the embedding layer was replaced by a custom module; confirm the
# saved checkpoint reloads as expected before relying on it.
model_path = '/content/drive/MyDrive/model.pth'

model.save_pretrained(model_path)

NameError: ignored

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load the pre-trained model and tokenizer.
# NOTE: this rebinds the global `tokenizer` and `model` to a fresh, unmodified BERT.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout is disabled

# Define your input texts as a batch
input_texts = ['maria [MASK] john', 'hello [MASK] world', 'I [MASK] a book', 'the cat [MASK] the mouse', 'where is [MASK]', '[MASK] is my dog', 'can [MASK] help me', 'open the [MASK]']

# Tokenize the input batch. Despite its name, `input_ids` holds the whole
# encoding (ids, attention mask, token types), padded to the longest text.
input_ids = tokenizer(input_texts, add_special_tokens=True, padding='longest', return_tensors='pt')

# Generate predictions. The full encoding is passed so the attention mask is
# applied; feeding only the ids lets the shorter texts attend to [PAD] tokens.
with torch.no_grad():
    outputs = model(**input_ids)
    predictions = outputs.logits

# Iterate over the batch
for i, input_text in enumerate(input_texts):
    # Find the predicted token for the masked position in each input
    # (every text above contains exactly one [MASK], which .item() relies on)
    masked_index = (input_ids.input_ids[i] == tokenizer.mask_token_id).nonzero().item()
    predicted_token_index = torch.argmax(predictions[i, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_index)

    # Print the predicted token for each input
    print(f"Input: {input_text}")
    print("Predicted token for [MASK]:", predicted_token)
    print()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Input: maria [MASK] john
Predicted token for [MASK]: and

Input: hello [MASK] world
Predicted token for [MASK]: ,

Input: I [MASK] a book
Predicted token for [MASK]: have

Input: the cat [MASK] the mouse
Predicted token for [MASK]: and

Input: where is [MASK]
Predicted token for [MASK]: .

Input: [MASK] is my dog
Predicted token for [MASK]: this

Input: can [MASK] help me
Predicted token for [MASK]: you

Input: open the [MASK]
Predicted token for [MASK]: .

