$
question: [N, n, Tq, H]\\
answer: [N, n, Ta, H]
$

$(matmul): H -> H'$

$
q: [N, n, Tq, H'] \\
k: [N, n, Ta, H'] \\
v: [N, n, Ta, H']
$


$
scores = qk = [N, n, Tq, Ta] \\
q\_query = [N, n, Tq, Ta], ~ sum(axis=Ta) == 1 \\
k\_query = [N, n, Tq, Ta], ~ sum(axis=Tq) == 1 \\
-> E1 = sum(q\_query@k * k\_query): [N, n, Tq, H'] -> [N, n, 1, H'] \\
-> E2 = sum(q^T@k\_query * q\_query): [N, n, H', Ta] -> [N, n, H', 1]
$

$
-> O = [N, n, 3, H] \\
-> FC -> [N, n, H]
$

-> output = $[[CLS], attn1, attn2]$

In [None]:
!pip install sentencepiece transformers

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetens

In [None]:
import torch
from torch import nn
import torch.nn.functional as tf
from torch.utils.data import Dataset, DataLoader

In [None]:
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
from transformers.tokenization_utils_base import BatchEncoding

In [None]:
import os

In [None]:
# Load the tokenizer for the "xlm-roberta-base" checkpoint.
# NOTE(review): the commented-out line below refers to the *large* checkpoint and leaves
# `model` undefined at this point of the notebook.
# model = AutoModel.from_pretrained("xlm-roberta-large", output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
class FeedForwardLayer(nn.Module):
	"""Two-layer MLP applied to the last dimension: Linear -> GELU -> Dropout -> Linear.

	Args:
		in_units: size of the last dimension of the input.
		hidden_units: width of the intermediate projection.
		out_units: size of the last dimension of the output.
		dropout_rate: dropout probability applied after the activation.
	"""

	def __init__(self, in_units, hidden_units, out_units, dropout_rate=0.1):
		super().__init__()

		# Attribute names define the state_dict keys, so they are kept stable.
		self.fc = nn.Linear(in_features=in_units, out_features=hidden_units)
		self.drop = nn.Dropout(p=dropout_rate)
		self.out_fc = nn.Linear(in_features=hidden_units, out_features=out_units)

	def forward(self, inputs):
		"""Project `inputs` [..., in_units] to [..., out_units]."""
		activated = tf.gelu(self.fc(inputs))
		return self.out_fc(self.drop(activated))

In [None]:
class MergeMultiHeadAttention(nn.Module):
	"""Bidirectional multi-head dot-product attention, pooled to one vector per direction.

	Both inputs are 4D, `[N, n, T, feature_units]`. They are projected to
	`attention_units`, split into `num_heads` heads, and scored against each other.
	Two pooled summaries are returned:

	* `attn_out1`: every query position attends over the keys; the per-query results are
	  then averaged with weights obtained from a softmax over the query positions.
	* `attn_out2`: the mirror image, pooling the query projections over key positions.

	Masks follow the Hugging Face `attention_mask` convention (1 = real token,
	0 = padding), which is what `QaModel.forward` passes in. Padded positions get
	(numerically) zero weight in every softmax and are left out of the score sums, so the
	result does not depend on what the padded positions contain.

	Args:
		feature_units: size of the last dimension of the inputs.
		attention_units: total projection width; must be divisible by `num_heads`.
		num_heads: number of attention heads.
	"""

	# Additive bias that drives masked logits to ~zero probability after softmax.
	MASK_BIAS = -1e9

	def __init__(self, feature_units, attention_units, num_heads):
		super(MergeMultiHeadAttention, self).__init__()
		self.num_heads = num_heads
		self.attention_units = attention_units

		assert attention_units % self.num_heads == 0

		self.depth = attention_units // self.num_heads

		self.Wq = nn.Linear(feature_units, attention_units)
		self.Wk = nn.Linear(feature_units, attention_units)

	def split_heads(self, x):
		"""Reshape [N, n, T, attention_units] into [N, n, num_heads, T, depth]."""
		N, n, T, _ = x.shape

		x = x.view(N, n, T, self.num_heads, -1)
		return torch.transpose(x, 3, 2)

	def merge_heads(self, x):
		"""Flatten [N, n, num_heads, depth] into [N, n, num_heads * depth]."""
		N, n, h, _ = x.shape
		# reshape (not view) so a non-contiguous input cannot raise.
		return x.reshape(N, n, -1)

	@staticmethod
	def _keep_mask(mask, scores, token_axis):
		"""Return `mask` as a float keep-mask broadcastable to `scores`, or None.

		`mask` is [N, T] with 1 for real tokens; `token_axis` (-2 for queries, -1 for
		keys) says which axis of `scores` [N, n, h, Tq, Tk] the T dimension lines up with.
		"""
		if mask is None:
			return None
		keep = mask.to(device=scores.device, dtype=scores.dtype)
		if token_axis == -2:
			return keep[:, None, None, :, None]
		return keep[:, None, None, None, :]

	def forward(self, q, k, q_mask=None, k_mask=None):
		"""Compute both pooled attention summaries.

		Args:
			q: [N, n, Tq, feature_units]
			k: [N, n, Tk, feature_units]
			q_mask: optional [N, Tq], 1 for real tokens and 0 for padding.
			k_mask: optional [N, Tk], same convention.
		Returns:
			Tuple `(attn_out1, attn_out2)`, each [N, n, attention_units].
		"""
		wq = self.split_heads(self.Wq(q))  # [N, n, h, Tq, D]
		wk = self.split_heads(self.Wk(k))  # [N, n, h, Tk, D]
		q_transposed = torch.transpose(wq, -1, -2)  # [N, n, h, D, Tq]
		k_transposed = torch.transpose(wk, -1, -2)  # [N, n, h, D, Tk]

		# Scaled by the full projection width (attention_units), as in the original code.
		scores = torch.matmul(wq, k_transposed) / (self.attention_units ** 0.5)  # [N, n, h, Tq, Tk]

		q_keep = self._keep_mask(q_mask, scores, token_axis=-2)
		k_keep = self._keep_mask(k_mask, scores, token_axis=-1)

		# Biases are built out-of-place: the two masked views of `scores` must stay
		# independent (an in-place `+=` on aliases would apply both masks to both).
		q_bias = 0.0 if q_keep is None else (1.0 - q_keep) * self.MASK_BIAS
		k_bias = 0.0 if k_keep is None else (1.0 - k_keep) * self.MASK_BIAS

		q_weights = torch.softmax(scores + k_bias, dim=-1)  # each query attends over real keys
		k_weights = torch.softmax(scores + q_bias, dim=-2)  # each key attends over real queries

		# Pooling weights: sum the scores over the *real* positions of the other axis,
		# then softmax over the real positions of this axis.
		row_scores = scores if k_keep is None else scores * k_keep
		col_scores = scores if q_keep is None else scores * q_keep
		q_pos_weights = torch.softmax(torch.sum(row_scores, dim=-1, keepdim=True) + q_bias, dim=-2)  # [N, n, h, Tq, 1]
		k_pos_weights = torch.softmax(torch.sum(col_scores, dim=-2, keepdim=True) + k_bias, dim=-1)  # [N, n, h, 1, Tk]

		attn_out1 = torch.matmul(q_weights, wk) * q_pos_weights  # [N, n, h, Tq, D]
		attn_out2 = torch.matmul(q_transposed, k_weights) * k_pos_weights  # [N, n, h, D, Tk]

		attn_out1 = self.merge_heads(torch.sum(attn_out1, dim=-2))
		attn_out2 = self.merge_heads(torch.sum(attn_out2, dim=-1))

		return (attn_out1, attn_out2)  # each [N, n, attention_units]


In [None]:
class MergeAdditiveAttention(nn.Module):
	"""Additive (tanh) attention that pools a sequence into a single vector.

	Each query position attends over the keys (softmax over the key axis); the
	per-query results are then averaged with weights from a softmax, over the query
	axis, of each query's summed scores. The output is therefore a convex combination
	of the rows of `key`, which is also used as the value.

	The previous implementation normalised the weights over the query axis and then
	summed over that same axis, which cancels the weights out: the result was just
	`key.sum(dim=1)` and the learned projections had no effect.

	Args:
		in_units: size of the last dimension of `query` and `key`.
		attention_units: width of the hidden space in which scores are computed.
	"""

	def __init__(self, in_units, attention_units: int):
		super(MergeAdditiveAttention, self).__init__()
		self.Wq = nn.Linear(in_units, attention_units)
		self.Wk = nn.Linear(in_units, attention_units)
		self.fc = nn.Linear(attention_units, 1)

	def _calculate_scores(self, query, key):
		"""Calculates attention scores as a nonlinear sum of query and key.

		Args:
			query: Query tensor of shape `[batch_size, Tq, dim]`.
			key: Key tensor of shape `[batch_size, Tv, dim]`.
		Returns:
			Tensor of shape `[batch_size, Tq, Tv]`.
		"""
		# Insert singleton axes so the sum broadcasts to [batch_size, Tq, Tv, dim].
		q_reshaped = torch.unsqueeze(query, dim=-2)  # [batch_size, Tq, 1, dim]
		k_reshaped = torch.unsqueeze(key, dim=-3)  # [batch_size, 1, Tv, dim]

		x = self.fc(torch.tanh(q_reshaped + k_reshaped))  # [batch_size, Tq, Tv, 1]
		return x.squeeze(-1)  # [batch_size, Tq, Tv]

	def forward(self, query, key):
		"""Pool `key` into one vector per batch item.

		Args:
			query: `[batch_size, Tq, in_units]`.
			key: `[batch_size, Tv, in_units]`; doubles as the value.
		Returns:
			Tensor of shape `[batch_size, in_units]`.
		"""
		w_q = self.Wq(query)
		w_k = self.Wk(key)
		value = key

		scores = self._calculate_scores(w_q, w_k)  # [batch_size, Tq, Tv]
		# How each query position distributes its attention over the keys.
		weights = torch.softmax(scores, dim=-1)  # [batch_size, Tq, Tv]
		# How much each query position contributes to the pooled result.
		coef_weights = torch.softmax(torch.sum(scores, dim=-1, keepdim=True), dim=-2)  # [batch_size, Tq, 1]

		x = torch.bmm(weights, value) * coef_weights  # [batch_size, Tq, in_units]
		return torch.sum(x, dim=1)


In [None]:
class Classifier(nn.Module):
  """Classification head: bidirectional attention -> MLP -> additive pooling -> MLP.

  Args:
    feature_units: size of the incoming feature vectors.
    attention_units: projection width used by both attention modules.
    hidden_units: hidden width of the two feed-forward blocks.
    num_heads: number of heads in the multi-head attention.
    num_outputs: number of logits produced per example.
  """

  def __init__(self, feature_units, attention_units, hidden_units, num_heads, num_outputs):
    super().__init__()
    # Creation order and attribute names are kept as-is: they determine seeded
    # initialisation and the state_dict keys.
    self.mmha = MergeMultiHeadAttention(feature_units, attention_units, num_heads)
    self.additive_attention = MergeAdditiveAttention(feature_units, attention_units)
    self.ffn = FeedForwardLayer(attention_units, hidden_units, feature_units)
    self.layer_norm = nn.LayerNorm(feature_units)  # registered, but not applied in forward()
    self.ffn_out = FeedForwardLayer(feature_units, hidden_units, num_outputs)

  def forward(self, q, k, q_mask, k_mask):
    """Return logits of shape [N, num_outputs] for the (q, k) pair."""
    first_summary, second_summary = self.mmha(q, k, q_mask, k_mask)  # two tensors, [N, n, attention_units] each
    stacked = torch.cat((first_summary, second_summary), dim=1)  # [N, 2n, attention_units]
    projected = self.ffn(stacked)  # [N, 2n, feature_units]
    pooled = self.additive_attention(projected, projected)  # [N, feature_units]
    return self.ffn_out(pooled)

In [None]:
# Smoke test of the classification head on random features (no language model involved).
classifier = Classifier(768, 1024, 2048, 8, 2)

In [None]:
# [N,n,Tq,H]
# Random stand-ins for the two feature tensors; they differ in sequence length (6 vs 10).
a = torch.randn([2, 4, 6, 768])
b = torch.randn([2, 4, 10, 768])
# 0/1 masks with one row per batch item and one column per sequence position.
a_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                       [1, 1, 1, 1, 1, 1]])
b_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [None]:
# NOTE(review): the masks built above are not passed (both mask arguments are None),
# so this call does not exercise the masking path.
cls_outs = classifier(a, b, None, None)  #a_mask, b_mask)

In [None]:
cls_outs

tensor([[-0.0074,  0.0095],
        [-0.0056,  0.0122]], grad_fn=<AddmmBackward0>)

In [None]:
# Total number of parameters in the classification head.
sum(p.numel() for p in classifier.parameters())

8404227

In [None]:
class QaModel(nn.Module):
  """Question/text pair classifier built on top of a shared language model.

  The same `language_model` encodes the question and the text. The hidden states of
  its last four layers are stacked and handed to a `Classifier` head together with
  the two attention masks.
  """

  def __init__(self, model):
    super().__init__()
    self.language_model = model
    self.classifier = Classifier(768, 1024, 2048, 8, 2)
    # self.set_trainable(False)

  def set_trainable(self, value):
    """Set `requires_grad` to `value` on every language-model parameter."""
    for weight in self.language_model.parameters():
      weight.requires_grad = value

  def forward(self, q_inputs, t_inputs):
    """Return the classifier logits for a batch of (question, text) encodings.

    Both arguments are unpacked as keyword arguments into the language model and
    must also provide an "attention_mask" entry.
    """
    def last_four_layers(encoded):
      # Stack the last four hidden-state tensors along a new dim 1.
      hidden_states = self.language_model(**encoded)["hidden_states"]
      return torch.stack(hidden_states[-4:], dim=1)

    q_features = last_four_layers(q_inputs)
    t_features = last_four_layers(t_inputs)
    return self.classifier(q_features, t_features, q_inputs["attention_mask"], t_inputs["attention_mask"])

  def save(self, model_name: str, weights_only: bool = True):
    """Write the model under `save/<model_name>/`.

    Args:
      model_name: name of the sub-directory of `save` to write into.
      weights_only: when True store only the state dict ("qc-weights.pt"),
        otherwise pickle the whole module ("qc-model.pt").
    """
    save_path = os.path.join("save", model_name)
    if os.path.exists(save_path):
      print(f"There is already a model saved with the name {model_name}, which will be overwritten by new version!")
    else:
      # First save under this name: create the target directory.
      os.makedirs(save_path)

    if weights_only:
      payload, file_name = self.state_dict(), "qc-weights.pt"
    else:
      payload, file_name = self, "qc-model.pt"
    torch.save(payload, os.path.join(save_path, file_name))



In [None]:
# NOTE(review): in the visible notebook `model` is not assigned before this cell (its only
# earlier mention is commented out), so a top-to-bottom run raises NameError here.
qa_model = QaModel(model)

In [None]:
del qa_model

In [None]:
# NOTE(review): `qa_model` was deleted in the previous cell; this only works when cells
# are executed out of order.
sum(p.numel() for p in qa_model.parameters())

570392579

In [None]:
# Parameter count of the bare language model.
# NOTE(review): depends on a `model` that no earlier visible cell defines.
sum(p.numel() for p in model.parameters())

559890432

# Dataset

In [None]:
class QADataset(Dataset):
    """Map-style dataset of (question, text, label) triples, tokenized on access.

    `data` must provide equally indexed sequences under the keys "questions",
    "texts" and "labels". Padding is not applied here.
    """

    def __init__(self, tokenizer, data):
        self.tokenizer = tokenizer
        self.questions, self.texts, self.labels = (
            data[key] for key in ("questions", "texts", "labels")
        )

    def __len__(self):
        """Number of examples (taken from the question list)."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return `(question_encoding, text_encoding, label)` for example `idx`."""
        question, text, label = self.questions[idx], self.texts[idx], self.labels[idx]
        encode = self.tokenizer
        return encode(question, truncation=True), encode(text, truncation=True), label


def read_qa_data(input_path):
    """Read a tab-separated file of `question<TAB>text<TAB>label` lines.

    Blank lines are skipped. A label is 1 when the (stripped) label field is exactly
    "true" and 0 otherwise.

    Args:
        input_path: path to a UTF-8 encoded file.
    Returns:
        Dict with the parallel lists "questions", "texts" and "labels".
    Raises:
        ValueError: if a non-blank line does not have exactly three fields; the
            message names the file and line number.
    """
    labels = []
    questions = []
    texts = []

    with open(input_path, "r", encoding='utf-8') as rf:
        for line_no, line in enumerate(rf, start=1):
            record = line.strip()
            if not record:
                # Tolerate empty lines, e.g. a trailing newline at the end of the file.
                continue

            fields = record.split("\t")
            if len(fields) != 3:
                raise ValueError(
                    f"{input_path}:{line_no}: expected 3 tab-separated fields "
                    f"(question, text, label), got {len(fields)}")

            question, text, label = fields
            questions.append(question.strip())
            texts.append(text.strip())
            labels.append(1 if label.strip() == "true" else 0)

    data = {
        "questions": questions,
        "texts": texts,
        "labels": labels
    }

    return data


class DataCollator():
  """Collate `(question_encoding, text_encoding, label)` samples into one batch.

  Questions and texts are padded separately with the same padding function;
  labels become a tensor.
  """

  def __init__(self, tokenizer):
    self.pad_fn = DataCollatorWithPadding(tokenizer, return_tensors="pt")

  def __extract__(self, batch_encoding):
    """Split a list of triples into three parallel lists (questions, texts, labels)."""
    list_q, list_t, labels = [], [], []
    for sample in batch_encoding:
      q, t, label = sample
      list_q += [q]
      list_t += [t]
      labels += [label]

    return list_q, list_t, labels

  def __call__(self, batch_encoding):
    """Return `((padded_questions, padded_texts), labels_tensor)`."""
    questions, texts, targets = self.__extract__(batch_encoding)
    padded = tuple(self.pad_fn(features) for features in (questions, texts))
    return padded, torch.tensor(targets)


def create_QA_dataset(input_path, tokenizer):
    """Read the tab-separated file at `input_path` and wrap it in a `QADataset`."""
    return QADataset(tokenizer, read_qa_data(input_path))

In [None]:
# Build the train / validation datasets from the tab-separated files uploaded to the runtime.
# NOTE(review): the absolute /content paths only exist on the machine this was run on.
train_dataset = create_QA_dataset("/content/train_test_origin_1k_dev.csv", tokenizer)
val_dataset = create_QA_dataset("/content/val_origin_1k.csv", tokenizer)

In [None]:
# One collator serves both loaders: it pads questions and texts per batch.
collator = DataCollator(tokenizer)
# Shuffle the training data every epoch so the batches do not follow the file order;
# the validation loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, collate_fn=collator, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, collate_fn=collator, batch_size=8)

In [None]:
# NOTE(review): these two cells delete the datasets and loaders that later cells
# (step counting, the training loop, the batch peek below) still use; on a
# top-to-bottom run those cells fail with NameError.
del train_dataset
del val_dataset

In [None]:
del train_dataloader
del val_dataloader

In [None]:
# Grab the first validation batch for inspection.
for batch in val_dataloader:
  break

In [None]:
# Second element of the collated batch, i.e. the label tensor.
batch[1]

tensor([0, 1, 1, 0, 0, 1, 1, 0])

# Train

In [None]:
from sklearn.metrics import f1_score, accuracy_score
from torch.optim.lr_scheduler import LRScheduler

In [None]:
class WarmupLinearLR(LRScheduler):
	"""Linear warm-up followed by linear decay of the learning rate.

	In terms of `last_epoch` (the number of `step()` calls made so far):

	* step 0, when warm-up is enabled: `0.1 / warmup_steps` of the current lr, so the
	  very first update is not made with a learning rate of exactly zero;
	* 0 < step < warmup_steps: `initial_lr * step / warmup_steps`;
	* warmup_steps <= step <= max_steps: a straight line from `initial_lr` down to 0
	  at `max_steps`;
	* step > max_steps: the learning rate is left at its current value.

	`max_steps` is chosen so that the decaying line passes through
	`min_proportion * initial_lr` at `total_steps`.

	Args:
		optimizer: wrapped optimizer.
		warmup_steps: number of warm-up steps; 0 disables warm-up.
		total_steps: step at which the lr reaches `min_proportion * initial_lr`.
		min_proportion: fraction of the initial lr reached at `total_steps` (must be < 1).
		last_epoch: index of the last step, -1 to start fresh.
		verbose: forwarded to the base scheduler only when truthy.
	"""

	def __init__(self,
	             optimizer,
	             warmup_steps,
	             total_steps,
	             min_proportion=0.0,
	             last_epoch=-1,
	             verbose=False):

		self.warmup_steps = warmup_steps
		self.max_steps = (total_steps - min_proportion * warmup_steps) / (1.0 - min_proportion)
		if verbose:
			super(WarmupLinearLR, self).__init__(optimizer, last_epoch, verbose)
		else:
			# Leave the base class default alone: newer PyTorch releases deprecate the
			# `verbose` argument, so it is only passed when explicitly requested.
			super(WarmupLinearLR, self).__init__(optimizer, last_epoch)

	def get_lr(self):
		"""Return the learning rate of every param group for the current step."""
		param_groups = self.optimizer.param_groups

		# The special first step divides by warmup_steps, so skip it when there is no warm-up.
		if self.last_epoch == 0 and self.warmup_steps > 0:
			return [group['lr'] * 0.1 / self.warmup_steps for group in param_groups]

		if self.last_epoch > self.max_steps:
			return [group['lr'] for group in param_groups]

		if self.last_epoch < self.warmup_steps:
			return [group['initial_lr'] * self.last_epoch / self.warmup_steps for group in param_groups]
		else:
			return [group['initial_lr'] * (self.max_steps - self.last_epoch) / (self.max_steps - self.warmup_steps) for
			        group in param_groups]

	def _get_closed_form_lr(self):
		"""Closed-form counterpart of `get_lr`, computed from the base learning rates."""
		if self.last_epoch < self.warmup_steps:
			return [base_lr * self.last_epoch / self.warmup_steps for base_lr in self.base_lrs]
		else:
			return [base_lr * (self.max_steps - self.last_epoch) / (self.max_steps - self.warmup_steps) for base_lr
			        in self.base_lrs]


In [None]:
EPOCHS = 20
gradient_accumulation_steps = 5
steps_per_epoch = len(train_dataloader)
# The training loop also performs an optimizer step on the last (possibly partial)
# accumulation group of every epoch, so the per-epoch count is rounded up.
optimizer_steps_per_epoch = -(-steps_per_epoch // gradient_accumulation_steps)
total_steps = EPOCHS * optimizer_steps_per_epoch
# The first 10% of all optimizer steps are used for learning-rate warm-up.
warmup_steps = int(total_steps*0.1)

In [None]:
total_steps  # display the planned number of optimizer steps

8500

In [None]:
warmup_steps  # display the number of warm-up steps

850

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Encoder loaded with output_hidden_states=True; QaModel.forward reads "hidden_states" from its output.
model = AutoModel.from_pretrained("xlm-roberta-base", output_hidden_states=True)

In [None]:
# Wrap the encoder with the classification head and move everything to `device`.
qa_model = QaModel(model).to(device)

In [None]:
# Parameter count of the full model (encoder + head).
sum(p.numel() for p in qa_model.parameters())

286447875

In [None]:
# Parameter count of the encoder alone.
sum(p.numel() for p in model.parameters())

278043648

In [None]:
# NOTE(review): the optimizer setup and the training loop below still use `qa_model`
# (and the loop mentions `model`); running this cell first makes them fail with NameError.
del model
del qa_model

In [None]:
# Parameters whose name contains one of these fragments are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']


def _is_decay_exempt(param_name):
  """Return True when `param_name` contains any fragment listed in `no_decay`."""
  return any(fragment in param_name for fragment in no_decay)


# Two groups, in this order: decayed parameters first, decay-exempt ones second.
optimizer_grouped_parameters = [
  dict(params=[weight for name, weight in qa_model.named_parameters() if not _is_decay_exempt(name)],
       weight_decay=0.001),
  dict(params=[weight for name, weight in qa_model.named_parameters() if _is_decay_exempt(name)],
       weight_decay=0.0),
]

In [None]:
# AdamW over the two parameter groups; each group carries its own weight_decay entry.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, weight_decay=0.0)
# Warm-up / linear-decay schedule, stepped once per optimizer update in the training loop.
scheduler = WarmupLinearLR(optimizer, warmup_steps, total_steps, min_proportion=0.0)

In [None]:
max_grad_norm = 1.0  # max norm handed to clip_grad_norm_ in the training loop
eval_steps = 5  # the training loop evaluates every `eval_steps` optimizer updates
save_checkpoint = True  # save the model whenever the validation F1 improves

In [None]:
def evaluate(model, val_dataloader, val_steps, device):
	"""Run `model` over `val_dataloader` and return aggregate validation metrics.

	Args:
		model: called as `model(question_inputs, text_inputs)`; put into eval mode here
			and left in it.
		val_dataloader: yields `((question_inputs, text_inputs), labels)` batches.
		val_steps: divisor used to average the summed batch losses.
		device: device the inputs and labels are moved to.
	Returns:
		Dict with "loss", "accuracy" and "micro_f1", each rounded to 4 decimals.
		"micro_f1" holds `f1_score` called with its default arguments.
	"""
	model.eval()

	loss_sum = 0.0
	gold_labels = []
	predicted_labels = []

	with torch.no_grad():
		for inputs, labels in val_dataloader:
			question_inputs = inputs[0].to(device)
			text_inputs = inputs[1].to(device)

			logits = model(question_inputs, text_inputs)
			loss_sum += tf.cross_entropy(logits, labels.to(device)).item()

			predicted_labels += torch.argmax(logits, dim=1).cpu().tolist()
			gold_labels += labels.cpu().tolist()

	return {
		"loss": round(loss_sum / val_steps, 4),
		"accuracy": round(accuracy_score(gold_labels, predicted_labels), 4),
		"micro_f1": round(f1_score(gold_labels, predicted_labels), 4)
	}


In [None]:
device  # display the selected device

device(type='cuda')

In [None]:
# Fine-tuning loop with gradient accumulation, periodic validation and
# best-checkpoint saving. Progress goes to stdout and to a log file.
log_path = "/content/logs/train-qc.log"
# open() does not create missing directories, so make sure the log folder exists.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

monitor_f1 = float('-inf')  # best validation F1 seen so far

# qa_model.set_trainable(False)

# The context manager closes (and flushes) the log even when training is interrupted or raises.
with open(log_path, "w") as log_writer:
  log_writer.write("               ***** Start training *****\n")
  log_writer.write("============================================================\n")
  # Take the sample count from the loader that is actually iterated below.
  log_writer.write(f"Num samples: {len(train_dataloader.dataset)}\n")
  log_writer.write(f"Num epochs: {EPOCHS}\n")
  log_writer.write(f"Gradient accumulation steps = {gradient_accumulation_steps}\n")
  log_writer.write("============================================================\n")

  for epoch in range(EPOCHS):

    # Running statistics over the batches seen so far in this epoch.
    total_loss = 0.0
    y_trues = []
    y_preds = []

    log_writer.write("------------------------------------------------------------\n")
    log_writer.write(f"Epoch {epoch + 1:>3d}/{EPOCHS}:\n")

    # Number of optimizer updates in this epoch (restarts every epoch).
    global_steps = 0

    print(f"Epoch \033[92m{epoch + 1:>3d}/{EPOCHS}\033[00m:")

    for step, batch in enumerate(train_dataloader):
      print(f"\r- Step \033[96m{step + 1:>5d}/{steps_per_epoch}\033[00m:", end="")

      # evaluate() switches the model to eval mode, so re-enable training mode every step.
      qa_model.train()

      inputs, labels = batch

      logits = qa_model(inputs[0].to(device), inputs[1].to(device))
      loss = tf.cross_entropy(logits, labels.to(device))

      predicts = torch.argmax(logits, dim=1)
      y_preds.extend(predicts.cpu().numpy().tolist())
      y_trues.extend(labels.cpu().numpy().tolist())

      total_loss += loss.item()
      # Scale so that the accumulated gradient is an average over the group.
      loss /= gradient_accumulation_steps
      loss.backward()

      if (step + 1) % gradient_accumulation_steps == 0 or (step == steps_per_epoch - 1):
        # Clip the gradients of the whole trained model (encoder and classifier head),
        # i.e. the same parameters the optimizer updates.
        torch.nn.utils.clip_grad_norm_(qa_model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        optimizer.zero_grad()
        global_steps += 1
        # if global_steps == 50:
        #   log_writer.write("Fully training\n")
        #   qa_model.set_trainable(True)

        if global_steps % eval_steps == 0 or (step == steps_per_epoch - 1):
          print()
          logging_line = f"- Step: {step + 1:>5d}/{steps_per_epoch}, lr: {scheduler.get_last_lr()}\n"
          log_writer.write(logging_line)
          # print(y_trues)
          # print(y_preds)
          f1_micro = f1_score(y_trues, y_preds)
          accuracy = accuracy_score(y_trues, y_preds)

          train_accumulate_loss = round(total_loss / (step + 1), 4)
          train_accumulate_micro_f1 = round(f1_micro, 4)
          train_accumulate_accuracy = round(accuracy, 4)

          train_result_line = (f"{'loss':8s}: {train_accumulate_loss:<10.4f} - "
                                f"{'accuracy':12s}: {train_accumulate_accuracy:<10.4f} - "
                                f"{'f1':12s}: {train_accumulate_micro_f1:<10.4f}")

          print(f"    \033[95m{'Train result':20s}\033[00m - {train_result_line}")
          log_writer.write(f"    {'Train result':20s} - {train_result_line}\n")

          validation_output = evaluate(qa_model, val_dataloader, len(val_dataloader), device)

          val_result_line = (f"val_loss: {validation_output['loss']:<10.4f} - "
                              f"val_accuracy: {validation_output['accuracy']:<10.4f} - "
                              f"val_f1: {validation_output['micro_f1']:<10.4f}")

          print(f"    \033[95m{'Validation result':20s}\033[00m - {val_result_line}")
          log_writer.write(f"    {'Validation result':20s} - {val_result_line}\n")
          if save_checkpoint:
            # Keep only the checkpoint with the best validation F1 so far.
            if validation_output['micro_f1'] > monitor_f1:
              qa_model.save("xlm-roberta-large-qa")
              log_writer.write(
                f"    # val_f1 improve from {monitor_f1} to {validation_output['micro_f1']}. "
                "Saving model with name \"xlm-roberta-large-qa\"")
              monitor_f1 = validation_output["micro_f1"]

          log_writer.write("\n")

  log_writer.write("                ***** End training *****\n")

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch [92m  1/20[00m:
- Step [96m   25/2125[00m:
    [95mTrain result        [00m - loss    : 0.7005     - accuracy    : 0.4100     - f1          : 0.4381    
    [95mValidation result   [00m - val_loss: 0.7006     - val_accuracy: 0.3707     - val_f1: 0.4628    
There is already a model saved with the name xlm-roberta-large-qa, which will be overwritten by new version!
- Step [96m   50/2125[00m:
    [95mTrain result        [00m - loss    : 0.7010     - accuracy    : 0.4000     - f1          : 0.4231    
    [95mValidation result   [00m - val_loss: 0.6980     - val_accuracy: 0.3716     - val_f1: 0.4632    
There is already a model saved with the name xlm-roberta-large-qa, which will be overwritten by new version!
- Step [96m   75/2125[00m:
    [95mTrain result        [00m - loss    : 0.6981     - accuracy    : 0.4383     - f1          : 0.4556    
    [95mValidation result   [00m - val_loss: 0.6941     - val_accuracy: 0.3896     - val_f1: 0.4534    
- Step [96m  100

In [None]:
# Close the log file by hand, e.g. after the training cell was interrupted.
log_writer.close()

In [None]:
# Scratch cells: check the broadcasting shapes used by the pooled attention.
# [N, n, H, Tq, Tv]
w = torch.randn((2, 4, 8, 14, 80))

In [None]:
# Batched matmul over the last two dims: [..., 14, 80] @ [..., 80, 128].
a1 = torch.randn((2, 4, 8, 14, 80))
a2 = torch.randn((2, 4, 8, 80, 128))

In [None]:
torch.matmul(a1, a2).shape

torch.Size([2, 4, 8, 14, 128])

In [None]:
# Weight each of the 14 rows by its summed score (kept as a trailing singleton dim), then sum the rows away.
torch.sum(torch.matmul(a1, a2) * torch.sum(w, -1, keepdims=True), dim=-2).shape

torch.Size([2, 4, 8, 128])

In [None]:
# Sum over the last axis of `w`, keeping it as a singleton dimension.
bb = torch.sum(w, -1, keepdim=True)

In [None]:
# Display the batch captured earlier from the validation loader.
batch

(({'input_ids': tensor([[     0,  16042, 174999,  56629,   2249, 146182,  26245,  32570,  50572,
             2933,   3941,      2,      1,      1,      1,      1,      1,      1],
          [     0,  21433, 144769,    454,    605, 191269,   3531,    524,  10587,
            19605,   4546,    580,   4062,      2,      1,      1,      1,      1],
          [     0,  90542,  68312,  83073,   3811,   2933,   8609,  60649,      2,
                1,      1,      1,      1,      1,      1,      1,      1,      1],
          [     0,  42812,   7630,  35459, 185165,    912, 122484,  26422,    580,
             4062,      2,      1,      1,      1,      1,      1,      1,      1],
          [     0, 182286, 142721, 136388,    524,   8609,  60649,   8725,  14352,
              544,  62633,      2,      1,      1,      1,      1,      1,      1],
          [     0,  19167,   2494,  16151,   5031,   7674,   7453,  25310,  63748,
            23598,    449,   2059,   3042,    308,    580,   1300,  

In [None]:
# Sum the softmax taken over dim -2 along that same dim.
torch.sum(torch.softmax(bb, dim=-2), dim=-2)

In [None]:
x = torch.randn(2, 2, requires_grad=True)
y = x.view(4)  # Reshape the tensor with view
print(y.grad_fn)  # autograd node of the view; the recorded output shows ViewBackward0

<ViewBackward0 object at 0x7e4777682ef0>


In [None]:
# Scratch cells: run a small linear layer and convert its output to NumPy.
FC_layer = nn.Linear(16, 1)

In [None]:
inp = torch.randn(2, 16)

In [None]:
dhf = FC_layer(inp)

In [None]:
# detach() drops the autograd graph before the NumPy conversion.
# NOTE(review): the recorded output of this cell is a RuntimeError; its cause is not visible here.
dhf.cpu().detach().numpy()

RuntimeError: ignored

In [None]:
# Scratch cell: inspect the grad_fn recorded by permute/flatten, transpose and matmul.
p = torch.ones((3,4), requires_grad = True)

p1 = p.permute(1,0).flatten().contiguous()
p2 = p.transpose(1,0)
p3 = torch.matmul(p, p2)
p1, p2, p3

(tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        grad_fn=<UnsafeViewBackward0>),
 tensor([[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]], grad_fn=<TransposeBackward0>),
 tensor([[4., 4., 4.],
         [4., 4., 4.],
         [4., 4., 4.]], grad_fn=<MmBackward0>))

# ___

In [None]:
import torch.nn.functional as torch_fn

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Scratch cells: compare the module form and the functional form of GELU on the same input.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the session.
m = nn.GELU()
input = torch.randn(2)
output = m(input)

In [None]:
output2 = torch_fn.gelu(input)

In [None]:
output  # result of the nn.GELU module

tensor([ 0.1217, -0.0480])

In [None]:
output2  # result of the functional gelu call

tensor([ 0.1217, -0.0480])

In [None]:
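# Notes / TODO (free text, kept as comments so the cell does not raise a SyntaxError):
# - BM25 -> score ->
# - Add more questions (original note: "Thêm câu hỏi")
# - Top > 0.95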
