In [1]:
import torch
import numpy as np

## gpu_ids

In [2]:
def get_available_devices():
    """Pick the main torch device and enumerate the visible GPUs.

    When CUDA is available, the first GPU becomes the main device and is
    also made the current CUDA device. Otherwise the CPU is used.

    Returns:
        device (torch.device): Main device (GPU 0 or CPU).
        gpu_ids (list): IDs of all available GPUs (empty when CUDA is unavailable).
    """
    # CPU-only machine: nothing to enumerate.
    if not torch.cuda.is_available():
        return torch.device('cpu'), []

    gpu_ids = list(range(torch.cuda.device_count()))
    device = torch.device(f'cuda:{gpu_ids[0]}')
    # Make the main GPU the default target for subsequent CUDA allocations.
    torch.cuda.set_device(device)
    return device, gpu_ids

In [3]:
get_available_devices()

(device(type='cpu'), [])

## npz 文件的用法？

In [4]:
# Open the pre-processed test split (an .npz archive of named arrays).
# NOTE(review): the name `data` is rebound by `import torch.utils.data as data`
# in a later cell, so this handle is lost once that cell runs.
data = np.load("data/test.npz")

## class SQuAD

- SQuAD类继承自torch.utils.data.Dataset
- 初始化参数包括data_path和use_v2(bool)
- 功能是把np.load("?.npz")的一个dict转化为类。

In [5]:
import torch.utils.data as data

class SQuAD(data.Dataset):
    """Stanford Question Answering Dataset (SQuAD).

    Each item in the dataset is a tuple with the following entries (in order):
        - context_idxs: Indices of the words in the context.
            Shape (context_len,).
        - context_char_idxs: Indices of the characters in the context.
            Shape (context_len, max_word_len).
        - question_idxs: Indices of the words in the question.
            Shape (question_len,).
        - question_char_idxs: Indices of the characters in the question.
            Shape (question_len, max_word_len).
        - y1: Index of word in the context where the answer begins.
            -1 if no answer (0 when `use_v2` is set, see below).
        - y2: Index of word in the context where the answer ends.
            -1 if no answer (0 when `use_v2` is set, see below).
        - id: ID of the example.

    When `use_v2` is True, one extra token with index 1 is prepended to every
    context and question (word and character level) and `y1`/`y2` are shifted
    by one, so a no-answer label of -1 becomes position 0, i.e. it points at
    the prepended token.

    Args:
        data_path (str): Path to .npz file containing pre-processed dataset.
        use_v2 (bool): Whether to use SQuAD 2.0 questions. Otherwise only use SQuAD 1.1.
    """
    def __init__(self, data_path, use_v2=True):
        super(SQuAD, self).__init__()

        # np.load on an .npz returns an NpzFile that holds the archive open;
        # the context manager closes it once every array has been read.
        with np.load(data_path) as dataset:
            self.context_idxs = torch.from_numpy(dataset['context_idxs']).long()
            self.context_char_idxs = torch.from_numpy(dataset['context_char_idxs']).long()
            self.question_idxs = torch.from_numpy(dataset['ques_idxs']).long()
            self.question_char_idxs = torch.from_numpy(dataset['ques_char_idxs']).long()
            self.y1s = torch.from_numpy(dataset['y1s']).long()
            self.y2s = torch.from_numpy(dataset['y2s']).long()
            self.ids = torch.from_numpy(dataset['ids']).long()

        if use_v2:
            # SQuAD 2.0: Use index 0 for no-answer token (token 1 = OOV)
            num_examples, _, w_len = self.context_char_idxs.size()

            # Word level: one extra leading token per example.
            ones = torch.ones((num_examples, 1), dtype=torch.int64)
            self.context_idxs = torch.cat((ones, self.context_idxs), dim=1)
            self.question_idxs = torch.cat((ones, self.question_idxs), dim=1)

            # Character level: the extra token is a full row of ones.
            ones = torch.ones((num_examples, 1, w_len), dtype=torch.int64)
            self.context_char_idxs = torch.cat((ones, self.context_char_idxs), dim=1)
            self.question_char_idxs = torch.cat((ones, self.question_char_idxs), dim=1)

            # Shift the labels to account for the prepended token (-1 -> 0).
            self.y1s += 1
            self.y2s += 1

        # SQuAD 1.1: Ignore no-answer examples
        if use_v2:
            self.valid_idxs = list(range(len(self.ids)))
        else:
            # One bulk conversion instead of a per-element .item() call.
            self.valid_idxs = [idx for idx, y1 in zip(range(len(self.ids)), self.y1s.tolist())
                               if y1 >= 0]

    def __getitem__(self, idx):
        # Map the public index onto the underlying row (skips filtered examples).
        idx = self.valid_idxs[idx]
        example = (self.context_idxs[idx],
                   self.context_char_idxs[idx],
                   self.question_idxs[idx],
                   self.question_char_idxs[idx],
                   self.y1s[idx],
                   self.y2s[idx],
                   self.ids[idx])

        return example

    def __len__(self):
        return len(self.valid_idxs)

In [25]:
devdata = SQuAD("data/dev.npz", use_v2=True)

In [27]:
devdata.y1s

tensor([35, 22, 56,  ...,  0,  0,  0])

In [28]:
devdata_np = np.load("data/dev.npz")
[devdata_np[i].shape for i in devdata_np.keys()]

[(5951, 400),
 (5951, 400, 16),
 (5951, 50),
 (5951, 50, 16),
 (5951,),
 (5951,),
 (5951,)]

In [29]:
list(devdata_np.keys())

['context_idxs',
 'context_char_idxs',
 'ques_idxs',
 'ques_char_idxs',
 'y1s',
 'y2s',
 'ids']

In [41]:
devdata_np['y2s'][0]

34

In [46]:
# Question 0: its 7 words, each padded to a maximum word length of 16 characters
devdata_np['ques_char_idxs'][0][:7]

array([[64,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [14, 48, 17, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  5, 43,  6, 19, 18,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [10, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [30,  5, 18, 34, 17,  6, 42,  4,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  5,  7, 17, 19,  3, 42,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [48]:
# With use_v2=True an extra leading OOV row (all ones) is prepended,
# so 8 rows are needed to show the same 7 words as in the raw array.
devdata.question_char_idxs[0][:8]

tensor([[ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [64,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [14, 48, 17, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 7,  5, 43,  6, 19, 18,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [10, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [30,  5, 18, 34, 17,  6, 42,  4,  0,  0,  0,  0,  0,  0,  0,  0],
        [12,  5,  7, 17, 19,  3, 42,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [50]:
devdata.__len__()

5951

## collate_fn

In [27]:
def collate_fn(examples):
    """Collate individual `SQuAD.__getitem__` examples into batch tensors.

    Examples of different length are merged by trimming each one to its
    non-padding extent and then padding to the longest extent in the batch.

    Args:
        examples (list): List of tuples of the form (context_idxs, context_char_idxs,
        question_idxs, question_char_idxs, y1s, y2s, ids).

    Returns:
        examples (tuple): Tuple of tensors (context_idxs, context_char_idxs, question_idxs,
        question_char_idxs, y1s, y2s, ids). All of shape (batch_size, ...), where
        the remaining dimensions are the maximum length of examples in the input.

    Adapted from:
        https://github.com/yunjey/seq2seq-dataloader
    """
    def merge_0d(scalars, dtype=torch.int64):
        # One scalar per example -> tensor of shape (batch_size,).
        return torch.tensor(scalars, dtype=dtype)

    def merge_1d(arrays, dtype=torch.int64, pad_value=0):
        # A sequence's length is the number of entries that differ from pad_value.
        lengths = [int((seq != pad_value).sum()) for seq in arrays]
        batch = torch.zeros(len(arrays), max(lengths), dtype=dtype)
        # Iterating a tensor yields views, so writing to `row` fills `batch`.
        for row, seq, length in zip(batch, arrays, lengths):
            row[:length] = seq[:length]
        return batch

    def merge_2d(matrices, dtype=torch.int64, pad_value=0):
        # Rows / columns whose sum differs from pad_value count as occupied.
        heights = [int((mat.sum(1) != pad_value).sum()) for mat in matrices]
        widths = [int((mat.sum(0) != pad_value).sum()) for mat in matrices]
        batch = torch.zeros(len(matrices), max(heights), max(widths), dtype=dtype)
        for slot, mat, n_rows, n_cols in zip(batch, matrices, heights, widths):
            slot[:n_rows, :n_cols] = mat[:n_rows, :n_cols]
        return batch

    # Transpose the list of examples into one tuple per field.
    (context_idxs, context_char_idxs,
     question_idxs, question_char_idxs,
     y1s, y2s, ids) = zip(*examples)

    # Merge each field into its batch tensor.
    return (merge_1d(context_idxs),
            merge_2d(context_char_idxs),
            merge_1d(question_idxs),
            merge_2d(question_char_idxs),
            merge_0d(y1s),
            merge_0d(y2s),
            merge_0d(ids))


In [36]:
# NOTE(review): `testdata` is not defined in any visible cell — presumably a
# SQuAD dataset built from data/test.npz; confirm and define it before this cell
# so the notebook survives Restart & Run All.
# Passing the whole dataset collates every example into one padded batch.
batch_tensors_padded = collate_fn(testdata)

In [37]:
[i.size() for i in batch_tensors_padded]

[torch.Size([5915, 414]),
 torch.Size([5915, 414, 16]),
 torch.Size([5915, 40]),
 torch.Size([5915, 40, 16]),
 torch.Size([5915]),
 torch.Size([5915]),
 torch.Size([5915])]

In [54]:
# Two toy records, each an (int, str, list) triple.
examples = ((1, "str", [3]), (2, "str2", [100]))
example1, example2 = examples
# zip(*examples) transposes the records into one tuple per field.
ints, strs, lists = zip(*examples)

In [56]:
ints, strs, lists

((1, 2), ('str', 'str2'), ([3], [100]))

In [58]:
model = torch.nn.Linear(5,4)
dict(model.named_parameters())

{'weight': Parameter containing:
 tensor([[-0.4117, -0.4105, -0.2888,  0.0490,  0.0045],
         [ 0.3999,  0.0009,  0.1255, -0.4044,  0.2461],
         [ 0.2509,  0.1652,  0.1981, -0.3752, -0.3934],
         [ 0.4258,  0.0363,  0.0656,  0.2169, -0.1024]], requires_grad=True),
 'bias': Parameter containing:
 tensor([-0.1324, -0.4426,  0.3663,  0.3982], requires_grad=True)}

## Logging

In [60]:
import logging

In [72]:
# Obtain the logger through logging.getLogger rather than instantiating
# logging.Logger directly: getLogger registers the logger by name (repeated
# calls return the same object) and attaches it to the root logger hierarchy.
log = logging.getLogger("a")

In [79]:
log.warning("b")

b


## 读取model类名称

In [81]:
model.__class__.__name__

'Linear'

## os

In [83]:
import os

In [89]:
os.path.join("/data", "1.tar")

'/data/1.tar'

## discretize

In [108]:
# Mask over (start, end) pairs: 1 where start <= end and end - start < max_len.
c_len, max_len = 5, 4
is_legal_pair = (
    torch.triu(torch.ones((c_len, c_len)))                        # end >= start
    - torch.triu(torch.ones((c_len, c_len)), diagonal=max_len)    # end - start >= max_len
)
is_legal_pair

tensor([[1., 1., 1., 1., 0.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])

In [127]:
is_legal_pair = torch.triu(torch.ones((c_len, c_len)), diagonal = 2)

In [128]:
# Random stand-in for a batch of 3 score matrices of shape (5, 5).
# NOTE(review): no torch.manual_seed is set, so the values displayed below
# will differ on every run.
p_joint = torch.randn((3, 5, 5))
# The 2-D mask broadcasts over the leading batch dimension; entries where the
# mask is 0 are zeroed in place (shown as 0.0000 / -0.0000 in the output).
p_joint *= is_legal_pair
p_joint

tensor([[[-0.0000,  0.0000, -1.2679,  1.3683,  0.3288],
         [ 0.0000, -0.0000, -0.0000, -0.3547,  1.1280],
         [-0.0000, -0.0000,  0.0000, -0.0000,  1.1315],
         [-0.0000,  0.0000,  0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000, -0.0000,  0.0000, -0.0000]],

        [[-0.0000,  0.0000, -0.3493,  0.3470, -1.1746],
         [ 0.0000,  0.0000,  0.0000, -1.5943, -0.3677],
         [ 0.0000, -0.0000,  0.0000, -0.0000, -2.3150],
         [-0.0000, -0.0000,  0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  1.5740,  0.0476,  0.5404],
         [ 0.0000,  0.0000,  0.0000, -0.5221,  0.6724],
         [-0.0000,  0.0000, -0.0000,  0.0000, -0.1659],
         [-0.0000,  0.0000,  0.0000, -0.0000, -0.0000],
         [ 0.0000, -0.0000, -0.0000,  0.0000,  0.0000]]])

In [138]:
# torch.max along a dim returns (values, indices); only the values are kept.
max_in_row = torch.max(p_joint, dim=2).values
max_in_col = torch.max(p_joint, dim=1).values

In [139]:
torch.argmax(max_in_col, dim=-1)

tensor([3, 3, 2])

In [143]:
max_in_col

tensor([[-0.0000, 0.0000, -0.0000, 1.3683, 1.1315],
        [-0.0000, 0.0000, 0.0000, 0.3470, 0.0000],
        [0.0000, -0.0000, 1.5740, 0.0476, 0.6724]])

In [142]:
max_prob, _ = torch.max(max_in_col, dim=-1)
max_prob

tensor([1.3683, 0.3470, 1.5740])

In [146]:
'str asv'.split()

['str', 'asv']

## char_embedding layer

In [4]:
import json

In [8]:
# JSON text is UTF-8 by specification, so pin the encoding instead of relying
# on the platform default (which is not UTF-8 on every system).
with open("data/char2idx.json", encoding="utf-8") as fr:
    char_emb = json.load(fr)

In [12]:
len(char_emb)

1376

In [20]:
# `util` is only imported in a later cell of this notebook; importing it here
# keeps this cell runnable on a fresh kernel (Restart & Run All).
import util
char_vectors = util.torch_from_json("data/char_emb.json")

In [22]:
char_vectors.size()

torch.Size([1376, 64])

## word_vectors

In [15]:
import util
word_vectors = util.torch_from_json('./data/word_emb.json')

In [17]:
word_vectors.size()

torch.Size([88714, 300])

## masked softmax

In [11]:
# cw_idxs: (batch_size, c_len, word_dim)
# cc_idxs: (batch_size, c_len, word_len, char_dim)
batch_size, c_len, word_dim = 5, 10, 15
cw_idxs = torch.randn((batch_size, c_len, word_dim))
# Element-wise mask: True wherever the entry is non-zero.
c_mask = cw_idxs != torch.zeros_like(cw_idxs)
c_mask.size()

torch.Size([5, 10, 15])