In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [28]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from dltime.data.ts_datasets import get_max_seq_len, tsMinMaxNormlizer, dataframe2ndarray, uniform_scaling
from sklearn.utils import shuffle
from sktime.datasets import load_UCR_UEA_dataset

In [3]:
def noise_mask(X, masking_ratio, lm=3, mode='separate', distribution='geometric', exclude_feats=None):
    """
    Creates a random boolean mask of the same shape as X, with 0s at places where a feature should be masked.
    Args:
        X: (seq_length, feat_dim) numpy array of features corresponding to a single sample
        masking_ratio: proportion of seq_length to be masked. At each time step, will also be the proportion of
            feat_dim that will be masked on average
        lm: average length of masking subsequences (streaks of 0s). Used only when `distribution` is 'geometric'.
        mode: whether each variable should be masked separately ('separate'), or all variables at a certain positions
            should be masked concurrently ('concurrent')
        distribution: whether each mask sequence element is sampled independently at random, or whether
            sampling follows a markov chain (and thus is stateful), resulting in geometric distributions of
            masked squences of a desired mean length `lm`
        exclude_feats: iterable of indices corresponding to features to be excluded from masking (i.e. to remain all 1s)

    Returns:
        boolean numpy array with the same shape as X, with 0s at places where a feature should be masked
    """
    if exclude_feats is not None:
        exclude_feats = set(exclude_feats)

    if distribution == 'geometric':  # stateful (Markov chain)
        if mode == 'separate':  # each variable (feature) is independent
            mask = np.ones(X.shape, dtype=bool)
            for m in range(X.shape[1]):  # feature dimension
                if exclude_feats is None or m not in exclude_feats:
                    mask[:, m] = geom_noise_mask_single(X.shape[0], lm, masking_ratio)  # time dimension
        else:  # replicate across feature dimension (mask all variables at the same positions concurrently)
            mask = np.tile(np.expand_dims(geom_noise_mask_single(X.shape[0], lm, masking_ratio), 1), X.shape[1])
    else:  # each position is independent Bernoulli with p = 1 - masking_ratio
        if mode == 'separate':
            mask = np.random.choice(np.array([True, False]), size=X.shape, replace=True,
                                    p=(1 - masking_ratio, masking_ratio))
        else:
            mask = np.tile(np.random.choice(np.array([True, False]), size=(X.shape[0], 1), replace=True,
                                            p=(1 - masking_ratio, masking_ratio)), X.shape[1])

    return mask

def geom_noise_mask_single(L, lm, masking_ratio):
    """
    Randomly create a boolean mask of length `L`, consisting of subsequences of average length lm, masking with 0s a `masking_ratio`
    proportion of the sequence L. The length of masking subsequences and intervals follow a geometric distribution.
    Args:
        L: length of mask and sequence to be masked
        lm: average length of masking subsequences (streaks of 0s)
        masking_ratio: proportion of L to be masked

    Returns:
        (L,) boolean numpy array intended to mask ('drop') with 0s a sequence of length L
    """
    keep_mask = np.ones(L, dtype=bool)
    p_m = 1 / lm  # probability of each masking sequence stopping. parameter of geometric distribution.
    p_u = p_m * masking_ratio / (1 - masking_ratio)  # probability of each unmasked sequence stopping. parameter of geometric distribution.
    p = [p_m, p_u]

    # Start in state 0 with masking_ratio probability
    state = int(np.random.rand() > masking_ratio)  # state 0 means masking, 1 means not masking
    for i in range(L):
        keep_mask[i] = state  # here it happens that state and masking value corresponding to state are identical
        if np.random.rand() < p[state]:
            state = 1 - state

    return keep_mask

In [4]:
def get_total_mask_len(seq):
    return np.sum(~seq)

def get_each_mask_len(seq):
    s_len = 0
    ans = []
    for i, s in enumerate(seq):
        if not s and not s_len:
            s_len = 1
        elif not s and s_len:
            s_len += 1
            if i == len(seq) -1:
                ans.append(s_len)
        elif s and s_len:
            ans.append(s_len)
            s_len = 0
        else:
            s_len = 0
    
    return ans

In [6]:
test_mask = []
for _ in range(10000):
    test_mask.append(geom_noise_mask_single(64, 5, 0.2))

In [7]:
total_len = [get_total_mask_len(i) for i in test_mask]

In [8]:
np.mean(total_len)

12.7911

In [9]:
total_sep_len = []
for m in test_mask:
    total_sep_len += get_each_mask_len(m)

In [10]:
np.mean(total_sep_len)

4.77951486521951

# MLM Dataset Test

In [18]:
X = np.random.randn(256, 144)
lm_mask = torch.from_numpy(noise_mask(X, 0.2))
X_tensor = torch.from_numpy(X)

In [19]:
lm_mask

tensor([[False, False,  True,  ..., False,  True,  True],
        [False, False,  True,  ..., False,  True,  True],
        [ True, False,  True,  ..., False,  True,  True],
        ...,
        [ True, False,  True,  ...,  True,  True,  True],
        [ True, False,  True,  ..., False,  True, False],
        [False,  True,  True,  ..., False,  True,  True]])

In [13]:
X_tensor = torch.from_numpy(X)

In [20]:
X_tensor.masked_fill(~lm_mask, -1)

tensor([[-1.0000, -1.0000,  0.1825,  ..., -1.0000,  0.2916,  0.8913],
        [-1.0000, -1.0000, -0.1287,  ..., -1.0000,  0.5978,  0.6555],
        [-1.2103, -1.0000, -0.7047,  ..., -1.0000, -0.0790, -0.0375],
        ...,
        [-0.4750, -1.0000, -0.1126,  ..., -1.3651,  0.1387,  0.7237],
        [-0.6511, -1.0000, -2.0006,  ..., -1.0000, -0.6534, -1.0000],
        [-1.0000,  1.7302, -2.5641,  ..., -1.0000,  0.3111, -0.3192]],
       dtype=torch.float64)

In [51]:
def dataframe2ndarray(X):
    "X 是具体某一条数据, 而非整个数据集"
    # 1. 统计各维度的数据长度
    all_len = [len(x) for x in X]
    # print(all_len)
    max_len = max(all_len)

    # 2. 统一每一维度的数据长度
    _X = []
    for x in X:
        # print(x)
        # 2.1 如果有缺失值, 进行插补
        if x.isnull().any():
            x = x.interpolate(method='linear', limit_direction='both')

        # 2.2. 如果有维度间的数据长度不相等, 则填充到一致
        if len(x) < max_len:
            x = uniform_scaling(x, max_len)
        _X.append(x)
    _X = np.array(np.transpose(_X))

    return _X

In [127]:


class MLM_UCR_UEADataset(Dataset):
    "Torch Datasets for UCR/UEA archive"

    def __init__(self, name, split=None, pt_ratio=0.5, extract_path="ucr_uea_archive", \
        max_len=256, normalize=None, masking_ratio=0.2, lm=5, mode='separate', distribution='geometric'):

        assert split in ["train", "test", None]
        assert normalize in ["standard", "minmax", None]

        super().__init__()
        self.pt_ratio = pt_ratio
        self.normalize = normalize
        self.masking_ratio = masking_ratio
        self.lm = lm
        self.mode = mode
        self.distribution = distribution

        self.data, _ = load_UCR_UEA_dataset(name, split=split, return_X_y=True, \
            extract_path=extract_path) # x, y => Dataframe
        
        self.data = shuffle(self.data).reset_index(drop=True)
        self.data = self.data.iloc[: int(len(self.data)*self.pt_ratio)]

        self.max_len = max(max_len, get_max_seq_len(self.data) + 1) # 获取最大序列长度
        self.normalizer = tsMinMaxNormlizer(scale=(0.05, 0.95))
        self.normalizer.fit(self.data)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        X = self.data.iloc[idx].copy(deep=True)
        # print(X)
        X = dataframe2ndarray(X)    # dataframe 转 numpy 数组

        # 数据归一化, 均按维度进行归一化
        X = self.normalizer.transform(X)
        lm_mask = ~noise_mask(X, self.masking_ratio, self.lm, self.mode, self.distribution)

        cls = np.ones((1, X.shape[-1])) # [CLS]
        pad = np.zeros((self.max_len - X.shape[0] - 1, X.shape[-1])) # [PAD]
        X = np.concatenate([cls, X, pad], axis=0)

        # padding mask
        padding_mask = [0] + [0] * X.shape[0] + [1] * (self.max_len - X.shape[0] - 1)
        
        # lm_mask
        cls_mask = np.zeros((1, X.shape[-1]), dtype=np.bool) # [CLS]
        pad_mask = pad[:]
        lm_mask = torch.from_numpy(np.concatenate([cls_mask, lm_mask, pad_mask], axis=0)).bool()

        item = {"input": torch.from_numpy(X[:]).masked_fill(lm_mask, -1).float(), \
            "padding_mask": torch.tensor(padding_mask).bool(), 
            "output": torch.from_numpy(X[:]).float(), 
            "lm_mask": lm_mask}
        
        return item

In [128]:
dataset = MLM_UCR_UEADataset("ArticularyWordRecognition", split="train")

In [130]:
item = dataset[0]

In [87]:
from dltime.models.ts_transformer import TSTransformerEncoder, TSTransformerEncoderClassifier

In [101]:
feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, num_classes = \
    9, 256, 256, 2, 2, 512, 3

In [102]:
encoder = TSTransformerEncoder(feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward)

In [103]:
torch.save(encoder.state_dict(), "encoder.pth")

In [163]:
state_dicts = torch.load("D:\\github\dltime-torch\\test_program\\outputs\\ArticularyWordRecognition_best.pth")

In [165]:
for name, parameter in state_dicts.items():
    print(name)

project_inp.weight
project_inp.bias
pos_enc.pe
transformer_encoder.layers.0.self_attn.in_proj_weight
transformer_encoder.layers.0.self_attn.in_proj_bias
transformer_encoder.layers.0.self_attn.out_proj.weight
transformer_encoder.layers.0.self_attn.out_proj.bias
transformer_encoder.layers.0.linear1.weight
transformer_encoder.layers.0.linear1.bias
transformer_encoder.layers.0.linear2.weight
transformer_encoder.layers.0.linear2.bias
transformer_encoder.layers.0.norm1.weight
transformer_encoder.layers.0.norm1.bias
transformer_encoder.layers.0.norm1.running_mean
transformer_encoder.layers.0.norm1.running_var
transformer_encoder.layers.0.norm1.num_batches_tracked
transformer_encoder.layers.0.norm2.weight
transformer_encoder.layers.0.norm2.bias
transformer_encoder.layers.0.norm2.running_mean
transformer_encoder.layers.0.norm2.running_var
transformer_encoder.layers.0.norm2.num_batches_tracked
transformer_encoder.layers.1.self_attn.in_proj_weight
transformer_encoder.layers.1.self_attn.in_proj_bi

In [122]:
clf = TSTransformerEncoderClassifier(feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, num_classes)

In [107]:
clf.state_dict()

OrderedDict([('project_inp.weight',
              tensor([[ 0.2143, -0.2024, -0.0681,  ..., -0.2116, -0.2554,  0.0887],
                      [-0.2702, -0.2701,  0.2442,  ..., -0.2202, -0.0841,  0.0971],
                      [ 0.0418,  0.3299,  0.1042,  ..., -0.2521,  0.1486, -0.1352],
                      ...,
                      [-0.2112, -0.2896,  0.2867,  ...,  0.3285, -0.2288,  0.0094],
                      [ 0.0498, -0.0795, -0.3274,  ...,  0.2357,  0.2213,  0.1863],
                      [-0.0176,  0.3166,  0.1792,  ..., -0.2375,  0.0178,  0.1022]])),
             ('project_inp.bias',
              tensor([ 0.2545, -0.2221, -0.2877,  0.2663, -0.3261,  0.1302,  0.2638,  0.2455,
                      -0.2250, -0.2960, -0.0438,  0.2661, -0.3174, -0.0606,  0.1777,  0.1289,
                      -0.2315, -0.1064, -0.0415, -0.2213,  0.1464, -0.0947, -0.0763,  0.0291,
                       0.0359, -0.0048, -0.2813, -0.0156,  0.3182,  0.2095, -0.1552,  0.2296,
                    

In [109]:
pretrained_dict = {k: v for k, v in state_dicts.items() if 'output_layer' not in k}

In [111]:
model_dict = clf.state_dict()

In [112]:
model_dict.update(pretrained_dict)

In [113]:
model_dict

OrderedDict([('project_inp.weight',
              tensor([[ 0.0095, -0.3316, -0.1782,  ...,  0.2416, -0.0445, -0.2607],
                      [ 0.0117,  0.0191,  0.1113,  ..., -0.1261, -0.3249,  0.0184],
                      [-0.2678, -0.3241, -0.2310,  ..., -0.3159, -0.1026,  0.2919],
                      ...,
                      [-0.0139, -0.2165, -0.0465,  ...,  0.1054, -0.3085, -0.0795],
                      [-0.0894, -0.0648,  0.2919,  ...,  0.1727,  0.2195,  0.2586],
                      [ 0.0978, -0.1104,  0.0179,  ..., -0.1141,  0.0085,  0.0403]])),
             ('project_inp.bias',
              tensor([ 2.1350e-01,  2.8938e-02,  2.8948e-01, -1.2193e-01,  1.8135e-01,
                      -3.2278e-01, -3.1745e-01,  1.3326e-01,  2.4129e-01,  1.8440e-01,
                       1.2089e-01,  1.7494e-02, -2.6797e-01, -2.8569e-01, -1.9773e-01,
                       3.0811e-01,  1.5666e-01, -2.0352e-01, -1.5753e-01,  2.8303e-01,
                       8.4822e-02, -2.3827e-01, 

In [121]:
def load_pretrained_state_dict(model, pretrained_path):
    pretrained_dict = torch.load(pretrained_path)
    model_dict = model.state_dict()
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if "output_layer" not in k}
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)

In [123]:
clf.state_dict()

OrderedDict([('project_inp.weight',
              tensor([[ 0.1815, -0.1073,  0.0854,  ..., -0.1529, -0.1937, -0.0024],
                      [-0.1686, -0.2337,  0.2132,  ..., -0.1468,  0.0405, -0.0729],
                      [ 0.3325,  0.1289,  0.2366,  ..., -0.2703,  0.0776,  0.2900],
                      ...,
                      [ 0.1362,  0.0593, -0.1576,  ..., -0.1716, -0.1800, -0.2180],
                      [ 0.0184, -0.1768,  0.2571,  ..., -0.2737,  0.1111,  0.1320],
                      [ 0.2260, -0.1276,  0.2376,  ..., -0.0249,  0.2203,  0.1971]])),
             ('project_inp.bias',
              tensor([-0.2726, -0.3189,  0.3102, -0.0716, -0.2488,  0.0096,  0.2774,  0.1458,
                       0.3149, -0.0868,  0.0448,  0.0731, -0.1720, -0.3274, -0.1242,  0.1232,
                      -0.1573,  0.0968, -0.1163, -0.1399, -0.1870, -0.1516, -0.3049,  0.3137,
                       0.3029,  0.0676, -0.2516,  0.3013, -0.2154, -0.2729, -0.1616, -0.2224,
                    

In [124]:
load_pretrained_state_dict(clf, "encoder.pth")

In [153]:
pred = torch.tensor([[0.2, 0.1, 0.3, 0.5, 0.1], [0.5, 0.7, 0.3, 0.8, 0.1]], requires_grad=True).cuda()
out = torch.tensor([[0.3, 0.1, 0.2, 0.8, 0.2], [0.5, 0.6, 0.4, 0.1, 0.2]], requires_grad=True).cuda()
criterion = torch.nn.MSELoss(reduction="none")

In [154]:
mask = torch.tensor([[True, True, False, False, False], [True, False, True, False, False]])

In [155]:
loss = criterion(pred, out)
avg_loss = torch.masked_select(loss.view(-1, 1), mask.view(-1, 1) == 1)

In [156]:
avg_loss.mean()

tensor(0.0050, device='cuda:0', grad_fn=<MeanBackward0>)

In [157]:
scaler = torch.cuda.amp.GradScaler(enabled=True)
scaler.scale(avg_loss).backward()

RuntimeError: grad can be implicitly created only for scalar outputs