In [247]:
"""
Chapter 4.6 Practice for Motion Analysis
"""


'\nChapter 4.6 Practice for Motion Analysis\n'

In [248]:
"""
4.6.1 Vocabulary Mapping

Map (input/Token) = INT(also index, subscript)

"""

# Bidirectional Mapping between tokens and index

from collections import defaultdict
class Vocab:
    def __init__(self, tokens = None):
        self.idx_to_token = list()
        self.token_to_idx = dict()

        if tokens is not None:
            if '<unk>' not in tokens:
                tokens = tokens + ['<unk>']
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token)-1
            self.unk = self.token_to_idx['<unk>']
    @classmethod
    def build(cls,text, min_freq=1, reserved_tokens=None):
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ['<unk>'] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()\
                        if freq >= min_freq and token_freqs != '<unk>']
        return cls(uniq_tokens)

    def __len__(self):
        # return size of vocabulary, different tokens
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # search index of input token
        # if not exist, then return the index of token <unk> (0)
        return self.token_to_idx.get(token,self.unk)

    def convert_tokens_to_ids(self,tokens):
        # search indexes of a series of input token
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self,indices):
        # search tokens of a series of indexes
        return [self.idx_to_token[index] for index in indices]


In [249]:
"""
4.6.2 Word Vector Layer

To convert a word(token) to a low-dimension, dense, continuous word vectors
also known as Embedding

torch.nn.Embedding()
Parameters:
num_embeddings: size of Vocabulary
embedding_dim: dimension of Embedding vector

Output:
Map(int for int in INT_Tensor) = Tensor(dim=embedding_dim)
use Vocabulary mapping function to get integer of token

"""
import  torch
from  torch import  nn
embedding = nn.Embedding(8, 3)
input = torch.tensor([[0, 1, 2, 1], [4, 6, 6, 7]],dtype=torch.long)
output = embedding(input)
print(output)
print(output.shape)

tensor([[[-1.1099, -0.8485, -0.6749],
         [ 0.1831,  0.1613,  1.7247],
         [ 0.7548, -2.0131, -0.6015],
         [ 0.1831,  0.1613,  1.7247]],

        [[-0.5198,  0.3109,  0.2871],
         [-0.2369, -1.3768,  1.5595],
         [-0.2369, -1.3768,  1.5595],
         [-2.4886,  0.2546,  0.1521]]], grad_fn=<EmbeddingBackward0>)
torch.Size([2, 4, 3])


In [250]:
"""
4.6.3 Multilayer Conceptor combined word vector layer
"""

import  torch
from  torch import  nn
from torch.nn import  functional as F

class MLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        # Initial
        super(MLP, self).__init__()
        # Word Vector Layer
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        # Linear Operation: Input Layer -- Hidden Layer
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        # Activation Function
        self.activate = F.relu
        # Linear Operation: Hidden Layer -- Output Layer
        self.linear2 = nn.Linear(hidden_dim,num_class)
    def forward(self, inputs,offsets):
        embeddings = self.embedding(inputs,offsets)
        # converge Embeddings in series
        # Here we compute the average value
        embedding = embeddings.mean(dim=1)
        hidden = self.activate(self.linear1(embedding))
        outputs = self.linear2(hidden)
        # Get log value of every series is predicted to a specific class
        probs = F.softmax(outputs,dim=1)
        return probs
mlp = MLP(vocab_size=8, embedding_dim= 3, hidden_dim= 5, num_class=2)
# inputs are two int series with length of 4
inputs = torch.tensor([[0,1,2,1],[4,6,6,7]],dtype=torch.long)
outputs = mlp(inputs)
print(outputs)

TypeError: forward() missing 1 required positional argument: 'offsets'

In [None]:
"""
Embedding Bagging
"""
import  torch
from  torch import  nn
input1 = torch.tensor([0,1,2,1],dtype=torch.long)
input2 = torch.tensor([2,1,3,7,5], dtype= torch.long)
input3 = torch.tensor([6,4,2], dtype=torch.long)
inputs = [input1,input2,input3]
offsets = [0] + [i.shape[0] for i in inputs]
print(offsets)

offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
print(offsets)

inputs = torch.cat(inputs)
print(inputs)

embeddingbag = nn.EmbeddingBag(num_embeddings=8,embedding_dim=3)
embeddings = embeddingbag(inputs,offsets)
print(embeddings)

In [None]:
"""
4.6.4 Data Processing
"""

def load_sentence_polarity():
    from nltk.corpus import  sentence_polarity

    # build a vocabulary using set within all sentence set
    vocab = Vocab.build(sentence_polarity.sents())
    train_data = [(vocab.convert_tokens_to_ids(sentence),0) for sentence in sentence_polarity.sents(categories='pos')
    [:4000]] \
        + [(vocab.convert_tokens_to_ids(sentence),1) for sentence in sentence_polarity.sents(categories='neg')[:4000]]

    test_data = [(vocab.convert_tokens_to_ids(sentence),0) for sentence in sentence_polarity.sents(categories='pos')
    [4000:]] \
        + [(vocab.convert_tokens_to_ids(sentence),1) for sentence in sentence_polarity.sents(categories='neg')[4000:]]

    return train_data,test_data,vocab

from torch.utils.data import  DataLoader
from torch.utils.data import  Dataset

"""
Create a subclass BowDataset of class Dataset
"""

class BowDataset(Dataset):
    def __init__(self,data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        return self.data[i]


def collate_fn(examples):
    inputs = [torch.tensor(ex[0]) for ex in examples]
    # output targets is Tensor consists of outputs of all instances(0 or 1)
    targets = torch.tensor([ex[1] for ex in examples], dtype= torch.long)
    # get series length of every instance in a batch
    offsets = [0] + [i.shape[0] for i in inputs]
    # convert offset to be start position of every series according to length of series
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    # concatenate Tensors in input list to be a big Tensor
    inputs = torch.cat(inputs)
    return inputs, offsets, targets

data_loader = DataLoader(
    dataset = BowDataset(Dataset),
    batch_size=64,
    collate_fn= collate_fn,
    shuffle= False
)

In [255]:
"""
4.6.5 Multilayer Perceptron Training & Testing
"""

from tqdm.auto import tqdm
from torch import optim

class MLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(MLP, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activate = F.relu
        self.linear2 = nn.Linear(hidden_dim, num_class)
    def forward(self, inputs, offsets):
        embedding = self.embedding(inputs, offsets)
        hidden = self.activate(self.linear1(embedding))
        outputs = self.linear2(hidden)
        log_probs = F.log_softmax(outputs, dim=1)
        return log_probs

# Hyper Parameter
embedding_dim = 128
hidden_dim = 256
num_class = 2
batch_size = 32
num_epoch  = 5

# Data Loading
train_data, test_data, vocab = load_sentence_polarity()
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(dataset=test_dataset, batch_size=1,collate_fn= collate_fn, shuffle=True)

# Model Loading
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MLP(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)

# Training Process
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(),lr = 0.001)
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader,desc=f'Training Epoch {epoch}'):
        inputs,offsets,targets = [x.to(device) for x in batch]
        log_probs = model(inputs,offsets)
        loss = nll_loss(log_probs, targets)
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss: .2f}")

# Testing Process
acc = 0
for batch in tqdm( test_data_loader, desc=f'Testing'):
    inputs, offsets, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = model(inputs, offsets)
        acc += (output.argmax(dim=1) == targets).sum().item()
        print(acc)

# Output accuracy on test set
print(f'Acc: {acc/len(test_data_loader):.2f}')

Training Epoch 0:   0%|          | 0/250 [00:00<?, ?it/s]

Loss:  173.27


Training Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]

Loss:  173.27


Training Epoch 2:   0%|          | 0/250 [00:00<?, ?it/s]

Loss:  173.27


Training Epoch 3:   0%|          | 0/250 [00:00<?, ?it/s]

Loss:  173.27


Training Epoch 4:   0%|          | 0/250 [00:00<?, ?it/s]

Loss:  173.27


Testing:   0%|          | 0/2662 [00:00<?, ?it/s]

0
1
1
2
2
3
3
3
4
4
5
5
6
7
8
8
8
8
8
8
8
9
9
10
11
12
12
12
13
13
14
14
15
15
16
16
17
17
17
18
18
18
18
18
19
19
19
19
20
20
21
22
23
23
23
23
23
23
24
24
25
25
25
26
27
27
28
29
30
30
31
31
31
32
33
33
34
34
34
34
34
35
36
37
37
37
37
38
38
39
39
39
40
41
41
42
43
43
43
44
44
45
46
46
47
48
48
48
48
48
49
49
49
50
50
51
52
53
53
53
54
54
55
56
56
57
57
57
58
58
59
59
60
61
61
61
61
62
63
63
63
64
64
65
65
65
66
67
67
68
69
69
70
71
71
72
73
74
74
75
75
75
76
76
76
77
77
78
79
80
81
82
82
82
82
82
83
83
84
85
85
86
86
87
87
87
87
88
88
89
90
91
92
92
93
94
95
96
96
97
97
97
98
98
99
99
100
101
102
102
103
103
104
105
106
107
107
108
108
109
109
110
111
111
112
113
113
114
115
115
116
117
118
119
119
120
121
122
123
124
125
126
126
126
127
127
128
128
128
129
129
129
129
129
130
131
132
133
134
135
136
136
137
138
138
138
139
139
140
140
140
140
140
140
141
141
142
143
143
143
144
144
145
145
145
146
147
147
148
148
149
150
151
151
152
153
153
153
154
155
156
157
158
159
159
159
160
1