In [None]:
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
import ast
import itertools
import os

# Make CUDA kernel launches synchronous so that device-side errors (for
# example an out-of-range embedding index) surface at the call that caused them.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the labelled dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./out/demo_labelled_dataset.csv")

# Display the loaded frame as the cell output.
df

In [None]:
# Model inputs: the first 926 entries of the "numeric_output" column.
# NOTE(review): 926 is a hard-coded row count — confirm it matches the usable part of the dataset.
X = df["numeric_output"][:926]
# Labels: the first 926 entries of the "results" column.
y = df.results[:926]

# Display the selected inputs as the cell output.
X

In [None]:
# Parse every entry from its string form into a Python object.
# Entries that are no longer strings are passed through unchanged, so running
# this cell a second time is a no-op instead of an error (ast.literal_eval
# rejects values that have already been parsed).
X = X.apply(lambda value: ast.literal_eval(value) if isinstance(value, str) else value)

print(type(X))

In [None]:
# Work on the first 100 parsed samples only.
# Each sample is unpacked as a pair of numeric lists, for example
#     ([1.0], [-12.0, 0.999772471])
numeric_series = X[:100]

# Ids 0-3 are reserved for the special tokens. Every distinct number, keyed by
# its string form, receives the next free id in order of first appearance.
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

for lst1, lst2 in numeric_series:
    for num in itertools.chain(lst1, lst2):
        # len(word_dict) is evaluated before insertion, so ids stay consecutive.
        word_dict.setdefault(str(num), len(word_dict))

# Size of the vocabulary that was actually built (special tokens included).
vocab_size = len(word_dict)

# Reverse lookup: id -> token string.
number_dict = {i: w for w, i in word_dict.items()}

# Replace every number by its id while keeping the two-list structure of each sample.
tokenized_series = numeric_series.apply(
    lambda pair: [[word_dict[str(num)] for num in seq] for seq in pair]
)

print(tokenized_series)

# Plain-list view of the same data: one [x_tokens, y_tokens] entry per sample.
token_list = tokenized_series.tolist()

In [None]:
# Length of the longest individual list found in any sample of tokenized_series.
# arrange_tokens pads or truncates every row to this length.
max_length = max(len(lst) for tup in tokenized_series for lst in tup)
print(max_length)

In [None]:
# Display the full token -> id vocabulary as the cell output.
word_dict

In [None]:
# Number of tokenized samples available to make_batch.
len(token_list)

In [None]:
# Hyper-parameters shared by the batching code and the model classes below.
# Samples per batch; make_batch fills half with positive and half with negative pairs.
batch_size = 6
# Upper bound on the number of masked positions per sample.
max_pred = 5
# Number of EncoderLayer modules stacked inside BERT.
n_layers = 6
# Number of attention heads in MultiHeadAttention.
n_heads = 12
# Embedding / hidden size used throughout the model.
d_model = 32
# Hidden size of the position-wise feed-forward network.
d_ff = 32 * 4
# Per-head query/key size and per-head value size.
d_k = d_v = 2
# Number of rows in the segment embedding table (make_batch only emits segment ids 0 and 1).
n_segments = 6
# Number of rows in the token embedding table; this replaces the vocab_size assigned earlier in the notebook.
vocab_size = 50000
# Length every input sequence is padded to in make_batch, and number of rows in the position embedding table.
maxlen = 1000

In [None]:
def make_batch():
    """Build one batch of masked sequence pairs for pre-training.

    Pairs of entries are drawn at random from ``token_list``. A pair is
    positive when the second entry directly follows the first one
    (``tokens_b_index == tokens_a_index + 1``) and negative otherwise. The
    batch contains ``batch_size / 2`` samples of each kind.

    Returns:
        list: ``batch_size`` samples, each of the form
        ``[input_ids, segment_ids, masked_tokens, masked_pos, is_next]``.
        ``input_ids`` and ``segment_ids`` are zero-padded up to ``maxlen``
        entries, ``masked_tokens`` and ``masked_pos`` are zero-padded up to
        ``max_pred`` entries, and ``is_next`` is a bool.

    Raises:
        ValueError: if ``batch_size`` is odd, or if ``token_list`` has fewer
            than two entries. In both situations the balanced batch can never
            be completed and the sampling loop would not terminate.
    """
    if batch_size % 2 != 0:
        raise ValueError("batch_size must be even to build a balanced batch")
    n_sequences = len(token_list)
    if batch_size > 0 and n_sequences < 2:
        raise ValueError("at least two sequences are needed to form a positive pair")

    half = batch_size // 2
    cls_id = word_dict['[CLS]']
    sep_id = word_dict['[SEP]']
    mask_id = word_dict['[MASK]']

    batch = []
    positive = negative = 0
    while positive < half or negative < half:
        tokens_a_index = randrange(n_sequences)
        tokens_b_index = randrange(n_sequences)

        # Decide the label first so that no sample is built only to be discarded.
        is_next = tokens_a_index + 1 == tokens_b_index
        if (positive if is_next else negative) >= half:
            continue

        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]

        # NOTE(review): entries of token_list are concatenated as-is. If they are
        # nested lists, each nested list occupies a single position here and
        # masked_pos refers to this un-flattened layout — TODO confirm intended.
        input_ids = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # Mask about 15% of the positions: at least one, at most max_pred.
        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15))))

        # [CLS] and [SEP] are never masked.
        cand_masked_pos = [i for i, token in enumerate(input_ids)
                           if token != cls_id and token != sep_id]
        shuffle(cand_masked_pos)

        masked_tokens, masked_pos = [], []
        for pos in cand_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.8:
                # Most of the time the position is replaced by [MASK].
                input_ids[pos] = mask_id
            elif random() < 0.5:
                # Otherwise, half of the time a random id is substituted;
                # in the remaining cases the original entry is kept.
                input_ids[pos] = randint(0, vocab_size - 1)

        # Zero-pad the sequence and its segment ids up to maxlen.
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero-pad the prediction targets up to max_pred.
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        batch.append([input_ids, segment_ids, masked_tokens, masked_pos, is_next])
        if is_next:
            positive += 1
        else:
            negative += 1
    return batch

In [None]:
def get_attn_pad_mask(seq_q, seq_k):
    """Build the attention mask that hides padding positions of the keys.

    Args:
        seq_q: 2-D tensor of query token ids; only its second dimension
            (the query length) is used.
        seq_k: 2-D tensor of key token ids, where id 0 marks padding.

    Returns:
        Boolean tensor of shape ``(batch, len_q, len_k)`` that is True wherever
        the key token is padding. The mask lives on the same device as
        ``seq_k``; it no longer depends on a module-level ``device``.
    """
    _, len_q = seq_q.size()
    n_batch, len_k = seq_k.size()
    # (batch, 1, len_k): one row per sequence, broadcast over all query positions.
    pad_attn_mask = seq_k.eq(0).unsqueeze(1)
    return pad_attn_mask.expand(n_batch, len_q, len_k)

In [None]:
def gelu(x):
    """Apply the erf-based GELU activation element-wise.

    Computes ``x * 0.5 * (1 + erf(x / sqrt(2)))`` for the tensor ``x``.
    """
    half_x = x * 0.5
    cdf_term = 1.0 + torch.erf(x / math.sqrt(2.0))
    return half_x * cdf_term

In [None]:
# Build one batch with make_batch and print its container type as a quick check.
batch = make_batch()
print("done", type(batch))

In [None]:
def flatten_tokens(tokens):
    """Flatten one level of nesting.

    Elements whose type is exactly ``list`` are spliced into the result;
    every other element (including tuples and deeper nesting) is copied over
    unchanged. A new list is returned and the input is not modified.
    """
    return [
        item
        for entry in tokens
        for item in (entry if type(entry) is list else [entry])
    ]

In [None]:
def arrange_tokens(tokens, length=None):
    """Flatten every row and force it to a fixed length.

    Args:
        tokens: iterable of rows; each row is flattened with ``flatten_tokens``.
        length: target row length. When omitted, the module-level
            ``max_length`` is used, which is the previous behaviour.

    Returns:
        list: one new list per row, zero-padded on the right when shorter than
        the target length and truncated when longer.
    """
    target = max_length if length is None else length
    arranged = []
    for nums in tokens:
        # Slicing truncates long rows; the extend is a no-op for rows that are full.
        flattened = flatten_tokens(nums)[:target]
        flattened.extend([0] * (target - len(flattened)))
        arranged.append(flattened)

    return arranged

In [None]:
# The first four fields of every sample are list-valued: flatten and pad or
# truncate them with arrange_tokens, then build LongTensors on the target device.
# Direct conversion that is left disabled:
# input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))
input_ids, segment_ids, masked_tokens, masked_pos = (
    torch.LongTensor(arrange_tokens([sample[field] for sample in batch])).to(device)
    for field in range(4)
)

# The fifth field is the True/False pair label of each sample.
isNext = torch.LongTensor([sample[4] for sample in batch]).to(device)

In [None]:
# Inspect the first sample: first row of its padding mask, then its input ids,
# segment ids, masked tokens, masked positions and pair label.
get_attn_pad_mask(input_ids, input_ids)[0][0], input_ids[0], segment_ids[0], masked_tokens[0], masked_pos[0], isNext[0]

In [None]:
class Embedding(nn.Module):
    """Token + position + segment embeddings followed by layer normalisation.

    Table sizes come from the module-level ``vocab_size``, ``maxlen``,
    ``n_segments`` and ``d_model``.
    """

    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(maxlen, d_model)
        self.seg_embed = nn.Embedding(n_segments, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        """Embed a batch of token ids.

        Args:
            x: 2-D integer tensor of token ids; its second dimension is the
                sequence length.
            seg: integer tensor of segment ids with the same shape as ``x``.

        Returns:
            Normalised sum of the three embeddings, with a trailing
            dimension of size ``d_model``.
        """
        # Follow the device the weights live on rather than a global, so the
        # module works wherever it has been moved.
        target = self.tok_embed.weight.device
        x = x.to(target)
        seg = seg.to(target)

        # Position ids 0..seq_len-1, created directly on the right device and
        # broadcast to every sequence of the batch.
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long, device=target)
        pos = pos.unsqueeze(0).expand_as(x)

        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_key)) V."""

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """Attend from ``Q`` to ``K``/``V``.

        Args:
            Q: query tensor whose last dimension matches that of ``K``.
            K: key tensor.
            V: value tensor with as many positions as ``K``.
            attn_mask: boolean tensor broadcastable to the score matrix;
                True marks key positions that must not be attended to.

        Returns:
            tuple: ``(scores, context, attn)`` — the masked, scaled scores,
            the attention-weighted values, and the attention weights.
        """
        # Scale by the real key dimension instead of a module-level constant, so
        # the result stays correct for inputs whose last dimension is not d_k.
        scale = math.sqrt(K.size(-1))
        scores = torch.matmul(Q, K.transpose(-1, -2)) / scale
        # A very negative score gives masked positions zero weight after softmax.
        scores = scores.masked_fill(attn_mask, -1e9)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        return scores, context, attn

In [None]:
# Smoke test: embed the current batch and run single-head attention on it.
emb = Embedding().to(device)
input_ids = input_ids.to(device)
segment_ids = segment_ids.to(device)
embeds = emb(input_ids, segment_ids)

print(embeds.shape)

# Padding mask built from the input ids, used for both queries and keys.
attenM = get_attn_pad_mask(input_ids, input_ids).to(device)

# The embeddings serve as queries, keys and values at the same time.
SDPA = ScaledDotProductAttention()(embeds, embeds, embeds, attenM)

# Scores, context and attention weights, all placed on the target device.
S, C, A = (tensor.to(device) for tensor in SDPA)


print('Masks:', attenM[0][0])
print()
print('Scores:', S[0][0], '\n\nAttention M:', A[0][0])

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with output projection, residual and LayerNorm.

    The output projection and the layer normalisation are created once in
    ``__init__``. They are therefore registered as sub-modules: their weights
    are kept between calls, follow ``.to(...)`` and are returned by
    ``parameters()`` so an optimiser can train them. Creating them inside
    ``forward`` would re-initialise them randomly on every call.
    """

    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.attention = ScaledDotProductAttention()
        self.out_proj = nn.Linear(n_heads * d_v, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask):
        """Run multi-head attention.

        Args:
            Q, K, V: tensors whose last dimension is ``d_model`` and whose
                first dimension is the batch.
            attn_mask: boolean mask with one row per query position and one
                column per key position for every batch element; True marks
                positions to ignore.

        Returns:
            tuple: ``(output, attn)`` where ``output`` is
            ``LayerNorm(projection(context) + Q)`` and ``attn`` holds the
            attention weights of every head.
        """
        # Work on whatever device the weights live on.
        target = self.W_Q.weight.device
        Q = Q.to(target)
        K = K.to(target)
        V = V.to(target)
        attn_mask = attn_mask.to(target)

        residual, n_batch = Q, Q.size(0)
        # Project, split the feature dimension into heads, then move the head
        # axis in front of the sequence axis.
        q_s = self.W_Q(Q).view(n_batch, -1, n_heads, d_k).transpose(1, 2)
        k_s = self.W_K(K).view(n_batch, -1, n_heads, d_k).transpose(1, 2)
        v_s = self.W_V(V).view(n_batch, -1, n_heads, d_v).transpose(1, 2)

        # Same padding mask for every head.
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

        _, context, attn = self.attention(q_s, k_s, v_s, attn_mask)
        # Merge the heads back into one feature dimension.
        context = context.transpose(1, 2).contiguous().view(n_batch, -1, n_heads * d_v)
        output = self.out_proj(context)
        return self.norm(output + residual), attn


In [None]:
# Smoke test: embed the current batch and run multi-head attention on it.
emb = Embedding().to(device)
input_ids = input_ids.to(device)
segment_ids = segment_ids.to(device)
embeds = emb(input_ids, segment_ids).to(device)

# Padding mask built from the input ids, used for both queries and keys.
attenM = get_attn_pad_mask(input_ids, input_ids).to(device)

# The embeddings serve as queries, keys and values at the same time.
MHA = MultiHeadAttention().to(device)(embeds, embeds, embeds, attenM)

# Layer output and attention weights, both placed on the target device.
Output, A = (tensor.to(device) for tensor in MHA)

# Show the first entry of the attention weights for the first sample.
A[0][0]

In [None]:
class PoswiseFeedForwardNet(nn.Module):
    """Two-layer feed-forward network: d_model -> d_ff -> d_model with GELU in between."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand ``x`` to ``d_ff`` features, apply GELU, and project back."""
        hidden = self.fc1(x)
        activated = gelu(hidden)
        return self.fc2(activated)


In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: multi-head self-attention followed by the feed-forward net."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        """Apply self-attention (inputs act as Q, K and V) and then the feed-forward net.

        Returns:
            tuple: ``(enc_outputs, attn)`` — the block output and the
            attention weights, both placed on ``device``.
        """
        attended, attn = self.enc_self_attn(
            enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask
        )
        enc_outputs = self.pos_ffn(attended).to(device)
        return enc_outputs, attn.to(device)

In [None]:
class BERT(nn.Module):
    """Encoder stack with a masked-token head and a pair-classification head.

    The module is built without choosing a device; call ``.to(...)`` on the
    instance to place all of its parameters (including the tied decoder
    weight) at once.
    """

    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        # Pooling head applied to the first position of the sequence.
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        # Transformation applied to the gathered masked positions.
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # The decoder shares its weight matrix with the token embedding table
        # and adds its own bias.
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        """Compute both pre-training outputs.

        Args:
            input_ids: 2-D integer tensor of token ids (0 is padding).
            segment_ids: integer tensor of segment ids, same shape as ``input_ids``.
            masked_pos: 2-D integer tensor of sequence positions to predict.

        Returns:
            tuple: ``(logits_lm, logits_clsf)`` — vocabulary logits for every
            entry of ``masked_pos`` and two-class logits per sequence.
        """
        # Follow the device of the model's own parameters.
        target = self.decoder_bias.device
        input_ids = input_ids.to(target)
        segment_ids = segment_ids.to(target)
        masked_pos = masked_pos.to(target)

        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids).to(target)
        for layer in self.layers:
            output, _ = layer(output, enc_self_attn_mask)

        # Classification head reads the first position of every sequence.
        h_pooled = self.activ1(self.fc(output[:, 0]))
        logits_clsf = self.classifier(h_pooled)

        # Repeat each position index across the feature dimension so gather
        # picks the whole hidden vector at that position.
        gather_index = masked_pos[:, :, None].expand(-1, -1, output.size(-1))
        h_masked = torch.gather(output, 1, gather_index)
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias

        return logits_lm, logits_clsf

In [None]:
# Build the model and place it on the target device.
model = BERT().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# A single batch is built here and reused by every iteration of the loop below.
batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos = (
    torch.LongTensor(arrange_tokens([sample[field] for sample in batch])).to(device)
    for field in range(4)
)
isNext = torch.LongTensor([sample[4] for sample in batch]).to(device)

for epoch in range(1000):
    optimizer.zero_grad()
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    # CrossEntropyLoss expects the class dimension second, hence the transpose.
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens)
    loss_lm = (loss_lm.float()).mean()
    loss_clsf = criterion(logits_clsf, isNext)
    # Total loss: masked-token loss plus pair-classification loss.
    loss = loss_lm + loss_clsf
    # Report every tenth iteration.
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

In [None]:
# input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
# print(text)
# print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])

# input_ids = input_ids.to(device)
# segment_ids = segment_ids.to(device)
# masked_tokens = masked_tokens.to(device)
# masked_pos = masked_pos.to(device)
# isNext = isNext.to(device)

# logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
# logits_lm = logits_lm.data.max(2)[1][0].data.cpu().numpy()
# print('masked tokens list : ',[pos.item() for pos in masked_tokens[0] if pos.item() != 0])
# print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

# logits_clsf = logits_clsf.data.max(1)[1].data.cpu().numpy()[0]
# print('isNext : ', True if isNext else False)
# print('predict isNext : ',True if logits_clsf else False)