<h3>Importing libraries</h3>

In [1]:
import os
from tqdm.auto import tqdm
import re
import pickle
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
from parameters import parse_args
from graph_utils import get_graph_data
from data_generation_utils import get_kfold_lm_data
from transformers import AutoTokenizer

from nltk.tokenize import word_tokenize

import sys; sys.argv=['']; del sys

<h4> Getting the data </h4>

In [2]:
args = parse_args()
data_dir = args.data_dir
args.graphs_file = os.path.join(data_dir, args.graphs_file)


graph_data = get_graph_data(args.graphs_file)
label_map, super_type_map = graph_data['entities_encoder'], graph_data['super_types_encoder']
inverse_label_map = {v: k for k, v in label_map.items()}
inverse_super_type_map = {v: k for k, v in super_type_map.items()}


In [3]:
label_map, super_type_map = graph_data['entities_encoder'], graph_data['super_types_encoder']
for i, data in enumerate(get_kfold_lm_data(graph_data, seed=args.seed)):
    break

99556 11062 6847


In [4]:
SSP = "<superType>"
ESP = "</superType>"
SEN = "<entity>"
EEN = "</entity>"

SRP = "<relations>"
ERP = "</relations>"

PAD = "<pad>"
UNK = "<unk>"
SOS = "<s>"
EOS = "</s>"
MASK = "<mask>"
SEP = "<sep>"

SPECIAL_TOKENS = [PAD, UNK, SOS, EOS, MASK, SEP, SSP, ESP, SEN, EEN, SRP, ERP]

clean_text = lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x).strip()

promptize_triple = lambda x: f"{SOS} {SSP} {clean_text(x[2])} {ESP} {SEN} {clean_text(x[0])} {EEN} {SRP} {clean_text(x[1])} {ERP} {EOS}"

def promptize_super_type_generation(x):
    return f"{SOS} {SEN} {clean_text(x[0])} {EEN} {SRP} {clean_text(x[1])} {ERP} {SEP} {SSP} {clean_text(x[2])} {ESP} {EOS}"

def promptize_entity_type_generation(x):
    return f"{SOS} {SSP} {clean_text(x[1])} {ESP} {SRP} {clean_text(x[1])} {ERP} {SEP} {SEN} {clean_text(x[0])} {EEN} {EOS}"

def promptize_super_type_classification(x):
    return f"{SOS} {SEN} {clean_text(x[0])} {EEN} {SRP} {clean_text(x[1])} {ERP} {EOS}", f"{clean_text(x[2])}".split()

def promptize_entity_type_classification(x):
    return f"{SOS} {SSP} {clean_text(x[1])} {ESP} {SRP} {clean_text(x[1])} {ERP} {EOS}", f"{clean_text(x[0])}"

In [5]:
def remove_duplicates(data):
    return list({str(i): i for i in data}.values())

def print_sample_data(data):
    for split_type in data:
        print(f"Split type: {split_type}")
        print(f"Total number of samples: {len(data[split_type])}")
        print(f"2 Samples: {data[split_type][:2]}")
        print()


def get_promptized_data_for_super_type_generation(data):
    promptized_data = {
        split_type: remove_duplicates([promptize_super_type_generation(i) for i in data[split_type] if len(i[2].strip())])\
              for split_type in data
    }
    # print_sample_data(promptized_data)
    
    return promptized_data

def get_promptized_data_for_entity_generation(data):
    promptized_data = {
        split_type: remove_duplicates([promptize_entity_type_generation(i) for i in data[split_type] if len(i[1].strip())])\
              for split_type in data
    }
    # print_sample_data(promptized_data)
    return promptized_data

def get_promptized_data_for_super_type_classification(data):
    promptized_data = {
        split_type: remove_duplicates([promptize_super_type_classification(i) for i in data[split_type] if len(i[2].strip())])\
              for split_type in data
    }
    print_sample_data(promptized_data)
    
    return promptized_data

def get_promptized_data_for_entity_classification(data):
    promptized_data = {
        split_type: remove_duplicates([promptize_entity_type_classification(i) for i in data[split_type] if len(i[1].strip())])\
              for split_type in data
    }
    print_sample_data(promptized_data)
    return promptized_data

def get_promptized_data_for_generation(data):
    data_for_super_type_generation = get_promptized_data_for_super_type_generation(data)
    data_for_entity_generation = get_promptized_data_for_entity_generation(data)

    promptized_data = {
        split_type: data_for_super_type_generation[split_type] + data_for_entity_generation[split_type]\
              for split_type in data
    }
    print_sample_data(promptized_data)
    
    return promptized_data

In [6]:
def get_data_for_classification(data, class_type='super'):
    if class_type == 'super':
        promptized_data = get_promptized_data_for_super_type_classification(data)
    else:
        promptized_data = get_promptized_data_for_entity_classification(data)
    return promptized_data


In [7]:
print("Promptize for data generation")
promptized_data = get_promptized_data_for_generation(data)

print()
print("Promptize for super type classification")
data_for_super_type_classification = get_promptized_data_for_super_type_classification(data)

print()
print("Promptize for entity classification")
data_for_entity_classification = get_promptized_data_for_entity_classification(data)

Promptize for data generation
Split type: train
Total number of samples: 119493
2 Samples: ['<s> <entity> CFlowPointCut </entity> <relations>  </relations> <sep> <superType> PointCut PointCutPointCut </superType> </s>', '<s> <entity> IfcCartesianPoint </entity> <relations>  </relations> <sep> <superType> IfcLayeredItem IfcGeometricRepresentationItem IfcPointOrVertexPoint IfcPoint IfcGeometricSetSelect </superType> </s>']

Split type: test
Total number of samples: 13708
2 Samples: ['<s> <entity> Contributioncoauthor </entity> <relations>  </relations> <sep> <superType> Thing Regularauthor </superType> </s>', '<s> <entity> Roadmap </entity> <relations>  </relations> <sep> <superType> VariabilityElement DescribableElement Classifier Type Element </superType> </s>']

Split type: unseen
Total number of samples: 8618
2 Samples: ['<s> <entity> ConstantActivity </entity> <relations> value Value </relations> <sep> <superType> Activity </superType> </s>', '<s> <entity> AdaptationTask </entity> <

In [7]:
promptized_data['train'][0]

'<s> <entity> StateMachine </entity> <relations>  </relations> <sep> <superType> StructuredClassifier Classifier Type Namespace PackageableElement </superType> </s>'

<h4> Creating tokenizers </h4>

In [8]:
class VocabTokenizer:
    def __init__(self, data, lower=True, special_tokens=[]):
        self.lower = lower
        self.vocab = {}
        self.special_tokens = special_tokens
        
        for i in self.special_tokens:
            self.vocab[i] = len(self.vocab)

        for split_type in data:
            for i in data[split_type]:
                word = " ".join(i) if isinstance(i, tuple) else i
                for j in word_tokenize(clean_text(word) if not self.lower else clean_text(word).lower()):
                    if j not in self.vocab:
                        self.vocab[j] = len(self.vocab)
        
        self.pad_token_id = self.vocab[PAD]
        self.pad_token = PAD

        self.unknown_token_id = self.vocab[UNK]
        self.unknown_token = UNK
        
        self.index_to_key = {v: k for k, v in self.vocab.items()}
    
    def batch_encode(self, x, return_tensors=None, max_length=None):
        assert isinstance(x, list), "Input must be a list"
        batch_encodings = [self.encode(i) for i in tqdm(x, desc='Encoding')]
        lengths = [len(i) for i in batch_encodings]
        perc_max_length = int(np.percentile(lengths, 99.95))
        max_length = 512 if max_length is None else (perc_max_length if max_length == 'percentile' else max_length)
        max_length = min(max_length, max([len(i) for i in batch_encodings]))
        
        batch_input_ids = [i[:min(max_length, len(i))] + [self.pad_token_id] * (max_length - min(max_length, len(i))) for i in batch_encodings]
        batch_attention_mask = [[1] * min(max_length, len(i)) + [0] * (max_length - min(max_length, len(i))) for i in batch_encodings]

        if return_tensors == 'pt':
            return {
                'input_ids': torch.LongTensor(batch_input_ids),
                'attention_mask': torch.LongTensor(batch_attention_mask)
            }
        elif return_tensors == 'np':
            return {
                'input_ids': np.array(batch_input_ids),
                'attention_mask': np.array(batch_attention_mask)
            }
        else:
            return {
                'input_ids': batch_input_ids,
                'attention_mask': batch_attention_mask
            }


    def encode(self, x, return_tensors=None):
        input_ids = self(x)
        if return_tensors == 'pt':
            return torch.LongTensor(input_ids)
        elif return_tensors == 'np':
            return np.array(input_ids)
        return input_ids
    

    def __call__(self, x):
        if isinstance(x, tuple) or isinstance(x, list):
            x = " ".join(x)
        
        words, x = x.split(), list()
        for i in range(0, len(words)):
            if words[i] in self.special_tokens:
                x.append(words[i])
            else:
                x.extend(word_tokenize(clean_text(words[i]) if not self.lower else clean_text(words[i]).lower()))

        return [self.vocab.get(i, self.vocab['<unk>']) for i in x]
    
    def decode(self, x):
        assert isinstance(x, list), "Input must be a list"
        return [self.index_to_key[i] for i in x]
    
    def __len__(self):
        return len(self.vocab)
    
    def get_vocab(self):
        return self.vocab

    def add_special_tokens(self, special_tokens):
        for i in special_tokens:
            if i not in self.vocab:
                self.vocab[i] = len(self.vocab)
        self.index_to_key = {v: k for k, v in self.vocab.items()}
    
    def __str__(self) -> str:
        return f"VocabTokenizer(vocab_size={len(self.vocab)})"


In [9]:
def get_pretrained_lm_tokenizer(model_name, special_tokens=SPECIAL_TOKENS):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
    print("Vocab size: ", len(tokenizer))
    return tokenizer

def get_word_tokenizer_tokenizer(data, lower=True, special_tokens=SPECIAL_TOKENS):
    tokenizer = VocabTokenizer(data, lower=lower, special_tokens=special_tokens)
    print("Vocab size: ", len(tokenizer))
    return tokenizer

# Testing Tokenizer
# txt = data['train'][7]
# print(txt)
# print(vocab_tokenizer(txt))
# print(vocab_tokenizer.decode(vocab_tokenizer(txt)))

# txt = promtized_data['train'][7]
# print(txt)
# print(vocab_tokenizer(txt))
# print(vocab_tokenizer.decode(vocab_tokenizer(txt)))

# txt = promtized_data_for_super_type['train'][7]
# print(txt)
# print(vocab_tokenizer(txt))
# print(vocab_tokenizer.decode(vocab_tokenizer(txt)))


# txt = promtized_data_for_entity['train'][7]
# print(txt)
# print(vocab_tokenizer(txt))
# print(vocab_tokenizer.decode(vocab_tokenizer(txt)))


In [10]:
model_name = 'bert-base-cased'
vocab_tokenizer = get_word_tokenizer_tokenizer(promptized_data)
pretrained_lm_tokenizer = get_pretrained_lm_tokenizer(model_name)

Vocab size:  76838
Vocab size:  29008


<h3> Creating Dataset and DataLoaders </h3>

In [79]:
def get_super_type_labels(super_types, super_type_map, multi_label=False):
    stp_labels = [[super_type_map[j] for j in super_type] for super_type in super_types]
    if not multi_label:
        stp_labels = np.array([i[0] for i in stp_labels])
        stp_labels = torch.from_numpy(stp_labels)
    else:
        l = list()
        for stp_label in stp_labels:
            row = torch.zeros(len(super_type_map))
            for label in stp_label:
                row[label] = 1
            l.append(row)
            
        stp_labels = torch.stack(l)
        
    return stp_labels

def get_encoding_size(data, tokenizer):
    tokens = tokenizer(data)
    lengths = [len(i) for i in tokens['input_ids']]
    return int(np.percentile(lengths, 99.95))

In [12]:
class GenerativeUMLDataset(Dataset):
    def __init__(self, data, tokenizer):
        super().__init__()
        self.data = data
        
        if isinstance(tokenizer, VocabTokenizer):
            self.inputs = tokenizer.batch_encode(data, return_tensors='pt', max_length='percentile')
        else:
            max_token_length = get_encoding_size(data, tokenizer)
            self.inputs = tokenizer(data, padding=True, return_tensors='pt', max_length=max_token_length, truncation=True)
        self.labels = self.inputs['input_ids'].clone()
        self.labels[self.labels == tokenizer.pad_token_id] = -100

        print(self.labels[0].shape)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': self.labels[idx]
        }
    

class SuperTypeClassificationDataset(Dataset):
    def __init__(self, data, tokenizer, super_type_map, multi_label=False):
        super().__init__()
        self.data = data
        super_type_inputs = [i[0] for i in data]
        super_type_labels = [i[1] for i in data]
        if isinstance(tokenizer, VocabTokenizer):
            self.inputs = tokenizer.batch_encode(super_type_inputs, return_tensors='pt', max_length='percentile')
        else:
            max_token_length = get_encoding_size(super_type_inputs, tokenizer)
            self.inputs = tokenizer(super_type_inputs, padding=True, return_tensors='pt', max_length=max_token_length, truncation=True)
        
        self.labels = get_super_type_labels(super_type_labels, super_type_map, multi_label=multi_label)
        self.i2c = {v: k for k, v in super_type_map.items()}
        self.multi_label = multi_label
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': self.labels[idx]
        }


class EntityClassificationDataset(Dataset):
    def __init__(self, data, tokenizer, label_map):
        super().__init__()
        self.data = data
        entity_inputs = [i[0] for i in data]
        entity_labels = [i[1] for i in data]
        if isinstance(tokenizer, VocabTokenizer):
            self.inputs = tokenizer.batch_encode(entity_inputs, return_tensors='pt', max_length='percentile')
        else:
            max_token_length = get_encoding_size(entity_inputs, tokenizer)
            self.inputs = tokenizer(entity_inputs, padding=True, return_tensors='pt', max_length=max_token_length, truncation=True)
        self.labels = [label_map[i] for i in entity_labels]
        self.i2c = {v: k for k, v in label_map.items()}
        
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': self.labels[idx]
        }

In [91]:
def get_generative_uml_dataset(data, tokenizer):
    dataset = {
        split_type: GenerativeUMLDataset(data[split_type][:100], tokenizer) for split_type in data
    }
    return dataset

def get_super_type_classification_dataset(data, tokenizer, super_type_map, multi_label=False):
    dataset = {
        split_type: SuperTypeClassificationDataset(
            data[split_type][:100], tokenizer, super_type_map, multi_label=multi_label) for split_type in data
    }
    return dataset

def get_entity_classification_dataset(data, tokenizer, label_map):
    dataset = {
        split_type: EntityClassificationDataset(
            data[split_type][:100], tokenizer, label_map) for split_type in data
    }
    return dataset

In [109]:
def get_classification_dataset(data, tokenizer, encoder, class_type='super', multi_label=False):
    if class_type == 'super':
        return get_super_type_classification_dataset(data, tokenizer, encoder, multi_label=multi_label)
    else:
        return get_entity_classification_dataset(data, tokenizer, encoder)

In [14]:
vocab_dataset = get_generative_uml_dataset(promptized_data, vocab_tokenizer)

Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

torch.Size([20])


Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

torch.Size([25])


Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

torch.Size([22])


In [15]:
lm_dataset = get_generative_uml_dataset(promptized_data, pretrained_lm_tokenizer)

torch.Size([58])
torch.Size([98])
torch.Size([116])


In [16]:
lm_dataset['train'][0]

{'input_ids': tensor([  101, 28998, 29004,  1426,  2107, 19226,  1673, 29005, 29006, 29007,
         29001, 29002, 25341,  1181,  1658, 17223, 17792,  3699, 17792,  6902,
         13313, 12204, 14667,  2553,  1895,  2036, 20041, 29003, 28999,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([  101, 28998, 29004,  1426,  2107, 19226,  1673, 29005, 29006, 29007,
         29001, 29002, 25341,  1181,  1658, 17223, 17792,  3699, 17792,  6902,
         13313, 12204, 14667,  2553,  1895,  2036, 20041, 29003, 28999,   102,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,

In [17]:
len(lm_dataset['train']), len(vocab_dataset['train'])

(100, 100)

In [18]:
def save_dataset(dataset, file_name):
    with open(file_name, 'wb') as f:
        pickle.dump(dataset, f)

def load_dataset(file_name):
    with open(file_name, 'rb') as f:
        dataset = pickle.load(f)
    return dataset

In [19]:
print(len(lm_dataset['train']), len(vocab_dataset['train']))
print(len(lm_dataset['test']), len(vocab_dataset['test']))
print(len(lm_dataset['unseen']), len(vocab_dataset['unseen']))

100 100
100 100
100 100


In [20]:
lm_dataset['train']

<__main__.GenerativeUMLDataset at 0x7fd525a4ce50>

In [21]:
data_for_super_type_classification['train'][0][1]

['StructuredClassifier',
 'Classifier',
 'Type',
 'Namespace',
 'PackageableElement']

In [22]:
super_types_map = {k: i for i, k in enumerate({j for _, data in data_for_super_type_classification.items() for sp in data for j in sp[1]})}
entity_map = {k: i for i, k in enumerate({en[1] for _, data in data_for_entity_classification.items() for en in data})}

In [23]:
vocab_super_classification_dataset = get_super_type_classification_dataset(
    data_for_super_type_classification, vocab_tokenizer, super_types_map)

lm_super_classification_dataset = get_super_type_classification_dataset(
    data_for_super_type_classification, pretrained_lm_tokenizer, super_types_map)

Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
vocab_entity_classification_dataset = get_entity_classification_dataset(
    data_for_entity_classification, vocab_tokenizer, entity_map)

lm_entity_classification_dataset = get_entity_classification_dataset(
    data_for_entity_classification, pretrained_lm_tokenizer, entity_map)

Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

Encoding:   0%|          | 0/100 [00:00<?, ?it/s]

In [25]:
print(len(vocab_super_classification_dataset['train']), len(lm_super_classification_dataset['train']))
print(len(vocab_super_classification_dataset['test']), len(lm_super_classification_dataset['test']))
print(len(vocab_super_classification_dataset['unseen']), len(lm_super_classification_dataset['unseen']))

print(len(vocab_entity_classification_dataset['train']), len(lm_entity_classification_dataset['train']))
print(len(vocab_entity_classification_dataset['test']), len(lm_entity_classification_dataset['test']))
print(len(vocab_entity_classification_dataset['unseen']), len(lm_entity_classification_dataset['unseen']))

100 100
100 100
100 100
100 100
100 100
100 100


<h4> Saving datasets </h4>

<h3> Training Language Models </h3>

<h4> Creating custom GPT </h4>

In [26]:
import torch
import torch.nn as nn
torch.manual_seed(42)

<torch._C.Generator at 0x7fd54f484b50>

In [191]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def weights_init(model):
    if isinstance(model, nn.Linear):
        nn.init.xavier_uniform_(model.weight.data)
        if model.bias is not None:
            nn.init.zeros_(model.bias.data)


class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, embed_dim, head_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(embed_dim, head_size, bias=False)
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(head_size, head_size)))
        self.softmax = nn.Softmax(dim=-1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, C)
        q = self.query(x) # (B, T, C)

        # Compute attention scores ("affinities") only where the mask is non-zero
        wei = q @ k.transpose(-2, -1) * C**-0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill((attention_mask.unsqueeze(1) == 0), float('-inf'))  # (B, T, T)
        wei = self.softmax(wei)  # (B, T, T)
        wei = self.dropout(wei)

        # Perform the weighted aggregation of the values
        v = self.value(x)  # (B, T, C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        head_size = embed_dim // num_heads
        self.heads = nn.ModuleList([Head(embed_dim, head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask):
        out = torch.cat([h(x, attn_mask) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, input_dim, embed_dim=None, num_classes=None, dropout=0.1):
        super().__init__()
        if embed_dim is None:
            embed_dim = input_dim

        self.net = nn.Sequential(
            nn.Linear(input_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim if num_classes is None else num_classes),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, embed_dim, n_head):
        # embed_dim: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.sa = MultiHeadAttention(embed_dim, n_head)
        self.ffwd = FeedFoward(embed_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x, attn_mask):
        x = x + self.sa(self.ln1(x), attn_mask)
        x = x + self.ffwd(self.ln2(x))
        return x


class UMLGPT(nn.Module):

    def __init__(self, vocab_size, embed_dim, block_size, n_layer, n_head, load_pretrained_from=None):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table

        if load_pretrained_from is not None:
            self.load_pretrained(load_pretrained_from)
        else:
            self.token_embedding_table = nn.Embedding(vocab_size, embed_dim)
            self.position_embedding_table = nn.Embedding(block_size, embed_dim)
            self.blocks = nn.Sequential(*[Block(embed_dim, n_head) for _ in range(n_layer)])
            self.ln_f = nn.LayerNorm(embed_dim) # final layer norm
            self.lm_head = nn.Linear(embed_dim, vocab_size)

            self.apply(weights_init)


    def forward(self, x, attention_mask):
        embeddings = self.get_embedding(x, attention_mask)
        logits = self.lm_head(embeddings)
        return logits


    def get_loss(self, logits, labels, ignore_index=-100):
        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss(ignore_index=ignore_index)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        
        return loss
    
    def get_embedding(self, x, attention_mask):
        # x: [batch_size, seq_len]
        # attention_mask: [batch_size, seq_len]
        token_embeddings = self.token_embedding_table(x)
        position_ids = torch.arange(x.size(1), dtype=torch.long, device=x.device)
        position_ids = position_ids.unsqueeze(0).expand_as(x)
        position_embeddings = self.position_embedding_table(position_ids)
        embeddings = token_embeddings + position_embeddings

        # # Modify the forward pass to include src_key_padding_mask
        for block in self.blocks:
            embeddings = block(embeddings, attention_mask)

        embeddings = self.ln_f(embeddings)
        return embeddings


    def get_model_size(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
    
    def __repr__(self):
        return super().__repr__() + f'\nNumber of parameters: {self.get_model_size() / 1000000:.3f}M'
    
    @staticmethod
    def from_pretrained(state_dict_pth):
        state_dict = torch.load(state_dict_pth, map_location=device)
        vocab_size, embed_dim = [s.shape for _, s in state_dict.items() if 'token_embedding_table' in _][0]
        num_heads = max([int(name.split('.sa.heads.')[1].split('.')[0]) for name, s in state_dict.items() if '.sa.heads.' in name]) + 1
        block_size = [s.shape[0] for _, s in state_dict.items() if 'position_embedding_table' in _][0]
        num_layers = max([int(name.split('blocks.')[1].split('.')[0]) for name, s in state_dict.items() if 'blocks.' in name]) + 1
        model = UMLGPT(vocab_size, embed_dim, block_size, num_layers, num_heads)
        model.load_state_dict(state_dict)
        return model
    
    


In [28]:
len(vocab_tokenizer), len(pretrained_lm_tokenizer)

(76838, 29008)

In [132]:
args.lr = 1e-3
args.batch_size = 32
args.epochs = 1
args.log_dir = 'logs'
args.models_dir = 'models'
args.from_pretrained = None
args.embed_dim = 128
args.block_size = 512
args.num_layers = 1
args.num_heads = 8

In [133]:
def get_uml_gpt(input_dim, args):
    embed_dim = args.embed_dim
    n_layer = args.num_layers
    n_head = args.num_heads
    block_size = args.block_size

    uml_gpt = UMLGPT(input_dim, embed_dim, block_size, n_layer, n_head)
    if args.from_pretrained is not None:
        uml_gpt.load_state_dict(torch.load(os.path.join(args.models_dir, args.from_pretrained)))
        print(f'Loaded pretrained model from {args.from_pretrained}')
    
    uml_gpt.to(device)
    return uml_gpt

In [134]:
args.from_pretrained = None

In [135]:
plm_uml_gpt = get_uml_gpt(len(pretrained_lm_tokenizer), args)
vocab_uml_gpt = get_uml_gpt(len(vocab_tokenizer), args)

In [136]:
def get_dataloaders(dataset, batch_size=32):
    dataloaders = {
        split_type: DataLoader(
            dataset[split_type], batch_size=batch_size, shuffle=split_type == 'train') for split_type in dataset
    }
    return dataloaders

In [137]:
from torch.utils.tensorboard import SummaryWriter


class UMLGPTTrainer:
    def __init__(self, model, dataloaders, args, compute_metrics_fn=None):
        self.model = model
        self.lr = args.lr
        self.batch_size = args.batch_size
        self.dataloaders = dataloaders
        self.args = args
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr)
        self.criterion = nn.CrossEntropyLoss(ignore_index=-100)
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=args.num_epochs)
        self.writer = SummaryWriter(log_dir=args.log_dir)
        self.models_dir = args.models_dir
        self.model_str = self.model_str = f'{self.model._get_name()}_Vocab{args.trainer}'
        self.compute_metrics_fn = compute_metrics_fn
    
    def train(self, epochs):
        self.model.train()
                    
        for epoch in range(epochs):
            epoch_loss = 0
            best_test_loss = float('inf')
            epoch_metrics = {'loss': 0}
            for i, batch in tqdm(enumerate(self.dataloaders['train']), desc=f'Epoch {epoch}', total=len(self.dataloaders['train'])):
                loss, logits, labels = self.step(batch)
                epoch_loss += loss.item()

                epoch_metrics['loss'] += epoch_loss

                if self.compute_metrics_fn is not None:
                    metrics = self.compute_metrics_fn(logits, labels)
                    for metric in metrics:
                        if metric not in epoch_metrics:
                            epoch_metrics[metric] = 0
                        epoch_metrics[metric] += metrics[metric]

                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                loss.backward()
                self.optimizer.step()
                # self.scheduler.step()
                self.optimizer.zero_grad()

                if i % 100 == 0:
                    print(f'Epoch {epoch} Batch {i} Avg Loss: {epoch_loss / (i + 1)}')

            self.scheduler.step()

            self.write_metrics(epoch_metrics, epoch, 'train')

            test_loss = self.evaluate(epoch, 'test')
            self.evaluate(epoch, 'unseen')

            if test_loss < best_test_loss:
                best_test_loss = test_loss
                self.save_model(f'{self.model_str}_best_model.pt')
                print(f'Best model saved at epoch {epoch}')
    
                
    def evaluate(self, epoch, split_type='test'):
        self.model.eval()
        eval_metrics = {'loss': 0}
        for batch in tqdm(self.dataloaders[split_type], desc=f'Evaluation'):
            loss, logits, labels = self.step(batch)

            if self.compute_metrics_fn is not None:
                metrics = self.compute_metrics_fn(logits, labels)
                for metric in metrics:
                    if metric not in eval_metrics:
                        eval_metrics[metric] = 0
                    eval_metrics[metric] += metrics[metric]

            eval_metrics['loss'] += loss.item()

        for metric in eval_metrics:
            if metric != 'loss':
                eval_metrics[metric] /= len(self.dataloaders[split_type])

        self.write_metrics(eval_metrics, epoch, split_type)
        return eval_metrics['loss']
    

    def step(self, batch):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        logits = self.model(input_ids, attention_mask)
        loss = self.model.get_loss(logits, labels)
        return loss, logits, labels


    def save_model(self, file_name):
        if not os.path.exists(self.models_dir):
            os.makedirs(self.models_dir)
        file_name = os.path.join(self.models_dir, file_name)
        torch.save(self.model.state_dict(), file_name)
        print(f'Saved model at {file_name}')
    
    def load_model(self, file_name):
        file_name = os.path.join(self.models_dir, file_name)
        self.model.load_state_dict(torch.load(file_name))
        print(f'Loaded model from {file_name}')
    
    
    def write_metrics(self, metrics, epoch, split_type):
        print(f'Epoch {epoch} {split_type} metrics: ', end='')
        for metric in metrics:
            self.writer.add_scalar(f'Metrics/{split_type}{metric}', metrics[metric], epoch)
            print(f'{metric}: {metrics[metric]:.3f}', end=' ')
        print()



In [138]:
args.trainer = 'PT'
lm_trainer = UMLGPTTrainer(plm_uml_gpt, get_dataloaders(lm_dataset), args)
args.trainer = 'CT'
vocab_trainer = UMLGPTTrainer(vocab_uml_gpt, get_dataloaders(vocab_dataset), args)

In [139]:
plm_uml_gpt

UMLGPT(
  (token_embedding_table): Embedding(29008, 128)
  (position_embedding_table): Embedding(512, 128)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (key): Linear(in_features=128, out_features=16, bias=False)
            (query): Linear(in_features=128, out_features=16, bias=False)
            (value): Linear(in_features=128, out_features=16, bias=False)
            (softmax): Softmax(dim=-1)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): ReLU()
          (2): Linear(in_features=512, out_features=128, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (ln1): LayerNorm((128,), 

In [140]:
lm_trainer.train(args.epochs)
vocab_trainer.train(args.epochs)

Epoch 0:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 0 Batch 0 Avg Loss: 10.279776573181152
Epoch 0 train metrics: loss: 101.950 


Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 0 test metrics: loss: 39.544 


Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 0 unseen metrics: loss: 39.641 
Saved model at models/UMLGPT_VocabPT_best_model.pt
Best model saved at epoch 0


Epoch 0:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 0 Batch 0 Avg Loss: 11.228178977966309
Epoch 0 train metrics: loss: 111.063 


Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 0 test metrics: loss: 42.913 


Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 0 unseen metrics: loss: 42.931 
Saved model at models/UMLGPT_VocabCT_best_model.pt
Best model saved at epoch 0


In [36]:
class GPT2Dataset(Dataset):
    def __init__(self, tokenized):
        self.tokenized = tokenized
    
    def __len__(self):
        return len(self.tokenized['input_ids'])
    
    def __getitem__(self, index):
        item = {key: val[index] for key, val in self.tokenized.items()}
        return item
    
def get_gpt2_tokenized_data(data, tokenizer):
    tokenized_data = {
        split_type: tokenizer(
            data[split_type], 
            padding=True, 
            return_tensors='pt', 
            max_length=get_encoding_size(data[split_type], tokenizer), 
            truncation=True
        ) for split_type in data
    }
    return tokenized_data

def get_gpt2_dataset(data, tokenizer):
    tokenized_data = get_gpt2_tokenized_data(data, tokenizer)
    dataset = {
        split_type: GPT2Dataset(tokenized_data[split_type]) for split_type in data
    }
    return dataset

In [141]:
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling

from transformers.integrations import NeptuneCallback

def suppress_neptune(trainer):
    for cb in trainer.callback_handler.callbacks:
        if isinstance(cb, NeptuneCallback):
            trainer.callback_handler.remove_callback(cb)


def train_hugging_face_gpt(data, args):
    model_name = args.gpt_model
    tokenizer = get_pretrained_lm_tokenizer(model_name, special_tokens=args.special_tokens)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = model.config.eos_token_id

    print('Creating dataset...')
    dataset = get_gpt2_dataset(data, tokenizer)
    print('Done!')

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # Set to True if you want to perform masked language modeling
    )


    training_args = TrainingArguments(
        output_dir=args.log_dir,          # output directory
        num_train_epochs=args.num_epochs,              # total number of training epochs
        per_device_train_batch_size=args.batch_size,   # batch size per device during training
        per_device_eval_batch_size=args.batch_size,    # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir=args.log_dir,            # directory for storing logs
        logging_steps=10,
        save_steps=1000,
        save_total_limit=1,
        evaluation_strategy='steps',
        eval_steps=100,
        lr_scheduler_type="cosine",
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        fp16=True,
        greater_is_better=False
    )

    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=dataset['train'],         # training dataset
        eval_dataset=dataset['test'],          # evaluation dataset
        data_collator=data_collator,
    )

    suppress_neptune(trainer)

    trainer.train()

    print('Evaluating on test set...')
    trainer.evaluate(dataset['test'])

    print('Evaluating on unseen set...')
    trainer.evaluate(dataset['unseen'])

    trainer.save_model(os.path.join(args.log_dir, f'uml_{model_name}'))


    print('Done!')

<h3> Creating Sequence Classification Models </h3>

In [187]:
class UMLGPTClassifier(nn.Module):

    def __init__(self, model, num_classes):
        super().__init__()
        
        self.model = model
        _, embed_dim = self.model.lm_head.weight.data.shape
        self.classifier = FeedFoward(input_dim=embed_dim, num_classes=num_classes)
        self.apply(weights_init)

    def forward(self, x, attention_mask, pool=None):
        # x: [batch_size, seq_len]
        # attention_mask: [batch_size, seq_len]
        lm_logits = self.model.get_embedding(x, attention_mask)
        if pool:
            """Pool the logits across the sequence dimension"""
            lm_logits = torch.mean(lm_logits, dim=1)
        else:
            """Use the logits at the last position"""
            lm_logits = lm_logits[:, -1, :]
        
        logits = self.classifier(lm_logits)
        return logits
    
    def get_loss(self, logits, labels):
        logits = logits.to(device)
        labels = labels.to(device)

        if len(labels.shape) == 1:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
        else:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.float(), labels.float())
        return loss

    def get_model_size(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
    
    def __repr__(self):
        return super().__repr__() + f'\nNumber of parameters: {self.get_model_size()/1000000:.3f}M'
    
    @staticmethod
    def from_pretrained(state_dict, num_classes):
        model = UMLGPTClassifier(UMLGPT.from_pretrained(state_dict), num_classes)
        return model

In [144]:
uml_gpt_vocab_entity_classifier = UMLGPTClassifier(vocab_uml_gpt, len(entity_map))
uml_gpt_vocab_super_type_classifier = UMLGPTClassifier(vocab_uml_gpt, len(super_types_map))

In [145]:
uml_gpt_plm_entity_classifier = UMLGPTClassifier(plm_uml_gpt, len(entity_map))
uml_gpt_plm_super_type_classifier = UMLGPTClassifier(plm_uml_gpt, len(super_types_map))

In [57]:
uml_gpt_plm_entity_classifier._get_name()

'UMLGPTClassifier'

In [58]:
vocab_super_classification_dataset['train'][0]

{'input_ids': tensor([ 2,  8, 14,  9, 10, 11,  3,  0,  0,  0,  0,  0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]),
 'labels': tensor(10806)}

In [103]:
from data_utils import get_recommendation_metrics, get_recommendation_metrics_multi_label

In [150]:
def train_uml_gpt_classification(data, encoder, tokenizer, compute_metrics_fn, args):
    data = get_data_for_classification(data, class_type=args.class_type)
    dataset = get_classification_dataset(data, tokenizer, encoder, class_type=args.class_type, multi_label=args.multi_label)
    model = get_uml_gpt(len(tokenizer), args)
    uml_gpt_classifier = UMLGPTClassifier(model, len(encoder))
    uml_gpt_trainer = UMLGPTTrainer(uml_gpt_classifier, get_dataloaders(dataset), args, compute_metrics_fn=compute_metrics_fn)
    uml_gpt_trainer.train(args.epochs)

In [None]:
args.multi_label = False
args.class_type = 'entity'

args.from_pretrained = 'UMLGPT_VocabCT_best_model.pt'
train_uml_gpt_classification(data, entity_map, vocab_tokenizer, get_recommendation_metrics, args)

print('-'*100)

args.from_pretrained = 'UMLGPT_VocabPT_best_model.pt'
train_uml_gpt_classification(data, entity_map, pretrained_lm_tokenizer, get_recommendation_metrics, args)

print('-'*100)
print('-'*100)
print('-'*100)

args.class_type = 'super'

args.from_pretrained = 'UMLGPT_VocabCT_best_model.pt'
train_uml_gpt_classification(data, super_types_map, vocab_tokenizer, get_recommendation_metrics, args)

print('-'*100)

args.from_pretrained = 'UMLGPT_VocabPT_best_model.pt'
train_uml_gpt_classification(data, super_types_map, pretrained_lm_tokenizer, get_recommendation_metrics, args)

print('-'*100)
print('-'*100)
print('-'*100)


args.multi_label = True
args.from_pretrained = 'UMLGPT_VocabCT_best_model.pt'
train_uml_gpt_classification(data, super_types_map, vocab_tokenizer, get_recommendation_metrics_multi_label, args)

print('-'*100)

args.from_pretrained = 'UMLGPT_VocabPT_best_model.pt'
train_uml_gpt_classification(data, super_types_map, pretrained_lm_tokenizer, get_recommendation_metrics_multi_label, args)

print('-'*100)
print('-'*100)

In [None]:
import transformers
import utils

def train_hf_for_classification(model_name, tokenizer, dataset, args):
    batch_size = args.lm_batch_size
    train, test, unseen = dataset['train'], dataset['test'], dataset['unseen']
    # Show the training loss with every epoch
    logging_steps = len(train) // batch_size
    print(f"Using model...{model_name}")
    model = utils.get_classification_model(model_name, len(dataset.num_labels), tokenizer)
    model.resize_token_embeddings(len(tokenizer))
    print("Finetuning model...")
    training_args = TrainingArguments(
        output_dir=args.out_dir,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=args.warmup_steps,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=True,
        logging_steps=logging_steps,
        num_train_epochs=args.num_epochs,
        save_total_limit=2,
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=test,
        tokenizer=tokenizer,
        compute_metrics=utils.compute_metrics,
    )
    for cb in trainer.callback_handler.callbacks:
        if isinstance(cb, transformers.integrations.NeptuneCallback):
            trainer.callback_handler.remove_callback(cb)

In [153]:
state_dict = torch.load('models/UMLGPT_VocabPT_best_model.pt')

In [155]:
state_dict.keys()

odict_keys(['token_embedding_table.weight', 'position_embedding_table.weight', 'blocks.0.sa.heads.0.tril', 'blocks.0.sa.heads.0.key.weight', 'blocks.0.sa.heads.0.query.weight', 'blocks.0.sa.heads.0.value.weight', 'blocks.0.sa.heads.1.tril', 'blocks.0.sa.heads.1.key.weight', 'blocks.0.sa.heads.1.query.weight', 'blocks.0.sa.heads.1.value.weight', 'blocks.0.sa.heads.2.tril', 'blocks.0.sa.heads.2.key.weight', 'blocks.0.sa.heads.2.query.weight', 'blocks.0.sa.heads.2.value.weight', 'blocks.0.sa.heads.3.tril', 'blocks.0.sa.heads.3.key.weight', 'blocks.0.sa.heads.3.query.weight', 'blocks.0.sa.heads.3.value.weight', 'blocks.0.sa.heads.4.tril', 'blocks.0.sa.heads.4.key.weight', 'blocks.0.sa.heads.4.query.weight', 'blocks.0.sa.heads.4.value.weight', 'blocks.0.sa.heads.5.tril', 'blocks.0.sa.heads.5.key.weight', 'blocks.0.sa.heads.5.query.weight', 'blocks.0.sa.heads.5.value.weight', 'blocks.0.sa.heads.6.tril', 'blocks.0.sa.heads.6.key.weight', 'blocks.0.sa.heads.6.query.weight', 'blocks.0.sa.heads.

In [17]:
"""
Given X, y pairs
Create TF-IDF Vectorizor for the data and then use SVM to classify the data
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report


def get_tfidf_vectorizer(data):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(data)
    return vectorizer


def get_Xy(data, split):
    X = [j[0] for j in data[split]]
    y = [j[1] for j in data[split]]
    
    return X, y

def get_svm_classifier(X, y):
    clf = SVC()
    clf.fit(X, y)
    return clf

def get_svm_metrics(clf, X, y):
    X, y = get_Xy(data)
    y_pred = clf.predict(X)
    print(classification_report(y, y_pred))
    print(f'F1 Score: {f1_score(y, y_pred, average="macro")}')
    print(f'Precision Score: {precision_score(y, y_pred, average="macro")}')
    print(f'Recall Score: {recall_score(y, y_pred, average="macro")}')
    

In [39]:
from sklearn.calibration import LabelEncoder


Xy = [(j[0], k) for j in data_for_super_type_classification['train'] for k in j[1]] \
        + [(j[0], k) for j in data_for_super_type_classification['test'] for k in j[1]] \
            + [(j[0], k) for j in data_for_super_type_classification['unseen'] for k in j[1]]

('<s> <entity> CFlowPointCut </entity> <relations>  </relations> </s>',
 'PointCut')

In [40]:
tf_idf_vectorizer = get_tfidf_vectorizer([i[0] for i in Xy])
X = tf_idf_vectorizer.transform([i[0] for i in Xy])

y = LabelEncoder().fit_transform([i[1] for i in Xy])

In [53]:
i2c = {k: v for v, k in zip(y, [i[1] for i in Xy])}

In [55]:
Xy_train = [(j[0], k) for j in data_for_super_type_classification['train'] for k in j[1]]
Xy_test = [(j[0], k) for j in data_for_super_type_classification['test'] for k in j[1]]
Xy_unseen = [(j[0], k) for j in data_for_super_type_classification['unseen'] for k in j[1]]

X_train = tf_idf_vectorizer.transform([i[0] for i in Xy_train])
X_test = tf_idf_vectorizer.transform([i[0] for i in Xy_test])
X_unseen = tf_idf_vectorizer.transform([i[0] for i in Xy_unseen])

y_train = np.array([i2c[i[1]] for i in Xy_train])
y_test = np.array([i2c[i[1]] for i in Xy_test])
y_unseen = np.array([i2c[i[1]] for i in Xy_unseen])

In [58]:
clf = SVC()
clf.fit(X_train, y_train)