<a href="https://colab.research.google.com/github/hepbc/NLP/blob/master/BERT_training_with_cos_v1_CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
%pip install transformers

In [None]:
from transformers import BertTokenizer

# Load pre-trained model tokenizer (vocabulary)
# The same checkpoint name ('bert-base-uncased') is passed to
# BertForTokenClassification.from_pretrained further down, so the vocabulary
# used here matches the model that is fine-tuned.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Alternative checkpoint kept for reference; AutoTokenizer is not imported in this notebook.
#tokenizer = AutoTokenizer.from_pretrained("ipuneetrathore/bert-base-cased-finetuned-finBERT")

In [None]:
# Colab-only step: `google.colab` is not importable outside Google Colab.
# NOTE(review): presumably this is where the CSV read by the next cell gets
# uploaded into the working directory — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
# Load the labelled examples: a "Text" column plus per-word tag columns
# (the tag columns are consumed by the next cell).
lines = pd.read_csv(r"Bert example tags expanded with cos.csv", header=0)

# Shuffle the rows once. The seed is fixed so that a Restart & Run All yields
# the same row order (and therefore the same train/validation split and the
# same example behind any hard-coded row index used later).
lines = lines.sample(frac=1, random_state=2020)

# Lower-case every sentence because the tokenizer checkpoint is uncased.
txt = lines["Text"].tolist()
text = [line.lower() for line in txt]

# Show only a small preview instead of dumping the whole list.
text[:5]

In [None]:
# Build, for every row of `lines`:
#   * `tags`       - one string per sentence holding its word labels, each
#                    preceded by a single space (e.g. " B-x I-x O")
#   * `slot_names` - list of distinct labels, index == label id
#   * `slot_map`   - inverse mapping label -> id
# Id 0 is reserved for the "[PAD]" label.
FIRST_TAG_COL = 2   # tag columns start at the third column of the CSV
MAX_TAG_COLS = 11   # at most this many tag columns are read per row

tags = []
slot_names = ["[PAD]"]
slot_map = {"[PAD]": 0}

# Never index past the last column that actually exists in the frame.
last_tag_col = min(FIRST_TAG_COL + MAX_TAG_COLS, lines.shape[1])

for i in range(len(lines)):
    row_labels = []
    for slot in lines.iloc[i, FIRST_TAG_COL:last_tag_col]:
        # The first empty cell marks the end of this sentence's labels.
        if pd.isna(slot):
            break
        # Use the string form everywhere so the keys of `slot_map` always
        # match the whitespace-separated labels stored in `tags`.
        label = str(slot)
        if label not in slot_map:
            slot_map[label] = len(slot_map)
            slot_names.append(label)
        row_labels.append(label)
    tags.append("".join(" " + label for label in row_labels))

print(slot_names)
print(slot_map)

In [None]:
import pickle

In [None]:
# Write two consecutive pickle records to the same file: first `slot_names`,
# then `slot_map`. A reader must call pickle.load twice, in that order.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("slot_and_intent_data", "wb") as file:
    pickle.dump(slot_names, file)
    pickle.dump(slot_map, file)
    #pickle.dump(intent_names, file)
    #pickle.dump(intent_map, file)

In [None]:
def encode_token_labels(text_sequences, slot_names, tokenizer, slot_map,
                        max_length, verbose=False):
    """Turn per-word labels into per-word-piece label ids, padded to max_length.

    Args:
        text_sequences: iterable of sentences (whitespace-separated words).
        slot_names: iterable of label strings, one per sentence, holding one
            whitespace-separated label per word. (Despite the parameter name,
            this is the per-sentence tag string, not the label vocabulary.)
        tokenizer: object exposing ``tokenize(word) -> list`` of word pieces.
        slot_map: dict mapping label string -> integer id.
        max_length: width of the returned array.
        verbose: when True, print each sentence, its word pieces and label ids.

    Returns:
        ``np.int32`` array of shape ``(len(text_sequences), max_length)``.
        Column 0 is left at 0 for the leading special token; labels start at
        column 1; unused columns stay 0.

    Raises:
        KeyError: if a label is missing from ``slot_map``.
    """
    encoded = np.zeros(shape=(len(text_sequences), max_length), dtype=np.int32)
    # Column 0 belongs to the leading special token and the last column is
    # kept free for the trailing one, so at most max_length - 2 labels fit.
    max_labels = max(max_length - 2, 0)
    for i, (text_sequence, word_labels) in enumerate(
            zip(text_sequences, slot_names)):
        encoded_labels = []
        for word, word_label in zip(text_sequence.split(), word_labels.split()):
            tokens = tokenizer.tokenize(word)
            if not tokens:
                # A word that produces no word pieces occupies no position in
                # the encoded input, so it must not consume a label slot.
                continue
            encoded_labels.append(slot_map[word_label])
            # Continuation pieces of a "B-" word get the matching "I-" label
            # when that label exists; otherwise they repeat the word's label.
            expand_label = word_label.replace("B-", "I-")
            if expand_label not in slot_map:
                expand_label = word_label
            encoded_labels.extend([slot_map[expand_label]] * (len(tokens) - 1))
        # Truncate instead of raising a broadcast error on long sentences.
        encoded_labels = encoded_labels[:max_labels]
        if encoded_labels:
            encoded[i, 1:len(encoded_labels) + 1] = encoded_labels
        if verbose:
            print(text_sequence, word_labels)
            print(tokenizer.tokenize(text_sequence))
            print(encoded_labels)
    return encoded


# Earlier variants that encoded fixed train/test slices (kept for reference):
#slot_train = encode_token_labels(text, tags, tokenizer, slot_map, 25)
#slot_train = encode_token_labels(text[:75], tags, tokenizer, slot_map, 25)
#slot_test = encode_token_labels(text[75:], tags, tokenizer, slot_map, 25)
# Encode the labels of every sentence. `tags` (one label string per sentence)
# is passed in the parameter the function calls `slot_names`. The literal 25
# is the padded sequence length; encode_dataset is called with the same value.
slot_ids = encode_token_labels(text, tags, tokenizer, slot_map, 25)

In [None]:
def encode_dataset(tokenizer, text_sequences, max_length):
    """Encode sentences into fixed-width token-id and attention-mask arrays.

    Args:
        tokenizer: object exposing ``encode(text) -> list[int]``.
        text_sequences: sequence of sentences to encode.
        max_length: width of the returned arrays.

    Returns:
        Tuple ``(token_ids, attention_masks)``, both ``np.int32`` arrays of
        shape ``(len(text_sequences), max_length)``. Unused positions hold 0,
        and the mask is 1 wherever the token id is non-zero.
    """
    token_ids = np.zeros(shape=(len(text_sequences), max_length),
                         dtype=np.int32)
    for i, text_sequence in enumerate(text_sequences):
        encoded = list(tokenizer.encode(text_sequence))
        if len(encoded) > max_length:
            # Truncate over-long encodings instead of failing on the slice
            # assignment; keep the final id so the sequence still ends with
            # whatever closing token the tokenizer appended.
            encoded = (encoded[:max(max_length - 1, 0)] + encoded[-1:])[:max_length]
        token_ids[i, 0:len(encoded)] = encoded
    # Id 0 is the fill value used above, so non-zero ids mark real tokens.
    attention_masks = (token_ids != 0).astype(np.int32)
    return (token_ids, attention_masks)

# Earlier variants that encoded fixed train/test slices (kept for reference):
# encoded_train = encode_dataset(tokenizer, text[:75], 25)
# encoded_test = encode_dataset(tokenizer, text[75:], 25)
# Encode every sentence; 25 is the padded length and equals the max_length
# passed to encode_token_labels, so token ids and label ids share one width.
inputs_ids, attention_masks = encode_dataset(tokenizer, text, 25)

In [None]:
from tqdm import tqdm, trange

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from transformers import BertTokenizer, BertConfig

#from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Bare expression in the middle of a cell: evaluated but not displayed.
torch.__version__

# NOTE(review): MAX_LEN is not referenced anywhere else in the visible notebook;
# the encode_* calls pass the literal 25 instead — confirm they are meant to match.
MAX_LEN = 25
# Batch size handed to both DataLoaders below.
bs = 32


In [None]:
# Split token ids, label ids and attention masks in ONE call so the three
# arrays are guaranteed to be partitioned identically (the previous version
# used two calls that only stayed aligned because they shared a seed).
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(inputs_ids, slot_ids, attention_masks,
                                         random_state=2020, test_size=0.05)

# Build int64 tensors up front: the embedding lookup and the loss both expect
# LongTensor inputs, so no per-batch dtype cast is needed afterwards.
tr_inputs = torch.tensor(tr_inputs, dtype=torch.long)
val_inputs = torch.tensor(val_inputs, dtype=torch.long)
tr_tags = torch.tensor(tr_tags, dtype=torch.long)
val_tags = torch.tensor(val_tags, dtype=torch.long)
tr_masks = torch.tensor(tr_masks, dtype=torch.long)
val_masks = torch.tensor(val_masks, dtype=torch.long)

# Training batches are drawn in random order ...
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# ... validation batches in a fixed order.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
# Spot-check a single example: raw text, word pieces, label string,
# encoded labels, encoded token ids, and the label vocabulary.
sample_idx = 93
sample_text = text[sample_idx]
sample_tokens = tokenizer.tokenize(sample_text)

print(sample_text)
print(sample_tokens)
print(tags[sample_idx])
print(slot_ids[sample_idx])
print(inputs_ids[sample_idx])
print(tokenizer.encode(sample_tokens))
# print(tr_tags[93])
print(slot_map)

In [None]:
# Report the sizes of the training tensors, then of the dataset / sampler /
# loader built from them, followed by the same two lines for validation.
train_tensor_sizes = [len(tr_inputs), len(tr_tags), len(tr_masks)]
train_loader_sizes = [len(train_data), len(train_sampler), len(train_dataloader)]
print(*train_tensor_sizes)
print(*train_loader_sizes)

valid_tensor_sizes = [len(val_inputs), len(val_tags), len(val_masks)]
valid_loader_sizes = [len(valid_data), len(valid_sampler), len(valid_dataloader)]
print(*valid_tensor_sizes)
print(*valid_loader_sizes)

In [None]:
%pip install seqeval

In [None]:
from seqeval.metrics import f1_score, accuracy_score

In [None]:
import transformers
from transformers import BertForTokenClassification, AdamW

print(transformers.__version__)

# Token-classification head on top of BERT with one output per slot label.
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(slot_map),
    output_attentions = False,
    output_hidden_states = False
)

# True  -> fine-tune every weight of the network.
# False -> train only the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters that are conventionally excluded from weight decay. The BERT
    # layer-norm weights are named "...LayerNorm.weight"; 'gamma'/'beta' are
    # kept only so checkpoints using the older naming still match.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
    # The per-group option read by AdamW is `weight_decay`; the previous key
    # `weight_decay_rate` is not an optimizer option, so the two groups ended
    # up with the optimizer's default decay instead of 0.01 / 0.0.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# NOTE(review): newer transformers releases deprecate their own AdamW in favour
# of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=9e-5, #3e-5
    eps=1e-8
)

from transformers import get_linear_schedule_with_warmup

epochs = 25
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (linear decay, no warm-up steps).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
# Label id -> label string lookup used when scoring the validation set.
tag_values = slot_names

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(device, n_gpu)

# The training loop moves every batch to `device`, so the model has to live
# there too; nothing else in the notebook moves it.
# NOTE(review): the optimizer was created before this move. Module.to()
# updates the parameters in place, so the optimizer keeps referencing them,
# but PyTorch recommends moving the model before building the optimizer.
model.to(device)

# get_device_name(0) raises on a machine without CUDA, so only query it when
# a GPU is actually present.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for batch in train_dataloader:
        # Move the batch to the training device first and cast there, so the
        # tensors are int64 without a round trip through CPU memory.
        b_input_ids, b_input_mask, b_labels = (t.to(device).long() for t in batch)

        # Always clear any previously calculated gradients before performing a
        # backward pass. model.zero_grad() covers every parameter, including
        # ones the optimizer does not own when only the head is trained.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss = 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device).long() for t in batch)

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Labels are provided, so the output holds the loss at index 0
            # and the logits at index 1.
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the loss and the per-token predictions for this batch.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))

    # Build one tag sequence per sentence (seqeval scores lists of sequences),
    # dropping every position whose gold label is the padding label. The
    # padding label is spelled "[PAD]" in `slot_names`; comparing against
    # "PAD" never matched, so padding used to be counted as correct tokens.
    pred_tags, valid_tags = [], []
    for pred_row, label_row in zip(predictions, true_labels):
        kept = [(tag_values[p_i], tag_values[l_i])
                for p_i, l_i in zip(pred_row, label_row)
                if tag_values[l_i] != "[PAD]"]
        pred_tags.append([pred for pred, _ in kept])
        valid_tags.append([gold for _, gold in kept])

    # seqeval's signature is (y_true, y_pred).
    print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
    print()


In [None]:
# Inspect the tensors of the last batch processed by the training cell,
# together with the device each one lives on.
print("\n * input_ids_tensor \n ")
print(b_input_ids)
print(b_input_ids.device)

# b_input_mask is the attention mask, not segment ids.
print("\n * attention_mask_tensor \n ")
print(b_input_mask)
print(b_input_mask.device)

# b_labels holds the label ids.
print("\n * labels_tensor \n ")
print(b_labels)
print(b_labels.device)

# There is no `self` at notebook level; the target device is the global `device`.
print("\n * device \n ")
print(device)


In [None]:
#Saving model
# Only the weights (state_dict) are written, not the model object itself, so
# loading requires constructing a model of the same architecture first.
torch.save(model.state_dict(), "NERwithTCv4.pt")

In [None]:
#Loading model
# Rebuild the same architecture that was trained (same checkpoint name and
# number of labels) so the saved weights fit.
model1 = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(slot_map),
    output_attentions = False,
    output_hidden_states = False
)

# Load the checkpoint this notebook writes ("NERwithTCv4.pt"); the previous
# name "NERwithTC.pt" is never produced here and fails on a fresh run.
# map_location lets weights saved from a GPU session load on a CPU-only host.
model1.load_state_dict(torch.load("NERwithTCv4.pt", map_location=device))
# Switch to inference mode; the trailing ';' hides the long module repr.
model1.eval();


In [None]:
def show_predictions(text, tokenizer, model, slot_names):
    """Print the predicted slot label for every word piece of ``text``.

    Args:
        text: sentence to tag.
        tokenizer: object exposing ``encode(text)`` and ``tokenize(text)``.
        model: token-classification model; ``model(inputs)[0]`` must hold the
            logits with shape ``(1, sequence_length, num_labels)``.
        slot_names: list mapping label id -> label string.

    Returns:
        None. The result is written to stdout.
    """
    # Run on whatever device the model's weights live on instead of relying on
    # a notebook-level `device` variable that may be missing or out of sync.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Shape (1, sequence_length): a batch holding the single sentence.
    inputs = torch.tensor(tokenizer.encode(text), dtype=torch.long)[None, :]
    inputs = inputs.to(model_device)

    with torch.no_grad():
        outputs = model(inputs)
    slot_logits = outputs[0].detach().cpu().numpy()
    # Drop the first and last position, which belong to the special tokens
    # added by encode(), so the ids line up with tokenize(text).
    slot_ids = slot_logits.argmax(axis=-1)[0, 1:-1]
    print("## Slots:")
    for token, slot_id in zip(tokenizer.tokenize(text), slot_ids):
        print(f"{token:>10} : {slot_names[slot_id]}")


In [None]:
# Interactive loop: tag every line the user types until they enter "Exit".
# The sentinel is checked before predicting so "Exit" itself is not tagged.
usertxt = ""
while True:
    usertxt = input()
    if usertxt == "Exit":
        break
    show_predictions(usertxt, tokenizer, model, slot_names)


In [None]:
def _store_slot(collected_slots, slot_type, slot_words):
    """Append a finished slot and keep the "attr" list in step with "funda"."""
    collected_slots.setdefault(slot_type, []).append(" ".join(slot_words))
    if slot_type == "funda":
        # Every fundamental needs a matching attribute entry; when none has
        # been collected for it, record the placeholder "NA".
        attrs = collected_slots.setdefault("attr", [])
        if len(collected_slots["funda"]) != len(attrs):
            attrs.append("NA")


def decode_predictions1(text, tokenizer, slot_names, slot_ids):
    """Group per-word-piece slot ids into named slots.

    Each word takes the label of its first word piece. A label is split into a
    position (first character, e.g. "B"/"I") and a type (everything from the
    third character on, e.g. "funda"). Consecutive words are merged into one
    slot when they carry the same label, or when an "I" label directly
    follows a "B" label of the same type. Labels shorter than three characters
    (such as "O") therefore end up under the empty-string type.

    Args:
        text: the sentence that was tagged (whitespace-separated words).
        tokenizer: object exposing ``tokenize(word) -> list`` of word pieces.
        slot_names: list mapping label id -> label string.
        slot_ids: sequence of predicted label ids, one per word piece of
            ``text`` (special tokens already removed).

    Returns:
        ``{"slots": {slot_type: [slot_text, ...]}}``. Whenever a "funda" slot
        is stored and the "attr" list is shorter than the "funda" list, "NA"
        is appended to "attr".
    """
    collected_slots = {}
    active_slot_words = []
    prev_slot_name = None  # label of the previous word; None before the first

    for word in text.split():
        n_tokens = len(tokenizer.tokenize(word))
        word_slot_ids = slot_ids[:n_tokens]
        slot_ids = slot_ids[n_tokens:]
        if len(word_slot_ids) == 0:
            # No prediction is available for this word (it produced no word
            # pieces, or the predictions ran out), so it cannot be labelled.
            continue

        slot_name = slot_names[word_slot_ids[0]]
        continues_slot = prev_slot_name is not None and (
            slot_name == prev_slot_name
            or (slot_name[:1] == "I"
                and prev_slot_name[:1] == "B"
                and slot_name[2:] == prev_slot_name[2:])
        )

        if continues_slot:
            active_slot_words.append(word)
        else:
            # Close the slot that was being built. On the very first word
            # there is nothing to close yet.
            if prev_slot_name is not None:
                _store_slot(collected_slots, prev_slot_name[2:], active_slot_words)
            active_slot_words = [word]
        prev_slot_name = slot_name

    # Close the slot that was still open when the sentence ended.
    if prev_slot_name is not None:
        _store_slot(collected_slots, prev_slot_name[2:], active_slot_words)

    return {"slots": collected_slots}


#def nlu(text, tokenizer, model, intent_names, slot_names):
def nlu(text, tokenizer, model, slot_names):
    """Tag ``text`` with ``model`` and return the decoded slots.

    Args:
        text: sentence to analyse.
        tokenizer: object exposing ``encode(text)`` and ``tokenize(word)``.
        model: token-classification model; ``model(inputs)[0]`` must hold the
            logits with shape ``(1, sequence_length, num_labels)``.
        slot_names: list mapping label id -> label string.

    Returns:
        The dictionary produced by ``decode_predictions1``.
    """
    # Send the inputs to the device holding the model's weights; feeding CPU
    # tensors to a model that lives on the GPU fails.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Shape (1, sequence_length): a batch holding the single sentence.
    inputs = torch.tensor(tokenizer.encode(text), dtype=torch.long)[None, :]
    inputs = inputs.to(model_device)
    with torch.no_grad():
        outputs = model(inputs)
    # Bring the logits back to the CPU before converting to numpy (a CUDA
    # tensor cannot be converted directly), then drop the first and last
    # position, which belong to the special tokens added by encode().
    slot_logits = outputs[0].detach().cpu().numpy()
    slot_ids = slot_logits.argmax(axis=-1)[0, 1:-1]

    return decode_predictions1(text, tokenizer, slot_names, slot_ids)



In [None]:
# Interactive loop: decode every line the user types (lower-cased, because the
# tokenizer checkpoint is uncased) until they enter "exit". The sentinel is
# checked before decoding so "exit" itself is not sent to the model.
usertxt = ""
while True:
    usertxt = input().lower()
    if usertxt == "exit":
        break
    #print(nlu(usertxt, tokenizer, model1, intent_names, slot_names))
    print(nlu(usertxt, tokenizer, model, slot_names))


In [None]:
# `words` exists only as a local variable inside decode_predictions1, so
# evaluating it at notebook level raises NameError on a fresh kernel.
# words