# NLP Experiment 14 May
The BERT-style QA fine-tuning code is adapted from https://colab.research.google.com/drive/14_iltRAkpPRpuajTMqm8QI1OkceUbDk_

In [1]:
!nvidia-smi

Tue May 14 05:33:36 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Install Dependencies

In [2]:
! pip install torch datasets accelerate

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-c

## Data Preprocessing

In [3]:
import torch
import pandas as pd
import json

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
file_path = "/content/nlp.jsonl"
df = pd.read_json(path_or_buf=file_path, lines=True)
df

Unnamed: 0,key,transcript,tool,heading,target
0,0,"Turret, prepare to deploy electromagnetic puls...",electromagnetic pulse,65,grey and white fighter jet
1,1,Engage yellow drone with surface-to-air missil...,surface-to-air missiles,235,yellow drone
2,2,"Control to turrets, deploy electromagnetic pul...",electromagnetic pulse,110,blue and red fighter plane
3,3,"Alfa, Echo, Mike Papa, deploy EMP tool heading...",EMP,85,"purple, red, and silver fighter jet"
4,4,"Engage the grey, black, and green fighter plan...",machine gun,95,"grey, black, and green fighter plane"
...,...,...,...,...,...
3495,3495,Deploy electromagnetic pulse on brown commerci...,electromagnetic pulse,350,brown commercial aircraft
3496,3496,"Deploy surface-to-air missiles, heading two on...",surface-to-air missiles,215,"silver, orange, and brown helicopter"
3497,3497,"Engage target, grey, orange, and silver missil...",surface-to-air missiles,80,"grey, orange, and silver missile"
3498,3498,Engage the white drone at heading zero five fi...,machine gun,55,white drone


In [6]:
import re

def sent_tokenize(transcript):
    """Split a transcript into fragments at every '.' and ','.

    The separators are dropped; fragments keep their surrounding whitespace,
    and a trailing separator yields a final empty string.
    """
    separator = re.compile('[.,]')
    return separator.split(transcript)

def word_tokenize(sent):
    """Split a sentence on runs of whitespace; punctuation stays attached to its word."""
    words = sent.split()
    return words

def strict_word_tokenize(sent):
    """Extract word tokens together with their character spans.

    A token is a maximal run of regex word characters, so punctuation and
    whitespace never appear in a token.

    Returns a pair (words, indexes): ``words`` is the list of token strings and
    ``indexes`` the parallel list of (start, end) offsets into ``sent``, with
    ``end`` exclusive.
    """
    words, indexes = [], []
    for match in re.finditer(r'[\w]+', sent):
        words.append(match.group(0))
        indexes.append((match.start(), match.end()))
    return words, indexes

In [7]:
strict_word_tokenize("Hello world, yay, nay.")

(['Hello', 'world', 'yay', 'nay'], [(0, 5), (6, 11), (13, 16), (18, 21)])

## Heading

In [8]:
# Spoken digit names in value order: each word's list position is its value.
digits = ['zero','one','two','three','four','five','six','seven','eight','nine']

# Lookup table from a spoken digit word to its integer value.
word2digit = dict(zip(digits, range(len(digits))))
# 'niner' is accepted as a second spelling that also maps to 9.
word2digit['niner'] = 9

def words2digit(words):
    '''
    Turn three spoken digit words into the integer they spell out.
    words:list(string) -- only the first three entries are read
    return:int

    Raises KeyError if one of the three words is not in word2digit and
    IndexError if fewer than three words are given.
    '''
    hundreds, tens, ones = (word2digit[words[pos]] for pos in range(3))
    return 100*hundreds + 10*tens + ones


def findHeading(sent):
    '''
    Locate the spoken heading in a transcript.

    The transcript is lower-cased, split into word tokens with their character
    spans, and handed to sliding_window_heading, whose result is returned
    unchanged.

    Planned strategy (only step 1 is implemented here):
    1. Use the hard-coded rule whenever it works, since it is the cheapest.
    2. Otherwise preprocess the data and try again.
    3. Last resort: pass the text (possibly only the relevant parts, to save
       time) to an LLM.
    '''
    lowered = sent.lower()
    tokens, spans = strict_word_tokenize(lowered)
    return sliding_window_heading(tokens, spans)


def sliding_window_heading(words,indexes):
    '''
    Find the first run of three consecutive digit words and decode it.

    words:list(string) -- tokens of the transcript
    indexes:list((int, int)) -- (start, end) character span of each token
    return:(int, int) -- (character offset where the heading starts, heading
        value); (-1, -1) when no three-word window decodes, including when
        fewer than three words are given.

    The not-found result is a pair so callers that unpack two values do not
    crash on it.
    '''
    for i in range(len(words)-2):
        try:
            return indexes[i][0], words2digit(words[i:i+3])
        except KeyError:
            # This window contains a word that is not a digit name; slide on.
            # Only KeyError is caught so that unrelated failures stay visible.
            continue
    return -1, -1 #TODO: Use BERT QA

def applyHeading():
    """Run findHeading on every transcript of the global ``df`` and store the results.

    Adds (or overwrites) two columns on ``df`` in place: 'heading_pred' with
    the decoded heading and 'heading_start' with its start offset.
    """
    located = [findHeading(row['transcript']) for _, row in df.iterrows()]
    df['heading_pred'] = [heading for _, heading in located]
    df['heading_start'] = [start for start, _ in located]

In [9]:
applyHeading()

In [10]:
sum(df['heading'] == df['heading_pred'])/len(df)

1.0

In [11]:
findHeading(df['transcript'][399]),df['transcript'][399]

((15, 180), 'Set heading to One Eight Zero. Deploy EMP on brown missile.')

In [12]:
df

Unnamed: 0,key,transcript,tool,heading,target,heading_pred,heading_start
0,0,"Turret, prepare to deploy electromagnetic puls...",electromagnetic pulse,65,grey and white fighter jet,65,57
1,1,Engage yellow drone with surface-to-air missil...,surface-to-air missiles,235,yellow drone,235,58
2,2,"Control to turrets, deploy electromagnetic pul...",electromagnetic pulse,110,blue and red fighter plane,110,58
3,3,"Alfa, Echo, Mike Papa, deploy EMP tool heading...",EMP,85,"purple, red, and silver fighter jet",85,47
4,4,"Engage the grey, black, and green fighter plan...",machine gun,95,"grey, black, and green fighter plane",95,78
...,...,...,...,...,...,...,...
3495,3495,Deploy electromagnetic pulse on brown commerci...,electromagnetic pulse,350,brown commercial aircraft,350,69
3496,3496,"Deploy surface-to-air missiles, heading two on...",surface-to-air missiles,215,"silver, orange, and brown helicopter",215,40
3497,3497,"Engage target, grey, orange, and silver missil...",surface-to-air missiles,80,"grey, orange, and silver missile",80,87
3498,3498,Engage the white drone at heading zero five fi...,machine gun,55,white drone,55,34


## Tool

In [13]:
tools = df.tool.unique()
tools

array(['electromagnetic pulse', 'surface-to-air missiles', 'EMP',
       'machine gun', 'anti-air artillery', 'interceptor jets',
       'drone catcher'], dtype=object)

In [14]:
def findTool(sent, candidates=None):
    '''
    Use sliding window to find tool

    Scans ``sent`` left to right and returns the first position at which any
    candidate tool name occurs, compared case-insensitively. If several
    candidates match at the same position, the one listed first wins.

    sent:string -- transcript to search
    candidates:iterable(string) or None -- tool names to look for; defaults to
        the module-level ``tools``
    return:(int, string) -- (start offset, tool name exactly as listed in
        ``candidates``); (-1, "None") when nothing matches. The not-found
        result is a pair so callers that unpack ``idx, tool`` do not crash.
    '''
    if candidates is None:
        candidates = tools
    # Lower-case each candidate once rather than at every position.
    lowered = [(tool, tool.lower()) for tool in candidates]
    for idx in range(len(sent)):
        for tool, tool_lc in lowered:
            end = idx + len(tool)
            if end <= len(sent) and sent[idx:end].lower() == tool_lc:
                return idx, tool
    return -1, "None"

def applyTool():
    """Run findTool on every transcript of the global ``df`` and store the results.

    Adds (or overwrites) two columns on ``df`` in place: 'tool_pred' with the
    matched tool name and 'tool_start' with its start offset.
    """
    located = [findTool(row['transcript']) for _, row in df.iterrows()]
    df['tool_pred'] = [tool for _, tool in located]
    df['tool_start'] = [idx for idx, _ in located]

In [15]:
applyTool()

In [16]:
sum(df['tool'] == df['tool_pred'])/len(df)

0.9997142857142857

In [17]:
df[df['tool_pred'] != df['tool']]

Unnamed: 0,key,transcript,tool,heading,target,heading_pred,heading_start,tool_pred,tool_start
2739,2739,"Control here, deploy an electromagnetic pulse ...",EMP,345,green fighter jet,345,70,electromagnetic pulse,24


In [18]:
df['transcript'][2739], findTool(df['transcript'][2739])

('Control here, deploy an electromagnetic pulse (EMP) in the heading of three four five to neutralize the green fighter jet. Over.',
 (24, 'electromagnetic pulse'))

## Target

In [19]:
df['target'][550:600]

550                      blue, grey, and white drone
551                       yellow commercial aircraft
552       brown, orange, and red commercial aircraft
553                              blue cargo aircraft
554                                brown fighter jet
555                         black camouflage missile
556                   brown, white, and yellow drone
557                                        red drone
558                   grey and purple cargo aircraft
559                  brown, purple, and yellow drone
560             brown, white, and red light aircraft
561                        black commercial aircraft
562                        green commercial aircraft
563       purple and black camouflage cargo aircraft
564                             white light aircraft
565                     black and yellow fighter jet
566    purple, white, and orange commercial aircraft
567                       orange commercial aircraft
568                                 grey fight

In [20]:
def extract_target_features(dataframe):
    '''
    Collect the vocabulary used by the 'target' column.

    Each target string is split into word tokens. The last token is recorded
    as a target noun; every earlier token except the connective 'and' is
    recorded as an attribute.

    dataframe: object whose 'target' entry iterates over target strings
    return:(set(string), set(string)) -- (targets, attributes)

    Targets that contain no word token are skipped instead of raising
    IndexError.
    '''
    targets, attributes = set(), set()
    for target in dataframe['target']:
        tokens, _ = strict_word_tokenize(target)
        if not tokens:
            continue
        *modifiers, head = tokens
        targets.add(head)
        # Filling the set directly avoids the quadratic sum(list_of_lists, []) flatten.
        attributes.update(tok for tok in modifiers if tok != 'and')
    return targets, attributes

In [21]:
targets, attributes = extract_target_features(df)

targets, attributes

({'aircraft', 'drone', 'helicopter', 'jet', 'missile', 'plane'},
 {'black',
  'blue',
  'brown',
  'camouflage',
  'cargo',
  'commercial',
  'fighter',
  'green',
  'grey',
  'light',
  'orange',
  'purple',
  'red',
  'silver',
  'white',
  'yellow'})

In [22]:
def findTarget(sent):
    '''
    Extract the target description from a transcript.

    The transcript is lower-cased and tokenized. The span starts at the first
    token found in the module-level ``attributes`` and ends at the first later
    token that is in ``targets``, or that is a target followed by one extra
    character (e.g. a plural such as "jets").

    sent:string -- transcript to search
    return:(int, string) -- (start offset, lower-cased target text);
        (-1, "") when no complete attribute-to-target span is found.

    Differences from the earlier version: the not-found case no longer indexes
    the sentence with a negative offset (which raised IndexError on very short
    input and otherwise returned an arbitrary slice), and a trailing 's' is
    removed only when the token matched through its one-character-shorter
    form, never from an exact target match.
    '''
    sent = sent.lower()
    tokens, indexes = strict_word_tokenize(sent)
    start = -1
    for token, (tok_start, tok_end) in zip(tokens, indexes):
        if start == -1:
            # Still looking for the first attribute word; target nouns seen
            # before it are ignored.
            if token in attributes:
                start = tok_start
            continue
        if token in targets:
            return start, sent[start:tok_end]
        if token[:-1] in targets:
            # Target plus one extra character: drop that character only if it is an 's'.
            end = tok_end - 1 if token.endswith('s') else tok_end
            return start, sent[start:end]
    return -1, ""

def applyFindTarget():
    """Run findTarget on every transcript of the global ``df`` and store the results.

    Adds (or overwrites) two columns on ``df`` in place: 'target_pred' with
    the extracted target text and 'target_start' with its start offset.
    """
    located = [findTarget(row['transcript']) for _, row in df.iterrows()]
    df['target_pred'] = [target for _, target in located]
    df['target_start'] = [st for st, _ in located]

In [23]:
df['transcript'][3171],findTarget(df['transcript'][3171])

('Turret, redirect to heading one eight five. Engage target, red and black fighter jet. Deploy interceptor jets. Commence strike sequence.',
 (59, 'red and black fighter jet'))

In [24]:
applyFindTarget()

sum(df['target']==df['target_pred'])/len(df)

1.0

## Target (Bert QA)

This is a last resort, used only if the rule-based methods above fail.

In [25]:
!rm -r /content/bert-base-uncased

rm: cannot remove '/content/bert-base-uncased': No such file or directory


In [26]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, DefaultDataCollator, TrainingArguments, Trainer
import torch

model_name = "deepset/roberta-base-squad2"
print(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

deepset/roberta-base-squad2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [27]:
context = df['transcript'][0]
question = "What is the tool to deploy?"

In [28]:
inputs = tokenizer(question, context, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# Find the tokens with the highest `start` and `end` scores
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1

print(answer_start, answer_end)

# Convert tokens to answer string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0, answer_start:answer_end]))
print("Answer:", answer)

tensor(16) tensor(18)
Answer:  electromagnetic pulse


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The fixed random_state gives the
# same split on every run, so metrics and row lookups into valid_df are
# reproducible.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

train_df.head()

Unnamed: 0,key,transcript,tool,heading,target,heading_pred,heading_start,tool_pred,tool_start,target_pred,target_start
0,3100,Control here. Prepare to deploy EMP on purple ...,EMP,315,purple missile,315,65,EMP,32,purple missile,39
1,1058,"Turret, engage. Heading zero three five, targe...",anti-air artillery,35,green cargo aircraft,35,24,anti-air artillery,77,green cargo aircraft,48
2,238,"Control tower to air defense turrets, engage s...",surface-to-air missiles,220,grey missile,220,93,surface-to-air missiles,45,grey missile,72
3,1273,"Control to turrets, be advised, deploy machine...",machine gun,230,"green, grey, and blue drone",230,104,machine gun,39,"green, grey, and blue drone",65
4,1187,Engage the grey and blue helicopter heading tw...,anti-air artillery,285,grey and blue helicopter,285,44,anti-air artillery,64,grey and blue helicopter,11


In [30]:
def _align_answer_start(context, text, start):
    """Return an offset at which ``text`` actually occurs in ``context``.

    The precomputed ``start`` is kept when the context matches ``text`` there
    (case-insensitively). Otherwise the first case-insensitive occurrence of
    ``text`` is used; if there is none, ``start`` is returned unchanged.
    """
    needle = text.lower()
    if start >= 0 and context[start:start + len(text)].lower() == needle:
        return start
    relocated = context.lower().find(needle)
    return relocated if relocated != -1 else start


def generate_dataset(dataframe):
    """Build SQuAD-style QA examples (tool and target questions) from labelled rows.

    dataframe: DataFrame with columns 'key', 'transcript', 'tool',
        'tool_start', 'target' and 'target_start'.
    return: dict of parallel lists 'id', 'context', 'question', 'answers'.
        Each row contributes two examples, the tool question then the target
        question, and both carry the row's 'key' as 'id'. Each 'answers' entry
        is {'text': [str], 'answer_start': [int]}.

    The '*_start' columns hold the offset of the predicted span. When the
    ground-truth text is not what sits at that offset (e.g. label 'EMP' while
    the rule matched 'electromagnetic pulse' earlier in the sentence), the
    offset is re-aligned to where the ground-truth text occurs, so the
    supervised span covers the answer text.
    """
    # (question, ground-truth text column, precomputed start-offset column).
    # "What is the heading?" is deliberately not generated, as before. The
    # numeric heading looks unusable as a literal span anyway, since the
    # transcripts spell it out in words -- TODO confirm before enabling it.
    tasks = [
        ("What is the tool to deploy?", 'tool', 'tool_start'),
        ("What is the target to engage?", 'target', 'target_start'),
    ]
    data = {
        'id': [],
        "context": [],
        'question': [],
        'answers': []
    }

    for _, row in dataframe.iterrows():
        context = row['transcript']
        for q, text_col, start_col in tasks:
            text = str(row[text_col])
            start = _align_answer_start(context, text, row[start_col])
            data['id'].append(row['key'])
            data['context'].append(context)
            data['question'].append(q)
            data['answers'].append({'text': [text], 'answer_start': [start]})

    return data



In [31]:
from datasets import DatasetDict, Dataset

question = "What is the target to engage?"

# Step 1: Define your custom data
data_dict = {
    'train': generate_dataset(train_df),
    'test': generate_dataset(valid_df)
}

# Step 2: Create Dataset objects for each split
dataset = DatasetDict({
    split: Dataset.from_dict(data_dict[split]) for split in data_dict
})

In [32]:
def preprocess_function(examples):
    """Tokenize a batch of QA examples and attach token-level answer labels.

    examples: batch with parallel "question", "context" and "answers" entries;
        each answer is read as {"text": [str], "answer_start": [int]} and only
        the first element of each list is used.
    return: the tokenizer output (offset mapping removed) with two added
        lists, "start_positions" and "end_positions", holding one token index
        per example.

    Uses the module-level ``tokenizer``. Every question/context pair is padded
    to 128 tokens; if a pair is too long only the context is truncated.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are needed only to compute labels here, so they are taken out of
    # the returned inputs.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        # Character span of the answer; the end is exclusive.
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token whose sequence id is 1, i.e. tokens of
        # the second input (the context).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # The answer lies entirely outside the tokenized context: label the
        # example with position 0 for both start and end (presumably the
        # leading special token -- confirm for this tokenizer).
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Start label: last context token whose start offset is <= start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # End label: walking back from the end of the context, the earliest
            # token whose end offset is still >= end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_text = dataset.map(preprocess_function, batched=True)
data_collator = DefaultDataCollator()

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [33]:

training_args = TrainingArguments(
    output_dir="roberta-base-squad2",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,5.9e-05
2,0.011100,7e-06
3,0.002200,2.1e-05
4,0.002200,3e-06
5,0.000400,8e-06
6,0.000400,2e-06
7,0.000400,2e-06
8,0.000000,2e-06
9,0.000100,2e-06
10,0.000000,2e-06


TrainOutput(global_step=3500, training_loss=0.0020147876827312367, metrics={'train_runtime': 1475.1842, 'train_samples_per_second': 37.961, 'train_steps_per_second': 2.373, 'total_flos': 3658154594304000.0, 'train_loss': 0.0020147876827312367, 'epoch': 10.0})

In [34]:
context = valid_df['transcript'][365]
print(context)
# Ask with the exact prompt used to build the fine-tuning data
# ("What is the tool to deploy?"), so this spot check exercises the trained
# question rather than a paraphrase.
question = "What is the tool to deploy?"

# Encode the question/context pair as one model input and keep the token ids
# so the predicted span can be decoded back into text.
encoded = tokenizer(question, context, return_tensors="pt").to(device)
input_ids = encoded["input_ids"][0].tolist()

# Put the model on the same device as the inputs explicitly (harmless if it is
# already there) so this cell does not depend on an earlier cell having moved it.
model.to(device)
model.eval()
with torch.no_grad():
    outputs = model(**encoded)

# Most likely start token and (exclusive) end token, as plain Python ints.
answer_start = torch.argmax(outputs.start_logits).item()
answer_end = torch.argmax(outputs.end_logits).item() + 1

# Convert tokens to answer string; strip the leading space the tokenizer
# attaches to the first answer token.
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])).strip()
print("Improved Answer:", answer)

Turret Bravo, deploy anti-air artillery targeting the yellow and purple helicopter on heading three one zero.
Improved Answer:  anti-air artillery


In [35]:
metrics = trainer.evaluate()
metrics

{'eval_loss': 1.507467686678865e-06,
 'eval_runtime': 9.5201,
 'eval_samples_per_second': 147.057,
 'eval_steps_per_second': 9.244,
 'epoch': 10.0}

In [36]:
output_dir = f"/content/roberta-base-squad2"

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/roberta-base-squad2/tokenizer_config.json',
 '/content/roberta-base-squad2/special_tokens_map.json',
 '/content/roberta-base-squad2/vocab.json',
 '/content/roberta-base-squad2/merges.txt',
 '/content/roberta-base-squad2/added_tokens.json',
 '/content/roberta-base-squad2/tokenizer.json')

In [37]:
!zip -r /content/roberta-base-squad2.zip /content/roberta-base-squad2

  adding: content/roberta-base-squad2/ (stored 0%)
  adding: content/roberta-base-squad2/special_tokens_map.json (deflated 85%)
  adding: content/roberta-base-squad2/checkpoint-2500/ (stored 0%)
  adding: content/roberta-base-squad2/checkpoint-2500/special_tokens_map.json (deflated 85%)
  adding: content/roberta-base-squad2/checkpoint-2500/optimizer.pt (deflated 36%)
  adding: content/roberta-base-squad2/checkpoint-2500/training_args.bin (deflated 51%)
  adding: content/roberta-base-squad2/checkpoint-2500/model.safetensors (deflated 8%)
  adding: content/roberta-base-squad2/checkpoint-2500/config.json (deflated 49%)
  adding: content/roberta-base-squad2/checkpoint-2500/vocab.json (deflated 59%)
  adding: content/roberta-base-squad2/checkpoint-2500/trainer_state.json (deflated 74%)
  adding: content/roberta-base-squad2/checkpoint-2500/merges.txt (deflated 53%)
  adding: content/roberta-base-squad2/checkpoint-2500/rng_state.pth (deflated 25%)
  adding: content/roberta-base-squad2/checkpo