<a href="https://colab.research.google.com/github/joshIsac/LargeLanguageModel/blob/main/2348523_LLM__Transfer_learning_Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install accelerate -U

Successfully installed accelerate-0.30.0 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.1.105


In [26]:
from datasets import load_dataset

In [27]:
# Load only the first 500 rows of the SQuAD training split to keep the lab fast.
squad = load_dataset(
    "squad",
    split="train[:500]",
)

In [28]:
# Show the dataset summary (feature names and number of rows).
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 500
})

In [29]:
# Distinct article titles present in the slice, shown next to the dataset summary.
(set(squad["title"]), squad)

({'Beyoncé', 'University_of_Notre_Dame'},
 Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 500
 }))

In [30]:
# Inspect one raw example (row 15): id, title, context, question and answers.
squad[15]

{'id': '5733a6424776f41900660f51',
 'title': 'University_of_Notre_Dame',
 'context': 'The College of Engineering was established in 1920, however, early courses in civil and mechanical engineering were a part of the College of Science since the 1870s. Today the college, housed in the Fitzpatrick, Cushing, and Stinson-Remick Halls of Engineering, includes five departments of study – aerospace and mechanical engineering, chemical and biomolecular engineering, civil engineering and geological sciences, computer science and engineering, and electrical engineering – with eight B.S. degrees offered. Additionally, the college offers five-year dual degree programs with the Colleges of Arts and Letters and of Business awarding additional B.A. and Master of Business Administration (MBA) degrees, respectively.',
 'question': 'How many BS level degrees are offered in the College of Engineering at Notre Dame?',
 'answers': {'text': ['eight'], 'answer_start': [487]}}

In [31]:
import transformers
from transformers import AutoModelForQuestionAnswering,AutoTokenizer
import pandas
import torch


In [32]:
from transformers import AutoTokenizer
# Tokenizer matching the BERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    force_download=False,
)

In [34]:
# Hold out 20% of the examples for evaluation. The split shuffles the data, so
# fix the seed to get the same train/test partition on every run.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [35]:

def preprocess_function(examples):
  """Tokenize a batch of SQuAD examples and label each answer's token span.

  Every question/context pair is encoded with the module-level `tokenizer`,
  padded to 384 tokens, truncating only the context when the pair is too
  long. The character-level answer span is then converted to token indices.

  Args:
    examples: A batch (dict of lists) with "question", "context" and
      "answers" columns; each answer holds "text" and "answer_start" lists.

  Returns:
    The tokenizer output (without "offset_mapping") extended with
    "start_positions" and "end_positions". Examples whose answer is missing
    or not fully contained in the (possibly truncated) context are labelled
    (0, 0).
  """
  questions = [q.strip() for q in examples["question"]]
  inputs = tokenizer(
      questions,
      examples["context"],
      max_length=384,
      truncation="only_second",
      return_offsets_mapping=True,
      padding="max_length",
  )

  # The offsets are only needed to build the labels; the model must not see them.
  offset_mapping = inputs.pop("offset_mapping")
  answers = examples["answers"]
  start_positions = []
  end_positions = []

  for i, offset in enumerate(offset_mapping):
    answer = answers[i]

    # Examples without an answer have nothing to locate: label them (0, 0)
    # instead of failing on the empty lists.
    if not answer["answer_start"] or not answer["text"]:
      start_positions.append(0)
      end_positions.append(0)
      continue

    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context (tokens whose sequence id is 1)
    idx = 0
    while sequence_ids[idx] != 1:
      idx += 1
    context_start = idx
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
      idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0).
    # Both ends are checked so that an answer cut in half by truncation is
    # rejected too, rather than getting an end label past the context.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
      start_positions.append(0)
      end_positions.append(0)
    else:
      # Otherwise it's the start and end token positions:
      # last context token that starts at or before the answer start ...
      idx = context_start
      while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
      start_positions.append(idx - 1)

      # ... and first context token that ends at or after the answer end.
      idx = context_end
      while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
      end_positions.append(idx + 1)

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
  return inputs

In [36]:
# Tokenize and label both splits; batched mode hands lists of examples to
# preprocess_function.
tokenized_inputs = squad.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Show the tokenized dataset produced by the map call.
tokenized_inputs

In [38]:
from transformers import DefaultDataCollator
# Collator handed to the Trainer to assemble batches. preprocess_function
# already pads every example to max_length, so no extra padding is required.
data_collator = DefaultDataCollator()

In [39]:
# Keep the checkpoint name in its own constant so that `model` always refers
# to the model object (it used to be bound to a string first, then rebound).
MODEL_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# The question-answering head (qa_outputs) is newly initialized and is trained below.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
import inspect

from transformers import Trainer, TrainingArguments

# transformers renamed `evaluation_strategy` to `eval_strategy`, and the
# Trainer's `tokenizer` argument to `processing_class`. The install cell does
# not pin a version, so pick whichever keyword the installed release accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# NOTE(review): learning_rate=2e-4 is well above the 2e-5 to 5e-5 range
# commonly used for BERT fine-tuning, and 8 epochs on 400 examples is a lot;
# consider lowering both if the validation loss starts climbing early.
training_args = TrainingArguments(
    output_dir="fine_tuned",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    logging_steps=1,
    weight_decay=0.01,
    push_to_hub=False,
    **{_eval_strategy_key: "epoch"},  # evaluate on the held-out split after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_inputs["train"],
    eval_dataset=tokenized_inputs["test"],
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

In [41]:
# Run fine-tuning; the held-out split is evaluated at the end of every epoch.
# NOTE(review): in the recorded run the validation loss bottoms out at epoch 2
# and rises afterwards while the training loss keeps falling, i.e. the model
# overfits. Consider fewer epochs or a lower learning rate.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.9962,2.791179
2,1.8147,2.16957
3,0.9221,2.558807
4,0.2259,2.760242
5,0.2677,2.881892
6,0.3883,3.02956
7,0.2587,3.554065
8,0.023,3.362982


TrainOutput(global_step=200, training_loss=1.0214060619706289, metrics={'train_runtime': 277.2905, 'train_samples_per_second': 11.54, 'train_steps_per_second': 0.721, 'total_flos': 627112216166400.0, 'train_loss': 1.0214060619706289, 'epoch': 8.0})

In [42]:
import torch
def ask_a_question(qa):
  """Answer a question about a context using the module-level `model`.

  Args:
    qa: Mapping with a 'question' string and a 'context' string.

  Returns:
    The decoded text of the predicted answer span. The start and end tokens
    are predicted independently, so the result is an empty string when the
    predicted end token comes before the predicted start token.
  """
  # Put the inputs on the device the model actually lives on, rather than
  # guessing from CUDA availability (the two can disagree).
  device = next(model.parameters()).device
  inputs = tokenizer(
      qa['question'],
      qa['context'],
      add_special_tokens=True,
      truncation="only_second",  # cut over-long contexts, never the question
      return_tensors="pt",
  ).to(device)

  # Inference only: disable dropout and gradient tracking, then restore the
  # model's previous train/eval mode so later training is unaffected.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      outputs = model(**inputs)
  finally:
    model.train(was_training)

  start_index = int(torch.argmax(outputs['start_logits']))
  end_index = int(torch.argmax(outputs['end_logits'])) + 1  # make the slice end inclusive
  answer_ids = inputs["input_ids"][0][start_index:end_index].tolist()
  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
  return answer


In [44]:
# Smoke-test the fine-tuned model on a single question/context pair.
# NOTE(review): this is the same question and context as squad[15] shown
# earlier, so it may have been part of the training split; a correct answer
# here does not demonstrate generalization to unseen text.
QA={
    'question': 'How many BS level degrees are offered in the College of Engineering at Notre Dame?',
    'context': "The College of Engineering was established in 1920, however, early courses in civil and mechanical engineering were a part of the College of Science since the 1870s. Today the college, housed in the Fitzpatrick, Cushing, and Stinson-Remick Halls of Engineering, includes five departments of study – aerospace and mechanical engineering, chemical and biomolecular engineering, civil engineering and geological sciences, computer science and engineering, and electrical engineering – with eight B.S. degrees offered. Additionally, the college offers five-year dual degree programs with the Colleges of Arts and Letters and of Business awarding additional B.A. and Master of Business Administration (MBA) degrees, respectively."
}
print(ask_a_question(QA))

eight
