# Instruction tuning (Finetuning) using custom dataset on GPT2

## 00. Set up packages and import all required settings

In [None]:
import os
import getpass
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
#os.environ["CUDA_VISIBLE_DEVICES"]= "1"  # Set the GPU 1 to use
#os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass.getpass("Token:")
#assert os.environ["HUGGING_FACE_HUB_TOKEN"]

In [None]:
!pip install transformers datasets accelerate -qqq

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM # GPT2TokenizerFast, GPT2LMHeadModel
from datasets import load_dataset

## 01. Data Load

In [None]:
# Pull only the first 5000 records of the train split (slice syntax in `split`).
train_dataset = load_dataset(
    "Aeala/ShareGPT_Vicuna_unfiltered",
    split="train[:5000]",
)

In [None]:
# Inspect the conversation turns of the first training record.
first_record = train_dataset[0]
first_record["conversations"]

## 02. Load Tokenizer Setup

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse the end-of-sequence token as the padding token so that the
# padding="max_length" tokenization step below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

## 03. Preprocessing (Tokenization and Preprocessing for Causal Language Modeling)

### 03-1. Tokenize all samples

In [None]:
def tokenizer_fuction(samples):
  """Flatten one conversation into a single string and tokenize it.

  Every turn in ``samples["conversations"]`` becomes one line of text:
  turns whose ``"from"`` field equals ``"human"`` are prefixed with
  ``"USER: "``, every other turn with ``"CHATBOT: "``. The concatenated
  text is handed to the module-level ``tokenizer``, padded and truncated
  to a fixed length of 256 tokens.

  Args:
    samples: A mapping with a ``"conversations"`` list whose items carry
      ``"from"`` and ``"value"`` keys.

  Returns:
    Whatever the module-level ``tokenizer`` returns for the rendered text.
  """
  rendered_turns = []
  for turn in samples["conversations"]:
    speaker = "USER: " if turn["from"] == "human" else "CHATBOT: "
    rendered_turns.append(speaker + turn["value"] + " \n")
  text = "".join(rendered_turns)

  return tokenizer(text, padding="max_length", truncation=True, max_length=256)

In [None]:
# Tokenize every record and drop the raw columns, so only the tokenizer's
# outputs remain in the dataset handed to the Trainer.
columns_to_drop = ["conversations", "id"]
tokenized_dataset = train_dataset.map(tokenizer_fuction, remove_columns=columns_to_drop)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

### 03-2. Data Preparation for Causal Language Modeling (next token prediction)

In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False selects the causal (next-token) objective instead of masked LM.
# NOTE(review): pad_token is set to EOS above, so the collator presumably
# ignores EOS positions in the labels as well as padding - confirm.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

## 04. Load the Pretrained Model and Generate Text Before Finetuning

In [None]:
# Start from the pretrained "gpt2" checkpoint; this object is finetuned in place below.
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
def gen_function(prompt, model, tokenizer, max_length=100, temperature=0.7):
    """Sample a continuation of ``prompt`` from ``model`` and return it as text.

    Args:
        prompt: Text the model should continue.
        model: Causal language model exposing ``to``, ``eval`` and ``generate``.
            As a side effect it is moved to the selected device and switched
            to evaluation mode.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors="pt"`` and used to decode the result.
        max_length: ``max_length`` value forwarded to ``model.generate``.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens stripped.
    """
    # Local import keeps this cell self-contained; torch is only needed here
    # to find out whether a GPU is available.
    import torch

    # 1) Pick the device: use the GPU when present, otherwise fall back to CPU
    #    instead of crashing on a hard-coded "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    # Training leaves the model in train mode; switch to eval so dropout does
    # not add noise on top of the sampling itself.
    model.eval()

    # 2) Tokenize. Calling the tokenizer (rather than .encode) also yields the
    #    attention mask, which generate() warns about when it is missing.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # 3) Generate. Pass pad_token_id explicitly (falling back to EOS) so
    #    generate() does not have to guess it.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    sample_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        max_length=max_length,
        temperature=temperature,
        pad_token_id=pad_token_id,
    )

    # 4) Decode the first (and only) returned sequence back to text.
    return tokenizer.decode(sample_outputs[0], skip_special_tokens=True)


In [None]:
# Baseline sample from the pretrained weights, taken before any finetuning.
output = gen_function(
    prompt="Could you give me some examples of Numpy array?",
    model=model,
    tokenizer=tokenizer,
)
print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Could you give me some examples of Numpy array?

It is the case that if you have a collection of elements, it is possible to store them in a Numpy array.

The following example illustrates how to use an array of elements to store a number of numbers in an array.

import numpy as np as np from numpy.collection import Numpy as np from numpy.random import shuffle from numpy.dict import Dict as Dict from


## 05. Train with Trainer and TrainingArguments

In [None]:
from transformers import Trainer, TrainingArguments

# One epoch at batch size 8 per device. A checkpoint is written every
# `save_steps` steps and `save_total_limit` caps how many are kept.
# NOTE(review): the recorded run below finishes at step 625, so the
# 1000-step checkpoint interval is presumably never reached - confirm.
training_args = TrainingArguments(
    output_dir="./gpt2_instruction_tuning",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    save_total_limit=2,
    save_steps=1000,
)

In [None]:
# Wire the model, the training configuration, the tokenized dataset and the
# causal-LM collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=collator,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Run the finetuning loop and display the summary object it returns.
train_output = trainer.train()
train_output

Step,Training Loss
500,2.4525


TrainOutput(global_step=625, training_loss=2.439342578125, metrics={'train_runtime': 94.1489, 'train_samples_per_second': 53.107, 'train_steps_per_second': 6.638, 'total_flos': 653230080000000.0, 'train_loss': 2.439342578125, 'epoch': 1.0})

In [None]:
# Same prompt as before training, now sampled from the finetuned weights.
output = gen_function(
    prompt="Could you give me some examples of Numpy array?",
    model=model,
    tokenizer=tokenizer,
)
print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Could you give me some example of Numpy array? 
CHATBOT: Sure, here's an example of a Numpy array:
```python
import numpy as np
import pandas as pd
import numpy as np
import re
import re
from re import numpy.algorithm
from numpy.globals import load_data

# Load Data
data = load_data.text()
data = np.array([
