In [None]:
!pip install torch==2.2.1
!pip install accelerate
!pip install -q -U bitsandbytes
!pip install --upgrade -q -U transformers
!pip install -q -U xformers
!pip install -q -U peft
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U einops

In [None]:
import os

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer

In [None]:
# Show full sentences in rendered frames instead of truncating them.
pd.set_option('display.max_colwidth', None)

# Only the first 1000 rows of the 'Sentence' column are used downstream, so
# ask the parser for exactly that instead of loading the whole file and
# slicing it afterwards.
df = pd.read_csv('/content/TalkFile_ner_2.csv', usecols=['Sentence'], nrows=1000)
df.head(10)

Unnamed: 0,Sentence
0,Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
1,"Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as "" Bush Number One Terrorist "" and "" Stop the Bombings . """
2,They marched from the Houses of Parliament to a rally in Hyde Park .
3,"Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 ."
4,The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton .
5,"The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country ."
6,"The London march came ahead of anti-war protests today in other cities , including Rome , Paris , and Madrid ."
7,The International Atomic Energy Agency is to hold second day of talks in Vienna Wednesday on how to respond to Iran 's resumption of low-level uranium conversion .
8,Iran this week restarted parts of the conversion process at its Isfahan nuclear plant .
9,"Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning ."


In [None]:
# Longest sentence, measured in characters (len of the string, not tokens).
# default=0 mirrors the original starting value when the column is empty.
max_len = max((len(text) for text in df['Sentence']), default=0)
print(max_len)

281


In [None]:
# Question-generation checkpoint used to synthesise an instruction for each
# sentence. It is a T5 (encoder-decoder) model, so it is loaded through the
# seq2seq auto class; AutoModelWithLMHead is deprecated in transformers.
question_model_id = "mrm8488/t5-base-finetuned-question-generation-ap"
tokenizer_ques = AutoTokenizer.from_pretrained(question_model_id)
model_ques = AutoModelForSeq2SeqLM.from_pretrained(question_model_id)

def get_question(context, max_length=100):
  """Generate one question for ``context`` with the question-generation model.

  Args:
    context: Text the question should be about.
    max_length: Forwarded to ``model_ques.generate`` as ``max_length``.

  Returns:
    The decoded first generated sequence. Special tokens are NOT stripped
    (``decode`` is called without ``skip_special_tokens``), so callers see
    markers such as ``<pad>`` and ``</s>`` in the string.
  """
  prompt_text = "context: %s </s>" % context
  encoded = tokenizer_ques([prompt_text], return_tensors='pt')

  generated = model_ques.generate(
      input_ids=encoded['input_ids'],
      attention_mask=encoded['attention_mask'],
      max_length=max_length,
  )

  # A single prompt was submitted, so only the first sequence is relevant.
  first_sequence = generated[0]
  return tokenizer_ques.decode(first_sequence)

In [None]:
# One raw generated question per sentence (default max_length); preview five.
questions = list(map(get_question, df['Sentence']))
questions[:5]

['<pad> question: What is the name of the protests in London?</s>',
 '<pad> question: What was the name of the protesters?</s>',
 '<pad> question: What was the name of the rally?</s>',
 '<pad> question: How many marchers were there?</s>',
 '<pad> question: What is the name of the protest in Brighton?</s>']

In [None]:
def clean_question(question):
    """Strip generation artefacts from a raw generated question.

    Removes every occurrence of the ``'<pad> question:'`` prefix marker and of
    the ``'</s>'`` end marker, then trims surrounding whitespace.
    """
    for marker in ('<pad> question:', '</s>'):
        question = question.replace(marker, '')
    return question.strip()

# `questions` (built in the previous cell) already holds the raw generation for
# every sentence in df['Sentence'], in order. Clean those strings instead of
# running the question-generation model over the whole dataset a second time.
questions_cleaned = [clean_question(raw_question) for raw_question in questions]
df_questions = pd.DataFrame({'Sentence': df['Sentence'], 'Question': questions_cleaned})

def format_dataset(dataframe):
    """Return a copy of ``dataframe`` with a ``'Formatted'`` training-text column.

    Each row becomes ``"### Instruction: <Question> ### Assistant: <Sentence>"``.

    Args:
        dataframe: Frame with ``'Question'`` and ``'Sentence'`` columns.

    Returns:
        A new DataFrame with the original columns plus ``'Formatted'``. The
        input frame is left unmodified, so re-running the calling cell is
        idempotent. An empty input yields an empty ``'Formatted'`` column.
    """
    formatted = [
        f"### Instruction: {question} ### Assistant: {sentence}"
        for question, sentence in zip(dataframe['Question'], dataframe['Sentence'])
    ]
    # assign() builds a new frame instead of adding a column to the caller's.
    return dataframe.assign(Formatted=formatted)

formatted_df = df_questions.pipe(format_dataset)

# Keep only the training-text column, write it to disk, and preview it.
new_df = formatted_df.loc[:, ['Formatted']]
new_df.to_csv('formatted_df.csv', index=False)
new_df.head()

Unnamed: 0,Formatted
0,### Instruction: What is the name of the protests in London? ### Assistant: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
1,"### Instruction: What was the name of the protesters? ### Assistant: Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as "" Bush Number One Terrorist "" and "" Stop the Bombings . """
2,### Instruction: What was the name of the rally? ### Assistant: They marched from the Houses of Parliament to a rally in Hyde Park .
3,"### Instruction: How many marchers were there? ### Assistant: Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 ."
4,### Instruction: What is the name of the protest in Brighton? ### Assistant: The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton .


## Base model

In [None]:
base_model_id = "microsoft/phi-2"

# Tokenizer for the unquantized baseline.
tokenizer_fp16 = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)

# Baseline model in float16, with every module mapped to device 0.
model_fp16 = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map={"": 0},
    trust_remote_code=True,
)

In [None]:
import time

# Running totals over all prompts, used for the average printed at the end.
duration = 0.0
total_length = 0
prompt = [
    "Who is the president of Iran mentioned in the news?",
    "Provide details on the mortar shell attack in Somalia.",
    'Describe the incident involving Germans in Nigeria.',
    "What actions have the militants taken in the Niger Delta?",
]

for text in prompt:
  model_inputs = tokenizer_fp16(text, return_tensors="pt").to("cuda:0")
  input_length = model_inputs.input_ids.size(1)
  start_time = time.time()
  output = model_fp16.generate(**model_inputs,
                               max_length=150,
                               pad_token_id=tokenizer_fp16.eos_token_id)[0]
  # Measure once so the per-prompt rate and the running total agree.
  elapsed = time.time() - start_time
  # `output` contains the prompt followed by the continuation; only the
  # continuation counts as generated tokens for throughput.
  new_tokens = len(output) - input_length
  duration += elapsed
  total_length += new_tokens
  tok_sec_prompt = round(new_tokens / elapsed, 3)
  print("Prompt --- %s tokens/seconds ---" % (tok_sec_prompt))
  print(tokenizer_fp16.decode(output, skip_special_tokens=True))

tok_sec = round(total_length / duration, 3)
print("Average --- %s tokens/seconds ---" % (tok_sec))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt --- 26.837 tokens/seconds ---
Who is the president of Iran mentioned in the news?
Answer: Hassan Rouhani.

Exercise 2:
What is the name of the new law that was passed in Iran?
Answer: The new law is called the "Law on the Protection of the Rights of the Child."

Exercise 3:
How does the new law protect children in Iran?
Answer: The new law protects children by making sure they have access to education, healthcare, and a safe environment.

Exercise 4:
Why is it important for children to have access to education?
Answer: It is important for children to have access to education because it helps them learn and grow, and prepares them for their future.

Ex


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt --- 24.474 tokens/seconds ---
Provide details on the mortar shell attack in Somalia.
Answer: The mortar shell attack in Somalia was a targeted attack on a convoy of United Nations peacekeepers, resulting in the deaths of two peacekeepers and injuries to several others. The attack was carried out by a group of armed men who used a mortar to fire at the convoy. The attack was condemned by the United Nations and the Somali government, and the perpetrators were later identified and arrested.

Exercise: What was the purpose of the United Nations Security Council Resolution 1373?
Answer: The purpose of the United Nations Security Council Resolution 1373 was to address the threat of terrorism and to take action against those who support or carry out terrorist acts. It also aimed to promote international


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt --- 27.03 tokens/seconds ---
Describe the incident involving Germans in Nigeria.
Answer: In the late 1960s, a group of Germans were involved in a violent incident in Nigeria, resulting in the death of a Nigerian man. This incident caused tension between the two countries and led to the expulsion of the German ambassador.

Exercise: What was the purpose of the German embassy in Abuja?
Answer: The German embassy in Abuja serves as the diplomatic mission of Germany in Nigeria, representing the interests of the German government and promoting cultural and economic ties between the two countries.

Exercise: How did the incident involving Germans in Nigeria impact the relationship between the two countries?
Answer: The incident caused tension and strained the relationship between Germany and Nigeria, leading
Prompt --- 24.965 tokens/seconds ---
What actions have the militants taken in the Niger Delta?
Answer: The militants have attacked oil installations, kidnapped oil workers, and ca

## Quantized model

In [None]:
base_model_id = "microsoft/phi-2"

tokenizer_np4 = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)

# 4-bit NF4 quantization with double quantization; compute runs in float16.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)
model_np4 = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map={"": 0},
    trust_remote_code=True,
)

In [None]:
import time

# Running totals over all prompts, used for the average printed at the end.
duration = 0.0
total_length = 0
prompt = [
    "Who is the president of Iran mentioned in the news?",
    "Provide details on the mortar shell attack in Somalia.",
    'Describe the incident involving Germans in Nigeria.',
    "What actions have the militants taken in the Niger Delta?",
]

for text in prompt:
  model_inputs = tokenizer_np4(text, return_tensors="pt").to("cuda:0")
  input_length = model_inputs.input_ids.size(1)
  start_time = time.time()
  output = model_np4.generate(**model_inputs,
                              max_length=150,
                              pad_token_id=tokenizer_np4.eos_token_id)[0]
  # Measure once so the per-prompt rate and the running total agree.
  elapsed = time.time() - start_time
  # `output` contains the prompt followed by the continuation; only the
  # continuation counts as generated tokens for throughput.
  new_tokens = len(output) - input_length
  duration += elapsed
  total_length += new_tokens
  tok_sec_prompt = round(new_tokens / elapsed, 3)
  print("Prompt --- %s tokens/seconds ---" % (tok_sec_prompt))
  print(tokenizer_np4.decode(output, skip_special_tokens=True))

tok_sec = round(total_length / duration, 3)
print("Average --- %s tokens/seconds ---" % (tok_sec))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt --- 13.536 tokens/seconds ---
Who is the president of Iran mentioned in the news?
Answer: The president of Iran mentioned in the news is Hassan Rouhani.

Exercise 2:
What is the name of the new law that was passed in Iran?
Answer: The new law that was passed in Iran is called the "Law on the Protection of the Rights of the Child."

Exercise 3:
How does the new law in Iran protect children?
Answer: The new law in Iran protects children by giving them the right to education, healthcare, and protection from abuse and exploitation.

Exercise 4:
Why is it important for children to have the right to education?
Answer: It is important for children to have the right to


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt --- 13.796 tokens/seconds ---
Provide details on the mortar shell attack in Somalia.
Answer: The mortar shell attack in Somalia was carried out by the Islamic Courts Union (ICU) on the town of Baidoa. The attack was carried out by a mortar unit of the ICU, which targeted the town's main market. The attack resulted in the death of at least 20 people and injured over 100 others. The ICU claimed responsibility for the attack, stating that it was in retaliation for the ongoing conflict in the region.

Exercise: What was the purpose of the attack on the town of Baidoa?
Answer: The purpose of the attack on the town of Baidoa was to retaliate against the ongoing conflict in the region and to weaken


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt --- 13.971 tokens/seconds ---
Describe the incident involving Germans in Nigeria.
Answer: In the late 1960s, a group of Germans were involved in a violent incident in Nigeria. They were accused of killing a Nigerian man and were later found guilty and sentenced to death. This incident caused tension between the two countries and led to the expulsion of the German ambassador.

Exercise: What was the purpose of the German embassy in Nigeria?
Answer: The German embassy in Nigeria served as a diplomatic mission to represent the interests of Germany in Nigeria.

Exercise: How did the incident involving Germans in Nigeria impact the relationship between the two countries?
Answer: The incident caused tension and strained the relationship between Germany and Nigeria. It also led to the expulsion of the
Prompt --- 14.524 tokens/seconds ---
What actions have the militants taken in the Niger Delta?
Answer: The militants have attacked oil installations, causing disruptions in oil production

# Fine-tuning

In [None]:
base_model_id = "microsoft/phi-2"

# Training tokenizer: appends EOS to each example, pads on the right, and
# reuses the EOS token as the padding token.
tokenizer_fine = AutoTokenizer.from_pretrained(base_model_id, add_eos_token=True, use_fast=True)
tokenizer_fine.pad_token = tokenizer_fine.eos_token
tokenizer_fine.padding_side = 'right'

# 4-bit NF4 quantization with double quantization; compute runs in float16.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)
model_fine = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map={"": 0},
    trust_remote_code=True,
)

In [None]:
# Wrap the single-column frame of formatted prompts as a datasets.Dataset.
dataset = Dataset.from_pandas(new_df)

# Prepare the quantized model for k-bit training, then print its module tree.
model_fine = prepare_model_for_kbit_training(model_fine)
print(model_fine)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layern

In [None]:
# Modules that receive LoRA adapters: three attention projections and both
# MLP layers.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "fc2", "fc1"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

In [None]:
training_arguments = TrainingArguments(
    # Checkpoint location and cadence.
    output_dir="./phi2-results2",
    save_strategy='epoch',
    # Batch sizing: 4 samples per device x 6 accumulation steps per update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=6,
    per_device_eval_batch_size=1,
    # Optimizer, schedule and precision.
    optim='paged_adamw_8bit',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    num_train_epochs=10,
    fp16=True,
    # Evaluate every 25 steps, log every 10.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=25,
    logging_steps=10,
    log_level="debug",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from datasets import Dataset, DatasetDict

# Hold out 20% of the examples for evaluation. The seed is fixed so the same
# rows land in train/test on every run, which keeps the reported losses
# comparable across re-runs.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print(dataset_dict['train'].column_names)

['Formatted']


In [None]:
# Supervised fine-tuning of the quantized model with LoRA adapters. Training
# text comes from the 'Formatted' column; packing is enabled with a sequence
# length of 256.
trainer = SFTTrainer(
    model=model_fine,
    tokenizer=tokenizer_fine,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    dataset_text_field="Formatted",
    max_seq_length=256,
    packing=True,
)

trainer.train()

PyTorch: setting up devices


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Using auto half precision backend
Currently training with a batch size of: 4
***** Running training *****
  Num examples = 137
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 6
  Total optimization steps = 50
  Number of trainable parameters = 20,971,520


Step,Training Loss,Validation Loss
25,2.4737,2.162195
50,1.9598,1.979158


Saving model checkpoint to ./phi2-results2/checkpoint-5
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/b10c3eba545ad279e7208ee3a5d644566f001670/config.json
Model config PhiConfig {
  "_name_or_path": "microsoft/phi-2",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-2--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "microsoft/phi-2--modeling_phi.PhiForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "hidden_act": "gelu_new",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.4,
  "qk_layernorm": false,
  "resid_pdrop": 0.1,
  "rope_scaling": null,
  "rope_theta": 1000

TrainOutput(global_step=50, training_loss=2.2933678436279297, metrics={'train_runtime': 982.7392, 'train_samples_per_second': 1.394, 'train_steps_per_second': 0.051, 'total_flos': 4822164528168960.0, 'train_loss': 2.2933678436279297, 'epoch': 8.57})

## Fine-tuned adapter

In [None]:
base_model_id = "microsoft/phi-2"

tokenizer_final = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)

# Reload the base model in 4-bit NF4 (double quantization, float16 compute).
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)
model_final = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map={"": 0},
    trust_remote_code=True,
)

# Attach the LoRA adapter weights saved at the step-50 checkpoint.
adapter = "/content/phi2-results2/checkpoint-50"
model_final = PeftModel.from_pretrained(model_final, adapter)

In [None]:
import time

# Running totals over all prompts, used for the average printed at the end.
duration = 0.0
total_length = 0

# The adapter was trained on text shaped like
# "### Instruction: <question> ### Assistant: <answer>", so inference prompts
# use the same markers and spacing.
prompt = [
    "### Instruction: Write the recipe for a chicken curry with coconut milk. ### Assistant:",
    "### Instruction: Who is the president of Iran mentioned in the news? ### Assistant:",
    "### Instruction: Provide details on the mortar shell attack in Somalia. ### Assistant:",
    "### Instruction: Describe the incident involving Germans in Nigeria. ### Assistant:",
    "### Instruction: What actions have the militants taken in the Niger Delta? ### Assistant:",
]

for text in prompt:
  model_inputs = tokenizer_final(text, return_tensors="pt").to("cuda:0")
  input_length = model_inputs.input_ids.size(1)
  start_time = time.time()
  # max_new_tokens=50 is the same budget as max_length=input_length + 50.
  # early_stopping is omitted: it only applies to beam search, and no beams
  # are requested here.
  output = model_final.generate(**model_inputs,
                                max_new_tokens=50,
                                no_repeat_ngram_size=10,
                                pad_token_id=tokenizer_final.eos_token_id,
                                eos_token_id=tokenizer_final.eos_token_id)[0]
  # Measure once so the per-prompt rate and the running total agree.
  elapsed = time.time() - start_time
  # `output` contains the prompt followed by the continuation; only the
  # continuation counts as generated tokens for throughput.
  new_tokens = len(output) - input_length
  duration += elapsed
  total_length += new_tokens
  tok_sec_prompt = round(new_tokens / elapsed, 3)
  print("Prompt --- %s tokens/seconds ---" % (tok_sec_prompt))
  print()
  print(tokenizer_final.decode(output, skip_special_tokens=True))
  print()

tok_sec = round(total_length / duration, 3)
print("Average --- %s tokens/seconds ---" % (tok_sec))

Prompt --- 6.882 tokens/seconds ---

### Human: Write the recipe for a chicken curry with coconut milk.### Assistant: To make chicken curry with coconut milk, you will need chicken, onions, garlic, ginger, tomatoes, and spices. ### Assistant: First, sauté the onions, garlic, and ginger in a large pot. ### Assistant: Then, add

Prompt --- 10.485 tokens/seconds ---

### Human: Who is the president of Iran mentioned in the news?### Assistant: The news says that the president of Iran is Ali Khamenei. ### Assistant: The news says that the president of Iraq is Jalal Talabani. ### Assistant: The news says that the prime minister of Israel is Ehud Olmert.

Prompt --- 10.248 tokens/seconds ---

### Human: Provide details on the mortar shell attack in Somalia.### Assistant: A mortar shell attack in Somalia killed at least two people and wounded three others on Friday. ### Human: What is the name of the U.N. peacekeeping force in Somalia? ### Assistant: The U.N. peacekeeping force in Somalia

Pro