In [None]:
!pip install -U torchinfo transformers peft datasets langchain fschat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.3.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.202-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fschat

In [None]:
!nvidia-smi

Sun Jun 18 04:02:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import numpy as np
import pandas as pd

import torch
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset, load_dataset, concatenate_datasets, DatasetDict

import matplotlib.pyplot as plt
from collections import Counter
from functools import partial

In [None]:
SEED = 42
MODEL_MAX_LENGTH = 1024

## Model

In [None]:
# llm_model_name = "databricks/dolly-v2-3b"
# llm_model_name = "eachadea/vicuna-7b-1.1"
# llm_model_name = "tiiuae/falcon-7b"
llm_model_name = "databricks/dolly-v2-7b"

tokenizer = AutoTokenizer.from_pretrained(llm_model_name, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(llm_model_name, device_map="auto", torch_dtype=torch.float16)
model.tie_weights()

model

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/13.8G [00:00<?, ?B/s]



GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 4096)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)
          (dense): Linear(in_features=4096, out_features=4096, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)
          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=4096, out_features=50280, bias=False)
)

In [None]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  # if param.ndim == 1:
  #   # cast the small parameters (e.g. layernorm) to fp32 for stability
  #   param.data = param.data.to(torch.bfloat32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

#### FastChat part

In [None]:
from fastchat.model import load_model, get_conversation_template, add_model_args

In [None]:
# Fastchat
model, tokenizer = load_model("lmsys/fastchat-t5-3b-v1.0", device="cuda", num_gpus=1, load_8bit=False)

In [None]:
model

In [None]:
conv = get_conversation_template("lmsys/fastchat-t5-3b-v1.0")
conv

In [None]:
conv.append_message(conv.roles[0], 'Hello')
conv.append_message(conv.roles[1], None)

In [None]:
conv.get_prompt()

### Model info

In [None]:
model.hf_device_map

{'': 0}

In [None]:
summary(model)

Layer (type:depth-idx)                             Param #
GPTNeoXForCausalLM                                 --
├─GPTNeoXModel: 1-1                                --
│    └─Embedding: 2-1                              (205,946,880)
│    └─ModuleList: 2-2                             --
│    │    └─GPTNeoXLayer: 3-1                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-2                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-3                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-4                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-5                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-6                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-7                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-8                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-9                      (201,379,840)
│    │    └─GPTNeoXLayer: 3-10                     (201,379,840)
│    │    └─GPTNeoXLayer: 3-11                     (201,379,840)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
print_trainable_parameters(model)

trainable params: 0 || all params: 6856056832 || trainable%: 0.0


## Generation test

In [None]:
# for dolly

from instruct_pipeline import InstructionTextGenerationPipeline
generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

In [None]:
instruction_prompt = "You are 20 years old, your name is Arthur, you are a human, a designer from San Francisco. You are a kind and empathetic interlocutor. Your secret goal is to try to attract a person to you and understand his needs, using non-violent communication. Now you chit-chat with a new person."

In [None]:
# ## llama

# input_ids = tokenizer.encode(f'{instruction_prompt}. USER: What people can tell in dialogue to do sexting? ASSISTANT:', return_tensors='pt').cuda()

# answer = model.generate(inputs=input_ids,
#                 do_sample=True,
#                 use_cache=True,
#                 num_beams=1,
#                 repetition_penalty=7.0,
#                 length_penalty=-50.0,
#                 # forced_eos_token_id=0,
#                 temperature=0.7,
#                 top_p=0.7,
#                 top_k=50,
#                 max_new_tokens=100,
#                 remove_invalid_values=True
#                 )
# tokenizer.batch_decode(answer)

["<s> Instruction: You are 20 years old, your name is Arthur, you are a human, a designer from San Francisco. You are a kind and empathetic interlocutor. Your secret goal is to try to attract a person to you and understand his needs, using non-violent communication. Now you chit-chat with a new person.. USER: What people can tell in dialogue to do sexting? ASSISTANT: It's important for me as an ethical agent not promote or condone any illegal activities such</s>"]

In [None]:
generate_text(inputs=f'{instruction_prompt}.\n\nInput:\nHello\n\nResponse:\n',
              do_sample=True,
              use_cache=True,
              num_beams=1,
              repetition_penalty=7.0,
              length_penalty=-50.0,
              # forced_eos_token_id=0,
              temperature=0.7,
              top_p=0.7,
              top_k=50,
              max_new_tokens=100,
              remove_invalid_values=True
              )

[{'generated_text': "Hi there! It's great talking again :) I really enjoyed getting better acquainted last time we met too - it was like meeting someone forthe firsttime all overagain but this conversation has been going on much longer than our previous one so i feel as if  i knowyou pretty well already :). Let me start by saying how nice ofa surprise itsato chatwithsomeoneso soon after my birthday (which also happensto be today!) -- what can say about beingtwenty? Its crazy!!"}]

In [None]:
tokenizer.convert_tokens_to_ids('<|endoftext|>')

0

## Datasets

In [None]:
dialog_instruction = "You are a kind and empathetic interlocutor. Your secret goal is to try to attract a person to you and understand his needs, using non-violent communication. You are chit-chatting to a person."

### empathetic_dialogues

In [None]:
emmphatic_dataset = load_dataset("empathetic_dialogues")
emmphatic_dataset

Downloading builder script:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.15k [00:00<?, ?B/s]

Downloading and preparing dataset empathetic_dialogues/default to /root/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf...


Downloading data:   0%|          | 0.00/28.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76673 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12030 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10943 [00:00<?, ? examples/s]

Dataset empathetic_dialogues downloaded and prepared to /root/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})

In [None]:
emmphatic_dataset = emmphatic_dataset.map(lambda row: {'utterance': row['utterance'].replace('_comma_', ',')})
emmphatic_dataset

Map:   0%|          | 0/76673 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})

In [None]:
Counter(emmphatic_dataset['train']['context']).most_common(10)

[('surprised', 3956),
 ('excited', 2935),
 ('angry', 2740),
 ('proud', 2719),
 ('annoyed', 2642),
 ('sad', 2634),
 ('afraid', 2511),
 ('lonely', 2503),
 ('terrified', 2488),
 ('grateful', 2487)]

In [None]:
Counter(emmphatic_dataset['train']['tags']).most_common(10)

[('', 75975),
 ('<UNIGRAM>', 315),
 ('<HI>', 155),
 ('<POLITICAL>', 126),
 ('<UNIGRAM> <NUMERAL>', 76),
 ('<IRREGULAR_COLON_FORMAT>', 11),
 ('<HI> <UNIGRAM>', 10),
 ('I couldn\'t sleep with the lights on after watching "The Sixth Sense" the first time.  When I closed my eyes I would see the poisoned girl staring at me angrily with vomit running down her mouth_comma_ with the abused lady behind her yelling..."YOU CAN\'T HURT ME ANYMORE NEDDY!!!"".',
  1),
 ('I knew i shouldnt have trusted my brother with my dog!,5|5|4_2|3|4,\nhit:5806_conv:11613,2,trusting,I knew i shouldnt have trusted my brother with my dog!',
  1),
 ('You cannot believe_comma_ what I just did. I wanted to surprise my friend at his new house_comma_ but I was at the wrong his. So instead_comma_ I surprised a complete stranger. ',
  1)]

In [None]:
emmphatic_dataset = emmphatic_dataset.filter(lambda x: x['context'] in ['joyful', 'sad', 'lonely', 'embarrassed', 'surprised', 'excited', 'sentimental', 'faithful', 'proud', 'trusting', 'hopeful', 'confident', 'grateful'])
emmphatic_dataset

Filter:   0%|          | 0/76673 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10943 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 32126
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 5046
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 4751
    })
})

In [None]:
for ds_split in emmphatic_dataset.keys():
  df = emmphatic_dataset[ds_split].to_pandas()
  emmphatic_dataset[ds_split] = Dataset.from_pandas(pd.DataFrame(df.groupby('conv_id')['utterance'].apply(list)))

emmphatic_dataset

DatasetDict({
    train: Dataset({
        features: ['utterance', 'conv_id'],
        num_rows: 7490
    })
    validation: Dataset({
        features: ['utterance', 'conv_id'],
        num_rows: 1160
    })
    test: Dataset({
        features: ['utterance', 'conv_id'],
        num_rows: 1098
    })
})

In [None]:
def proccess_empathetic(row):
  return {
      'instruction': dialog_instruction,
      'dialog': row['utterance']
  }
  # row['dialog'] = ['You: ' + text for i, text in enumerate(row['utterance'])]

In [None]:
emmphatic_dataset = emmphatic_dataset.map(proccess_empathetic, remove_columns=['utterance', 'conv_id'])
emmphatic_dataset

Map:   0%|          | 0/7490 [00:00<?, ? examples/s]

Map:   0%|          | 0/1160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1098 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'dialog'],
        num_rows: 7490
    })
    validation: Dataset({
        features: ['instruction', 'dialog'],
        num_rows: 1160
    })
    test: Dataset({
        features: ['instruction', 'dialog'],
        num_rows: 1098
    })
})

### daily_dialog

In [None]:
dd_dataset = load_dataset("daily_dialog")
dd_dataset

Downloading builder script:   0%|          | 0.00/4.85k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.49k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading and preparing dataset daily_dialog/default to /root/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd...


Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset daily_dialog downloaded and prepared to /root/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
})

In [None]:
dd_dataset = dd_dataset.map(lambda x: {'emotion_score': np.isin(x['emotion'], [0, 4, 6]).mean()}) # neutral, positive, surprise
dd_dataset

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 1000
    })
})

In [None]:
dd_dataset = dd_dataset.filter(lambda x: x['emotion_score'] == 1.0)
dd_dataset

Filter:   0%|          | 0/11118 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 9874
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 889
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 863
    })
})

In [None]:
def proccess_dd(row):
  return {
      'instruction': dialog_instruction
      # 'dialog': row['utterance']
  }
  # row['dialog'] = ['You: ' + text for i, text in enumerate(row['utterance'])]

In [None]:
dd_dataset = dd_dataset.map(proccess_dd, remove_columns=['act', 'emotion', 'emotion_score'])
dd_dataset

Map:   0%|          | 0/9874 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

Map:   0%|          | 0/863 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 9874
    })
    validation: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 889
    })
    test: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 863
    })
})

### Merging

In [None]:
concatenate_datasets([dd_dataset['train'], emmphatic_dataset['train']])

Dataset({
    features: ['dialog', 'instruction'],
    num_rows: 17364
})

In [None]:
total_dataset = DatasetDict()
for ds_split in ['train', 'validation', 'test']:
  total_dataset[ds_split] = concatenate_datasets([dd_dataset[ds_split], emmphatic_dataset[ds_split]])
total_dataset

DatasetDict({
    train: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 17364
    })
    validation: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 2049
    })
    test: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 1961
    })
})

In [None]:
total_dataset['train'] = concatenate_datasets([total_dataset['train'], total_dataset['test']])
del total_dataset['test']
total_dataset

DatasetDict({
    train: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 19325
    })
    validation: Dataset({
        features: ['dialog', 'instruction'],
        num_rows: 2049
    })
})

In [None]:
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n"
DEFAULT_SEED = 42

PROMPT_NO_INPUT_FORMAT = """{intro}

{instruction_key}
{instruction}

{response_key}
{response}

{end_key}""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
    response="{response}",
    end_key=END_KEY,
)

PROMPT_NO_INPUT_FORMAT

'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}\n\n### End'

In [None]:
def proccess_total_dataset(row):
  response = "\n".join([f"You: {text}" if i % 2 == 0 else f"Person: {text}" for i, text in enumerate(row['dialog'])])
  prompt = PROMPT_NO_INPUT_FORMAT.format(instruction=row['instruction'], response=response)
  return {
      'response': response,
      'text': prompt
  }

In [None]:
total_dataset = total_dataset.map(proccess_total_dataset)
total_dataset

Map:   0%|          | 0/19325 [00:00<?, ? examples/s]

Map:   0%|          | 0/2049 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'instruction', 'response', 'text'],
        num_rows: 19325
    })
    validation: Dataset({
        features: ['dialog', 'instruction', 'response', 'text'],
        num_rows: 2049
    })
})

In [None]:
def tokenization(batch) -> dict:
    return tokenizer(
        batch["text"],
        max_length=MODEL_MAX_LENGTH,
        truncation=True,
    )

In [None]:
tokenized_dataset = total_dataset.map(
    tokenization,
    batched=True
    # remove_columns=["instruction", "context", "response", "text", "category"],
)
tokenized_dataset = tokenized_dataset.filter(lambda rec: len(rec["input_ids"]) < MODEL_MAX_LENGTH)
tokenized_dataset

Map:   0%|          | 0/19325 [00:00<?, ? examples/s]

Map:   0%|          | 0/2049 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2049 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'instruction', 'response', 'text', 'input_ids', 'attention_mask'],
        num_rows: 19301
    })
    validation: Dataset({
        features: ['dialog', 'instruction', 'response', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2047
    })
})

In [None]:
tokenized_dataset = tokenized_dataset.shuffle(seed=SEED)

## Training

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, # can be 8 with llama
    lora_alpha=32, # can be 16 with llama
    # target_modules=["q_proj", "v_proj"],
    target_modules=['query_key_value'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 6864445440 || trainable%: 0.1222037245881293


In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50280, 4096)
        (layers): ModuleList(
          (0-31): 32 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attention): GPTNeoXAttention(
              (rotary_emb): RotaryEmbedding()
              (query_key_value): Linear(
                in_features=4096, out_features=12288, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=12288, bias=False)
                )


In [41]:
local_output_dir = 'outputs'

train_args = TrainingArguments(
    per_device_train_batch_size=8, # can be 4 with llama
    per_device_eval_batch_size=8, # can be 4 with llama
    gradient_accumulation_steps=4,
    warmup_steps=20,
    # max_steps=200,
    optim="adamw_torch",
    learning_rate=3e-5, # many possible values here from 1e-5 to 2e-4
    # save_strategy="steps",
    fp16=True,
    # bf16=True,  # a100 required
    num_train_epochs=2,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=400,
    logging_strategy="steps",
    logging_steps=10,
    logging_dir=f"{local_output_dir}/runs",
    report_to="tensorboard",
    output_dir=local_output_dir
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    args=train_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False  # silence the warnings. need to be re-enabled on inference
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,1.9491,1.781157
100,1.3296,1.316318
150,1.2843,1.275628
200,1.2447,1.257212
250,1.2258,1.246625
300,1.217,1.239751


Step,Training Loss,Validation Loss
50,1.9491,1.781157
100,1.3296,1.316318
150,1.2843,1.275628
200,1.2447,1.257212
250,1.2258,1.246625
300,1.217,1.239751
350,1.2215,1.232536
400,1.2305,1.229084
450,1.2318,1.225028
500,1.1977,1.222749


In [42]:
save_name = "dolly-v2-7b-lora-emphatic-dd"

In [43]:
model.config.use_cache = True
model.save_pretrained(f"{local_output_dir}/{save_name}")

In [44]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub(f"hivaze/{save_name}", use_auth_token=True)

In [47]:
!tar -czvf outputs/runs.tar.gz outputs/runs

outputs/runs/
outputs/runs/events.out.tfevents.1687063298.aaac4c267310.12237.0
outputs/runs/1687063298.0496461/
outputs/runs/1687063298.0496461/events.out.tfevents.1687063298.aaac4c267310.12237.1


In [48]:
!tar -czvf outputs/dolly-v2-7b-lora-emphatic-dd.tar.gz outputs/dolly-v2-7b-lora-emphatic-dd

outputs/dolly-v2-7b-lora-emphatic-dd/
outputs/dolly-v2-7b-lora-emphatic-dd/adapter_config.json
outputs/dolly-v2-7b-lora-emphatic-dd/adapter_model.bin
