In [None]:
!pip install -U transformers peft datasets

In [3]:
import numpy as np
import pandas as pd

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset, load_dataset, concatenate_datasets, DatasetDict

import matplotlib.pyplot as plt
from collections import Counter
from functools import partial

In [4]:
# Authenticate against the Hugging Face Hub from inside the notebook.
# Presumably required for the push_to_hub call further down — confirm.
from huggingface_hub import notebook_login

# Renders an interactive login widget as the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Notebook-wide configuration constants.
# NOTE(review): neither value is referenced in the dataset-preparation cells
# visible here; presumably they are consumed by later training cells — confirm.
SEED = 42
MODEL_MAX_LENGTH = 1024

## Datasets

In [6]:
# Intro text placed at the very top of every formatted training prompt
# (it is substituted into PROMPT_NO_INPUT_FORMAT as the `intro` field).
system_prompt = "You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request."
system_prompt

'You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request.'

In [7]:
# Fixed instruction text inserted under the "### Instruction:" header of every prompt
# by proccess_total_dataset.
instruction_prompt = "You try to chit-chat. Complete a phrase, acting like an interlocutor."
instruction_prompt

'You try to chit-chat. Complete a phrase, acting like an interlocutor.'

### empathetic_dialogues

In [8]:
# Fetch the empathetic_dialogues corpus (train / validation / test splits).
# Each row is a single utterance; rows sharing a conv_id belong to one conversation.
empathetic_dataset = load_dataset("empathetic_dialogues")
empathetic_dataset

Downloading builder script:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.15k [00:00<?, ?B/s]

Downloading and preparing dataset empathetic_dialogues/default to /root/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf...


Downloading data:   0%|          | 0.00/28.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76673 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12030 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10943 [00:00<?, ? examples/s]

Dataset empathetic_dialogues downloaded and prepared to /root/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})

In [9]:
# The raw corpus encodes commas as the literal token "_comma_"; restore real
# commas in the utterance text (the other columns are left untouched).
empathetic_dataset = empathetic_dataset.map(
  lambda example: {
    'utterance': example['utterance'].replace('_comma_', ',')
  }
)
empathetic_dataset

Map:   0%|          | 0/76673 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})

In [10]:
# Frequency of each emotional context label in the training split (top 10).
context_counts = Counter(empathetic_dataset['train']['context'])
context_counts.most_common(10)

[('surprised', 3956),
 ('excited', 2935),
 ('angry', 2740),
 ('proud', 2719),
 ('annoyed', 2642),
 ('sad', 2634),
 ('afraid', 2511),
 ('lonely', 2503),
 ('terrified', 2488),
 ('grateful', 2487)]

In [11]:
# Frequency of each value in the `tags` column of the training split (top 10).
# NOTE(review): some values are whole sentences rather than tags, which suggests
# a few mis-parsed rows upstream — confirm whether they should be dropped.
tag_counts = Counter(empathetic_dataset['train']['tags'])
tag_counts.most_common(10)

[('', 75975),
 ('<UNIGRAM>', 315),
 ('<HI>', 155),
 ('<POLITICAL>', 126),
 ('<UNIGRAM> <NUMERAL>', 76),
 ('<IRREGULAR_COLON_FORMAT>', 11),
 ('<HI> <UNIGRAM>', 10),
 ('I couldn\'t sleep with the lights on after watching "The Sixth Sense" the first time.  When I closed my eyes I would see the poisoned girl staring at me angrily with vomit running down her mouth_comma_ with the abused lady behind her yelling..."YOU CAN\'T HURT ME ANYMORE NEDDY!!!"".',
  1),
 ('I knew i shouldnt have trusted my brother with my dog!,5|5|4_2|3|4,\nhit:5806_conv:11613,2,trusting,I knew i shouldnt have trusted my brother with my dog!',
  1),
 ('You cannot believe_comma_ what I just did. I wanted to surprise my friend at his new house_comma_ but I was at the wrong his. So instead_comma_ I surprised a complete stranger. ',
  1)]

In [12]:
# Keep only utterances whose conversation context is one of the selected emotions.
empathetic_dataset = empathetic_dataset.filter(
  lambda example: example['context'] in [
    'joyful', 'sad', 'lonely', 'embarrassed', 'surprised',
    'excited', 'sentimental', 'faithful', 'proud', 'trusting',
    'hopeful', 'confident', 'grateful',
  ]
)
empathetic_dataset

Filter:   0%|          | 0/76673 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10943 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 32126
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 5046
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 4751
    })
})

In [13]:
# Collapse the per-utterance rows into one row per conversation, where the
# `utterance` column becomes the ordered list of turns of that conversation.
# Iterate over a snapshot of the keys because the splits are reassigned in the loop.
for ds_split in list(empathetic_dataset.keys()):
  df = empathetic_dataset[ds_split].to_pandas()
  # Make the turn order explicit instead of relying on the incoming row order.
  # The sort is stable, so rows with equal utterance_idx keep their original
  # relative order, and groupby preserves row order within each group.
  df = df.sort_values('utterance_idx', kind='stable')
  dialogs = df.groupby('conv_id')['utterance'].apply(list)
  # conv_id stays as the frame's index, so from_pandas emits it as a regular column.
  empathetic_dataset[ds_split] = Dataset.from_pandas(dialogs.to_frame())

empathetic_dataset

DatasetDict({
    train: Dataset({
        features: ['utterance', 'conv_id'],
        num_rows: 7490
    })
    validation: Dataset({
        features: ['utterance', 'conv_id'],
        num_rows: 1160
    })
    test: Dataset({
        features: ['utterance', 'conv_id'],
        num_rows: 1098
    })
})

In [14]:
# Align the schema with daily_dialog: a single `dialog` column holding the turn list.
empathetic_dataset = (
  empathetic_dataset
  .rename_column('utterance', 'dialog')
  .remove_columns(['conv_id'])
)
empathetic_dataset

DatasetDict({
    train: Dataset({
        features: ['dialog'],
        num_rows: 7490
    })
    validation: Dataset({
        features: ['dialog'],
        num_rows: 1160
    })
    test: Dataset({
        features: ['dialog'],
        num_rows: 1098
    })
})

### daily_dialog

In [15]:
# Fetch the daily_dialog corpus (train / validation / test splits).
# Each row is already a whole conversation: `dialog` is a list of turns, with
# per-turn `act` and `emotion` label lists alongside.
dd_dataset = load_dataset("daily_dialog")
dd_dataset

Downloading builder script:   0%|          | 0.00/4.85k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.49k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading and preparing dataset daily_dialog/default to /root/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd...


Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset daily_dialog downloaded and prepared to /root/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
})

In [16]:
# Score each dialog by the fraction of its turns whose emotion id is one of
# 0, 4, 6 (neutral, positive, surprise).
dd_dataset = dd_dataset.map(
  lambda example: {
    'emotion_score': np.isin(example['emotion'], [0, 4, 6]).mean()
  }
)
dd_dataset

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 1000
    })
})

In [17]:
# Keep only dialogs whose emotion_score is exactly 1.0, i.e. every turn carries
# one of the accepted emotion ids.
dd_dataset = dd_dataset.filter(
  lambda example: example['emotion_score'] == 1.0
)
dd_dataset

Filter:   0%|          | 0/11118 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 9874
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 889
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion', 'emotion_score'],
        num_rows: 863
    })
})

In [18]:
# Only the `dialog` column is needed from here on; drop the label and score columns.
helper_columns = ['act', 'emotion', 'emotion_score']
dd_dataset = dd_dataset.remove_columns(helper_columns)
dd_dataset

DatasetDict({
    train: Dataset({
        features: ['dialog'],
        num_rows: 9874
    })
    validation: Dataset({
        features: ['dialog'],
        num_rows: 889
    })
    test: Dataset({
        features: ['dialog'],
        num_rows: 863
    })
})

### Merging

In [19]:
# Stack the two corpora split by split; within each split the daily_dialog rows
# come first, followed by the empathetic_dialogues rows.
total_dataset = DatasetDict({
  split_name: concatenate_datasets([dd_dataset[split_name], empathetic_dataset[split_name]])
  for split_name in ('train', 'validation', 'test')
})

total_dataset

DatasetDict({
    train: Dataset({
        features: ['dialog'],
        num_rows: 17364
    })
    validation: Dataset({
        features: ['dialog'],
        num_rows: 2049
    })
    test: Dataset({
        features: ['dialog'],
        num_rows: 1961
    })
})

In [20]:
# Fold the test split into train so that only `train` and `validation` remain.
# The guard makes the cell safe to re-run: once `test` has been merged and
# removed, running it again is a no-op instead of raising KeyError.
if 'test' in total_dataset:
  total_dataset['train'] = concatenate_datasets([total_dataset['train'], total_dataset.pop('test')])
total_dataset

DatasetDict({
    train: Dataset({
        features: ['dialog'],
        num_rows: 19325
    })
    validation: Dataset({
        features: ['dialog'],
        num_rows: 2049
    })
})

In [21]:
# Section markers used inside every training prompt.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
# TODO: an explicit end marker (END_KEY = "### End") was considered but is not used.
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"

# Prompt template with the intro and section markers already filled in.
# The `{instruction}` and `{response}` placeholders are left for a later
# str.format call.
PROMPT_NO_INPUT_FORMAT = "\n".join([
    system_prompt,
    "",
    INSTRUCTION_KEY,
    "{instruction}",
    "",
    RESPONSE_KEY,
    "{response}",
    "",  # yields the trailing newline of the template
])

PROMPT_NO_INPUT_FORMAT

'You are a kind and empathetic interlocutor. You are talking to a person. Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}\n'

In [22]:
def proccess_total_dataset(row):
  """Render one dialog row as a single instruction-style training prompt.

  Args:
    row: mapping with a 'dialog' entry holding the turns of a conversation as
      strings, in order.

  Returns:
    A dict with one key, 'text', containing PROMPT_NO_INPUT_FORMAT filled with
    the fixed instruction_prompt and the transcript of the dialog. Turns at
    even positions are prefixed with "Person: ", turns at odd positions with
    "You: "; every turn is stripped of surrounding whitespace and the turns
    are joined with newlines.
  """
  speakers = ("Person", "You")
  transcript_lines = []
  for turn_idx, utterance in enumerate(row['dialog']):
    # Speakers strictly alternate, starting with the other person.
    transcript_lines.append(f"{speakers[turn_idx % 2]}: {utterance.strip()}")
  transcript = "\n".join(transcript_lines)
  return {
      'text': PROMPT_NO_INPUT_FORMAT.format(instruction=instruction_prompt, response=transcript)
  }

In [23]:
# Add a `text` column holding the fully formatted prompt for every dialog;
# the original `dialog` column is kept alongside it.
total_dataset = total_dataset.map(proccess_total_dataset)
total_dataset

Map:   0%|          | 0/19325 [00:00<?, ? examples/s]

Map:   0%|          | 0/2049 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'text'],
        num_rows: 19325
    })
    validation: Dataset({
        features: ['dialog', 'text'],
        num_rows: 2049
    })
})

In [24]:
# Upload both splits (train, validation) to the Hugging Face Hub dataset repository.
# NOTE(review): the repo id spells "emphatical"; it is a runtime identifier, so it is
# left as is — confirm it is the intended name.
total_dataset.push_to_hub('hivaze/emphatical_daily_dialogues')



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/472 [00:00<?, ?B/s]