In [1]:
from datasets import load_dataset

dataset = load_dataset("stacked-summaries/stacked-samsum-1024")

In [2]:
make_chatml = lambda name, role, content: dict(
    name=name, role=role, content=content,
)

system = lambda name, content: make_chatml(
    role="system",
    name=name,
    content=content,
)

situation = lambda content: system(name="situation", content=content)
thought = lambda content: system(name="thought", content=content)
information = lambda content: system(name="information", content=content)
me = lambda content, name=None: make_chatml(
    role="assistant",
    content=content,
    name=name,
)

person = lambda content, name=None: make_chatml(
    role="user",
    content=content,
    name=name,
)

In [3]:
dataset = dataset["train"]
filtered_dataset = dataset.filter(lambda row: row["is_stacked"] and "<" not in row["dialogue"])
len(filtered_dataset), len(dataset)

(6652, 58884)

In [4]:
from cleantext import clean

clean_text = lambda text: clean(text,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=False,                     # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    no_emails=False,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    lang="en"                       # set to 'de' for German special handling
)


In [5]:
import re

def to_chatml(row):
    dialogue = clean_text(row["dialogue"])
    summary = row["summary"]

    concepts = [
        concept.strip()
        for concept
        in summary.split("[NEXT_CONCEPT]")
    ]

    lines = re.split(r"[\r\n]{1,2}", dialogue)

    try:
        # Turn chat into chatml
        chat = [
            re.match(r"([a-zA-Z \.\-']+):\s*(.*)$", line.strip()).groups()
            for line in lines
        ]
    
    except:
        import traceback; print(traceback.format_exc())
        return dict(chatml=[])

    chat = [
        person(utterance, name=name)
        for name, utterance in chat
        if name and utterance
    ]
    
    persons = list(set([g["name"] for g in chat]))
    delim = "\n\n- "

    chatml = [
        situation(
            f"The following people are talking: {', '.join(persons[:-1])} and {persons[-1]}. An AI is analyzing the conversation."
            " At the end of the conversation, the AI will note down the main takeaways from the conversation."
        ),
        *chat,
        thought(f"Key takeaways:{delim}{delim.join(concepts)}"),
    ]

    return dict(chatml=chatml)

filtered_dataset = filtered_dataset.map(to_chatml).filter(lambda row: len(row["chatml"]) > 0).remove_columns(list(set(filtered_dataset.column_names) - {"chatml"}))

In [6]:
len(filtered_dataset), filtered_dataset[0]["chatml"]

(6622,
 [{'content': 'The following people are talking: Annie, David, Bryan, Jordyn, Adrianne, Peter, Lucas, Alex, Stuart, Frank, Mollie, Kev, Kate, James, Mike, Shirley, Mohamed, Kaz and Mary. An AI is analyzing the conversation. At the end of the conversation, the AI will note down the main takeaways from the conversation.',
   'name': 'situation',
   'role': 'system'},
  {'content': 'Are you there yet?', 'name': 'Bryan', 'role': 'user'},
  {'content': '5 more mins!', 'name': 'Kev', 'role': 'user'},
  {'content': 'Same!', 'name': 'Stuart', 'role': 'user'},
  {'content': 'Im on my way.', 'name': 'Bryan', 'role': 'user'},
  {'content': 'see you soon!', 'name': 'Bryan', 'role': 'user'},
  {'content': 'Miiikeeeee', 'name': 'Annie', 'role': 'user'},
  {'content': "let's meet next week", 'name': 'Annie', 'role': 'user'},
  {'content': "let's play some games", 'name': 'Annie', 'role': 'user'},
  {'content': 'wooohoooo', 'name': 'Annie', 'role': 'user'},
  {'content': 'haha hey Annie', 'name

In [8]:
filtered_dataset.push_to_hub("diwank/stacked-samsum-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/578 [00:00<?, ?B/s]