In [11]:
# Download step for the Story Commonsense archive. It is currently disabled: the
# `#` right after `!` makes the shell treat the whole command as a comment.
# Delete that `#` to fetch the zip into cache_data/.
!#wget https://uwnlp.github.io/storycommonsense/data/storycommonsense_data.zip -O cache_data/storycommonsense_data.zip

In [12]:
# Extraction step for the downloaded archive. Disabled the same way as the
# download: the `#` right after `!` turns the shell command into a comment.
# Delete that `#` to unpack into cache_data/storycommonsense_data.
!#unzip cache_data/storycommonsense_data.zip -d cache_data/storycommonsense_data

In [13]:
import json
from textwrap import dedent

from datasets import Dataset
from huggingface_hub import notebook_login
from tqdm.auto import tqdm
import yaml

In [14]:
# Load the raw annotations into `data`, a dict keyed by story id.
# The encoding is pinned to UTF-8 (the JSON standard) so the read does not
# depend on the platform's default text encoding.
with open('cache_data/storycommonsense_data/json_version/annotations.json', encoding='utf-8') as f:
    data = json.load(f)

In [15]:
# Peek at one story's raw annotation record (looked up by its id) to see the
# nesting: lines -> characters -> emotion / motiv annotations.
data["0008c800-82b6-43b3-8b53-5475ed1dac9b"]

{'lines': {'1': {'characters': {'My daughter': {'app': False,
     'emotion': {},
     'motiv': {}},
    'Nana': {'app': True,
     'emotion': {'ann0': {'plutchik': ['sadness:2', 'disgust:2', 'anger:2'],
       'text': ['confused']},
      'ann1': {'plutchik': ['surprise:2'], 'text': ['confused']}},
     'motiv': {'ann0': {'maslow': ['stability'],
       'reiss': ['order'],
       'text': ['to understand']},
      'ann1': {'maslow': ['none'],
       'reiss': [],
       'text': ['to check something']},
      'ann2': {'maslow': ['esteem'],
       'reiss': ['status'],
       'text': ['to be noticed']}}}},
   'text': 'Nana came into the room with a puzzled look on her face.'},
  '2': {'characters': {'My daughter': {'app': False,
     'emotion': {},
     'motiv': {}},
    'Nana': {'app': True,
     'emotion': {'ann0': {'plutchik': ['surprise:2'], 'text': ['confused']},
      'ann1': {'plutchik': ['surprise:2', 'anger:2'],
       'text': ['confused', 'frustrated']},
      'ann2': {'plutchik'

In [16]:
def high_intensity_annotations(annotations):
    """Return the emotion annotations that contain at least one intensity-3 rating.

    Args:
        annotations: mapping of annotator id -> annotation dict. An annotation
            may carry a "plutchik" list of "name:intensity" strings.

    Returns:
        List of the annotation dicts (unchanged, in mapping order) whose joined
        "plutchik" entries contain the character "3".
    """
    return [
        annotation
        for annotation in annotations.values()
        # Annotations without a "plutchik" key are treated as having no ratings.
        if '3' in ''.join(annotation.get("plutchik", []))
    ]


def process_record(record):
    """Flatten one raw story record into title, story text, characters and emotions.

    Args:
        record: one value of `data`; must provide "title" and "lines", where
            each line has "text" and a "characters" mapping whose values hold
            an "emotion" mapping.

    Returns:
        dict with keys:
            title: the story title.
            story: all line texts joined with newlines.
            characters: set of every character name mentioned on any line.
            emotions: character name -> list of {"emotion": [...], "line_no": ...}
                entries, one per line where that character has any emotion
                annotation. "emotion" keeps only the high-intensity annotations
                and may therefore be an empty list.
    """
    lines = record["lines"]

    # De-duplicate names while keeping first-appearance order. Each character's
    # entry list is then built exactly once, instead of once per
    # (line, character) pair as the single nested comprehension did.
    character_names = list(dict.fromkeys(
        name
        for line in lines.values()
        for name in line["characters"]
    ))

    emotions = {
        name: [
            {
                "emotion": high_intensity_annotations(
                    line["characters"][name]["emotion"]
                ),
                "line_no": line_no,
            }
            for line_no, line in lines.items()
            # Skip lines where the character is absent or has no annotations.
            if name in line["characters"] and line["characters"][name]["emotion"]
        ]
        for name in character_names
    }

    return dict(
        title=record["title"],
        story="\n".join(line["text"] for line in lines.values()),
        characters=set(character_names),
        emotions=emotions,
    )


processed_data = [process_record(record) for record in tqdm(data.values())]

  0%|          | 0/14738 [00:00<?, ?it/s]

In [17]:
# Authenticate with the Hugging Face Hub via the interactive login widget,
# ahead of the push_to_hub call later in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
def make_chatml(name, role, content):
    """Build one ChatML-style message dict with keys "name", "role", "content"."""
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a "system"-role message tagged with the given name."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build the system message named "situation"."""
    return system(name="situation", content=content)


def thought(content):
    """Build the system message named "thought"."""
    return system(name="thought", content=content)


def information(content):
    """Build the system message named "information"."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build an "assistant"-role message; `name` is optional."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a "user"-role message; `name` is optional."""
    return make_chatml(role="user", content=content, name=name)


In [44]:

def max_emotion(emotion_list):
    """Return the annotation whose Plutchik intensities sum to the largest total.

    Each annotation is a dict with a "plutchik" list of "name:intensity"
    strings; the number after the last ":" is the intensity. On a tie the
    earliest annotation wins. An empty `emotion_list` raises ValueError.
    """
    def total_intensity(annotation):
        total = 0
        for rating in annotation["plutchik"]:
            total += int(rating.split(":")[-1])
        return total

    return max(emotion_list, key=total_intensity)

def parse_emotions(emotion_list):
    """Turn "name:intensity" strings into a {name: strength label} dict.

    Intensities 1, 2 and 3 map to "weak", "moderate" and "strong". If a name
    appears more than once, the last entry wins.
    """
    labels = ("weak", "moderate", "strong")

    parsed = {}
    for entry in emotion_list:
        emotion, strength_index = entry.split(":")
        parsed[emotion] = labels[int(strength_index) - 1]

    return parsed

def pick_strongest(emotion_data):
    """Reduce each character's per-line annotations to the single strongest one.

    Args:
        emotion_data: character name -> list of {"emotion": [...], "line_no": ...}
            entries, as stored under "emotions" in the processed records.

    Returns:
        character name -> list of dicts with keys "emotion" (parsed strengths of
        the top annotation), "text" (that annotation's text) and "line_no".
        Entries with an empty "emotion" list are skipped, and characters left
        with no entries are omitted.
    """
    strongest_emotions = {}

    for char_name, entries in emotion_data.items():
        picks = []
        for entry in entries:
            if len(entry["emotion"]) == 0:
                continue
            top = max_emotion(entry["emotion"])
            picks.append(
                dict(
                    emotion=parse_emotions(top["plutchik"]),
                    text=top["text"],
                    line_no=entry["line_no"],
                )
            )

        if len(picks) > 0:
            strongest_emotions[char_name] = picks

    return strongest_emotions

# Spot-check on the first story: strongest annotation per character and line.
pick_strongest(processed_data[0]["emotions"])
# Alternative view, the unfiltered input to the call above:
# processed_data[0]["emotions"]

{'My daughter': [{'emotion': {'joy': 'strong'},
   'text': ['happy'],
   'line_no': '4'},
  {'emotion': {'joy': 'strong',
    'trust': 'strong',
    'surprise': 'strong',
    'anticipation': 'strong'},
   'text': ['fun'],
   'line_no': '5'}],
 'Nana': [{'emotion': {'joy': 'strong', 'surprise': 'moderate'},
   'text': ['amused'],
   'line_no': '3'},
  {'emotion': {'joy': 'strong', 'trust': 'moderate', 'surprise': 'strong'},
   'text': ['amused'],
   'line_no': '4'},
  {'emotion': {'joy': 'strong', 'trust': 'moderate', 'surprise': 'strong'},
   'text': ['playful'],
   'line_no': '5'}]}

In [50]:
def merge(dicts):
    """Combine a sequence of dicts into one new dict.

    A key keeps the position of its first appearance and takes the value from
    the last dict that contains it. The input dicts are not modified.
    """
    return {
        key: value
        for mapping in dicts
        for key, value in mapping.items()
    }

def _third_person(name):
    """Replace a stand-alone word "My" with "The" ("My daughter" -> "The daughter").

    Only whole space-separated words are replaced, so names that merely contain
    the letters "My" (e.g. "Myra") are left untouched.
    """
    return " ".join("The" if word == "My" else word for word in name.split(" "))


def to_chatml(row):
    """Convert one processed story into a ChatML conversation about its emotions.

    Args:
        row: processed record with keys "title", "story", "characters" and
            "emotions".

    Returns:
        dict(chatml=[...]) holding five messages in order: situation (system),
        the user's instructions, the story as information (system), a
        step-by-step thought (system), and the assistant's YAML answer.
        Returns dict(chatml=[]) when no character has a strong emotion.
    """
    title = row["title"]
    story = row["story"]
    # Sorted so the prompt text is reproducible: iterating a set of strings
    # gives an order that can change between interpreter sessions.
    characters = sorted(row["characters"])
    emotions = row["emotions"]

    strongest_emotions = pick_strongest(emotions)

    if not strongest_emotions:
        return dict(chatml=[])

    # One entry per character; when the same emotion shows up on several lines,
    # merge() keeps the strength from the last of those lines.
    overall_emotions = [
        dict(
            character=char_name,
            emotions=[
                dict(emotion=emotion, strength=strength)
                for emotion, strength in merge([line["emotion"] for line in lines]).items()
            ]
        )
        for char_name, lines in strongest_emotions.items()
    ]

    situation_content = dedent(f"""\
    User and a helpful AI assistant are talking about the short story "{title}".
    The AI assistant tries to understand the story and answer questions about it.
    """).strip()

    # "A, B and C" for several characters, the bare name for a single one.
    if len(characters) > 1:
        characters_formatted = ", ".join(characters[:-1]) + f" and {characters[-1]}"
    else:
        characters_formatted = characters[0]

    user_content = f"""\
Read the following story and then analyze the emotions of the characters. The story is about {characters_formatted}.

Instructions:
- Think step by step and write down the emotions of the characters in the story as a valid yaml list for every character.
- Each item in the list should specify the "character" and their "emotions" as a list of all the emotions of that character.
- "emotions" is a list of "emotion" name from the Plutchik Wheel of Emotions and its "strength" as one of "weak", "moderate" or "strong".
    """.strip()

    info_content = f"""\
# {title}

{story}
    """.strip()

    # One sentence per (character, line) pair, grouped by character.
    steps = []
    for char_name, emotion_by_line in strongest_emotions.items():
        display_name = _third_person(char_name)
        for info in emotion_by_line:
            feelings = ", ".join(
                f"{emotion.upper()} ({strength}ly)"
                for emotion, strength in info["emotion"].items()
            )
            steps.append(
                f"In line {info['line_no']}, {display_name} is feeling {feelings}."
            )

    bullet_separator = "\n- "

    thought_content = dedent(f"""\
Thinking step by step:

- {bullet_separator.join(steps)}
    """).strip()

    chatml = [
        situation(content=situation_content),
        person(content=user_content),
        information(content=info_content),
        thought(content=thought_content),
        me(content=yaml.dump(overall_emotions))
    ]

    return dict(chatml=chatml)

# Spot-check the conversion on the first processed story.
to_chatml(processed_data[0])

{'chatml': [{'name': 'situation',
   'role': 'system',
   'content': 'User and a helpful AI assistant are talking about the short story "Sock Chase".\nThe AI assistant tries to understand the story and answer questions about it.'},
  {'name': None,
   'role': 'user',
   'content': 'Read the following story and then analyze the emotions of the characters. The story is about My daughter and Nana.\n\nInstructions:\n- Think step by step and write down the emotions of the characters in the story as a valid yaml list for every character.\n- Each item in the list should specify the "character" and their "emotions" as a list of all the emotions of that character.\n- "emotions" is a list of "emotion" name from the Plutchik Wheel of Emotions and its "strength" as one of "weak", "moderate" or "strong".'},
  {'name': 'information',
   'role': 'system',
   'content': '# Sock Chase\n\nNana came into the room with a puzzled look on her face.\nShe held up an orange sock and a blue one.\nMy daughter ju

In [51]:
# Convert every processed story and keep only the non-empty conversations;
# to_chatml returns an empty "chatml" list for stories it cannot use.
chatml_data = [
    converted
    for converted in (to_chatml(story_row) for story_row in tqdm(processed_data))
    if converted["chatml"]
]

  0%|          | 0/14738 [00:00<?, ?it/s]

In [52]:
# Number of stories that produced a non-empty conversation.
len(chatml_data)

4827

In [53]:
# Wrap the list of {"chatml": [...]} rows in a Hugging Face Dataset.
dataset = Dataset.from_list(chatml_data)

In [54]:
# Upload the dataset to the Hub repository "diwank/storycommonsense-chatml".
dataset.push_to_hub("diwank/storycommonsense-chatml")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/562 [00:00<?, ?B/s]

In [55]:
import random
from pprint import pprint

# Pretty-print one randomly chosen conversation as a sanity check.
# randrange excludes its upper bound; random.randint(0, len(dataset)) includes
# it and could therefore pick len(dataset), which is out of range.
pprint(dataset[random.randrange(len(dataset))]["chatml"])

[{'content': 'User and a helpful AI assistant are talking about the short '
             'story "Haircut at home".\n'
             'The AI assistant tries to understand the story and answer '
             'questions about it.',
  'name': 'situation',
  'role': 'system'},
 {'content': 'Read the following story and then analyze the emotions of the '
             'characters. The story is about Cayla.\n'
             '\n'
             'Instructions:\n'
             '- Think step by step and write down the emotions of the '
             'characters in the story as a valid yaml list for every '
             'character.\n'
             '- Each item in the list should specify the "character" and their '
             '"emotions" as a list of all the emotions of that character.\n'
             '- "emotions" is a list of "emotion" name from the Plutchik Wheel '
             'of Emotions and its "strength" as one of "weak", "moderate" or '
             '"strong".',
  'name': None,
  'role': 'user