In [1]:
import csv
import random

from datasets import Dataset, DatasetDict

In [2]:
# Directory holding the cached CSV splits, relative to the notebook's
# working directory.
data_dir = "../cache_data/IBMDebaterEvidenceSentences"


def load_data(split):
    """Read one CSV split into a list of row dictionaries.

    Args:
        split: Base name of the CSV file inside ``data_dir``; the file read
            is ``{data_dir}/{split}.csv``.

    Returns:
        A list with one ``dict`` per non-blank data row, mapping each header
        name from the file's first line to that row's string value.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file has no header line at all.
    """
    file_name = f"{data_dir}/{split}.csv"

    # newline="" hands raw line endings to the csv module, as its docs
    # require; otherwise "\r\n" inside quoted fields is rewritten to "\n".
    # The explicit encoding keeps decoding independent of the locale default
    # (assumes the cached CSVs are UTF-8 -- TODO confirm).
    with open(file_name, "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        column_names = next(reader, None)
        if column_names is None:
            raise ValueError(f"{file_name} is empty; expected a CSV header row")

        # csv.reader yields [] for blank lines; skip them so they do not
        # turn into empty {} records.
        data = [dict(zip(column_names, row)) for row in reader if row]

    return data

In [3]:
# Read every CSV split into plain Python rows, keyed by split name.
data = {split_name: load_data(split_name) for split_name in ("train", "test")}

# Wrap each split's rows in a `Dataset` and bundle them into one `DatasetDict`.
dataset = DatasetDict(
    {split_name: Dataset.from_list(records) for split_name, records in data.items()}
)

In [4]:
# Upload the unmodified train/test splits to a private Hub repository.
dataset.push_to_hub(
    "diwank/IBMDebaterEvidenceSentences",
    private=True,
)

Pushing split train to the Hub.
Resuming upload of the dataset shards.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.
Resuming upload of the dataset shards.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/693 [00:00<?, ?B/s]

In [5]:
def make_chatml(name, role, content):
    """Build one ChatML message as a dict with keys name, role, content."""
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message with role ``"system"`` and the given ``name``."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named ``"situation"``."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named ``"thought"``."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named ``"information"``."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role ``"assistant"``; ``name`` is optional."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role ``"user"``; ``name`` is optional."""
    return make_chatml(role="user", content=content, name=name)

def to_chatml(row):
    """Turn one evidence-sentence row into a three-message ChatML exchange.

    The exchange is: a ``situation`` message introducing the debate, a
    ``person`` message asking about the candidate statement, and a
    ``thought`` message that lists two lettered choices followed by the
    correct answer key and its text.

    Args:
        row: Mapping with the keys ``"topic"``, ``"the concept of the topic"``,
            ``"candidate"`` and ``"label"``. ``label`` must parse as the
            integer 0 or 1. It is read as 1 = the candidate is evidence for
            the topic, 0 = it is not.
            NOTE(review): this follows the IBM Debater Evidence Sentences
            README; confirm against the copy shipped with the cached CSVs.
            The previous code indexed the choices directly with the label,
            which made label 1 answer "not relevant".

    Returns:
        ``{"chatml": [situation_msg, person_msg, thought_msg]}``.

    Raises:
        KeyError: If a required key is missing from ``row``.
        ValueError: If ``label`` is not an integer, or is not 0 or 1.
    """
    topic = row["topic"]
    concept = row["the concept of the topic"]
    candidate = row["candidate"]

    label = int(row["label"])
    if label not in (0, 1):
        raise ValueError(f"label must be 0 or 1, got {row['label']!r}")

    evidence_choice = "This is a valid evidence for this debate, either for or against the topic"
    irrelevant_choice = "This is not relevant for this debate topic"
    correct_choice = evidence_choice if label == 1 else irrelevant_choice

    choice_keys = ["A", "B"]
    choices = [evidence_choice, irrelevant_choice]

    # Flip randomly so the training is not skewed to position
    if random.randint(0, 1):
        choices.reverse()

    # Look the answer up by text so it stays correct whichever order was drawn.
    answer = choices.index(correct_choice)

    chatml = [
        situation(
            f"Person is debating a concept in {concept} with AI Assistant."
            f' They are discussing whether "{topic}" or not.'
        ),
        person(
            f"In light of the topic '{topic}', what do you think of the following statement?"
            f'\nStatement: "{candidate}"'
        ),
        thought(
            f'Is the statement "{candidate}" relevant to this topic?'
            + "\nChoices:\n"
            + '\n'.join(f"{key}) {choice}" for key, choice in zip(choice_keys, choices))
            + '\n'
            + f"\nAnswer: {choice_keys[answer]}"
            + f"\n{choices[answer]}"
        ),
    ]

    return dict(chatml=chatml)

In [6]:
# Seed the module-level RNG that `to_chatml` draws from, so the random A/B
# ordering (and therefore the dataset pushed below) is the same on every run.
random.seed(42)

# Convert every row to ChatML and drop all source columns in the same pass,
# leaving `chatml` as the only column. NOTE: this rebinds `dataset`, so the
# cell that builds it from the CSVs must be re-run before this cell can be
# executed a second time.
dataset = dataset.map(to_chatml, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

In [7]:
# Upload the ChatML-formatted train/test splits to a private Hub repository.
dataset.push_to_hub(
    "diwank/samantha-debate-evidence",
    private=True,
)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/535 [00:00<?, ?B/s]