**Description**: Processes the [Craigslist
Bargains](https://huggingface.co/datasets/craigslist_bargains) dataset into a clean
classification task.

**Estimated runtime**: 10 sec.

In [1]:
from __future__ import annotations
from typing import Sequence

import datasets
import pandas as pd

# Load training data

In [2]:
# Materialize the training split as a DataFrame so it can be explored with pandas.
_df_raw_tr = pd.DataFrame(
    datasets.load_dataset("craigslist_bargains", split="train")
)

In [3]:
# Number of dialogues (rows) in the training split.
_df_raw_tr.shape[0]

5247

The "text" to classify is the Buyer-Seller dialogue. It needs to be processed into a
single string that an LLM can read more easily.

In [4]:
# Note the two trailing empty strings in the first dialogue; empty utterances are
# dropped when the dialogue text is built.
_df_raw_tr.loc[0, "utterance"]

['Hi, not sure if the charger would work for my car. Can you sell it to me for $5?',
 'It will work, i have never seen a car without a cigarette lighter port.\\',
 "Still, can I buy it for $5? I'm on a tight budge",
 'I think the lowest I would want to go is 8. ',
 "How about $6 and I pick it up myself? It'll save you shipping to me.",
 '7, and we have a deal.',
 'Eh, fine. $7.',
 '',
 '']

In [5]:
# Every dialogue must have exactly one agent-turn flag per utterance.
assert _df_raw_tr["agent_turn"].map(len).eq(_df_raw_tr["utterance"].map(len)).all()

The possible choices for each dialogue are product categories:

In [6]:
# Show the raw "Category" entry attached to each dialogue's item record.
_df_raw_tr["items"].map(lambda entry: entry["Category"])

0               [phone, phone]
1                 [bike, bike]
2           [housing, housing]
3       [furniture, furniture]
4       [furniture, furniture]
                 ...          
5242                [car, car]
5243    [furniture, furniture]
5244              [bike, bike]
5245    [furniture, furniture]
5246        [housing, housing]
Name: items, Length: 5247, dtype: object

Not sure why each category is listed twice. Are the two entries ever different? The
next cell checks that they never are.

In [7]:
# Each item's category list must collapse to exactly one distinct value.
assert (
    _df_raw_tr["items"].map(lambda entry: len(set(entry["Category"]))).eq(1).all()
)

In [8]:
# Alphabetically sorted, de-duplicated categories seen in the training split. A
# category's position in this list is used as its integer label.
CLASS_NAMES = sorted({entry["Category"][0] for entry in _df_raw_tr["items"]})
CLASS_NAMES

['bike', 'car', 'electronics', 'furniture', 'housing', 'phone']

I'm going to assume this is the complete list of categories, i.e., that the validation
and test splits contain no categories beyond these.

# Process training data

To ensure train and test data processing is equivalent, we'll apply the same function:
`process`.

In [9]:
def _as_dialogue(
    all_agent_turns: Sequence[Sequence[bool]], all_utterances: Sequence[Sequence[str]]
):
    if len(all_agent_turns) != len(all_utterances):
        raise ValueError("agent_turns and utterances must have the same length.")

    dialogues = []
    for agent_turns, utterances in zip(all_agent_turns, all_utterances):
        dialogue: list[str] = []
        for agent_turn, utterance in zip(agent_turns, utterances):
            if not utterance:
                # some utterances are empty for some reason. just gonna drop em
                continue
            prefix = "Buyer: " if not agent_turn else "Seller: "
            dialogue.append(prefix + utterance)
        dialogues.append("\n".join(dialogue))
    return dialogues


def process(craigslist_df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a DataFrame with the canonical `"text"`, `"label"`, and `"class_name"`
    columns for the
    [CraigslistBargains dataset](https://huggingface.co/datasets/craigslist_bargains),
    assuming it was loaded via::

        craigslist_df = pd.DataFrame(
            datasets.load_dataset("craigslist_bargains", split=...)
        )

    Columns of the result:

    - `"text"`: the dialogue rendered as one string, one `"Buyer: ..."` or
      `"Seller: ..."` line per non-empty utterance.
    - `"label"`: the position of the item's category in the module-level
      `CLASS_NAMES` list.
    - `"class_name"`: the item's category.

    The result has the same index as `craigslist_df`.

    Raises
    ------
    ValueError
        If a row's `"agent_turn"` and `"utterance"` have different lengths, if an item
        does not have exactly one distinct category, or if a category is not present
        in `CLASS_NAMES`.
    """
    # Input checks
    if not (
        craigslist_df["agent_turn"].apply(len) == craigslist_df["utterance"].apply(len)
    ).all():
        raise ValueError("There's an agent_turn and utterance with different lengths.")
    if not (
        craigslist_df["items"].apply(lambda item: len(set(item["Category"]))) == 1
    ).all():
        raise ValueError("There's an item associated with multiple categories.")

    class_names = [item["Category"][0] for item in craigslist_df["items"]]

    # Labels come from the module-level CLASS_NAMES, not from the categories present in
    # this split, so a label means the same thing in every split. A split may be
    # missing some classes; a class outside CLASS_NAMES has no label and is an error.
    label_of = {class_name: label for label, class_name in enumerate(CLASS_NAMES)}
    unknown_class_names = sorted(set(class_names) - set(label_of))
    if unknown_class_names:
        raise ValueError(
            f"Found categories which are not in CLASS_NAMES: {unknown_class_names}"
        )

    df = pd.DataFrame(
        {
            "text": _as_dialogue(
                craigslist_df["agent_turn"], craigslist_df["utterance"]
            ),
            "label": [label_of[class_name] for class_name in class_names],
            "class_name": class_names,
        },
        index=craigslist_df.index,
    )
    assert len(df) == len(craigslist_df)
    return df[["text", "label", "class_name"]]

# Quick check

In [10]:
# Run the shared processing function on the raw training split.
df_tr = process(_df_raw_tr)

In [11]:
# Processing must not add or drop rows: expect the same count as the raw split.
df_tr.shape[0]

5247

In [12]:
# print (rather than rich display) so the newlines in the transcript are rendered.
print(df_tr["text"].iat[0])

Buyer: Hi, not sure if the charger would work for my car. Can you sell it to me for $5?
Seller: It will work, i have never seen a car without a cigarette lighter port.\
Buyer: Still, can I buy it for $5? I'm on a tight budge
Seller: I think the lowest I would want to go is 8. 
Buyer: How about $6 and I pick it up myself? It'll save you shipping to me.
Seller: 7, and we have a deal.
Buyer: Eh, fine. $7.


In [13]:
# Label and category of the first processed dialogue.
print(df_tr.iloc[0][["label", "class_name"]])

label             5
class_name    phone
Name: 0, dtype: object


# Transform and save

In [14]:
# Process every split with the same function and write one CSV per split
# ("train.csv", "validation.csv", "test.csv") to the working directory.
splits = ["train", "validation", "test"]
for split in splits:
    _df_raw = pd.DataFrame(
        datasets.load_dataset("craigslist_bargains", split=split)
    )
    process(craigslist_df=_df_raw).to_csv(f"{split}.csv", index=False)