In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import uuid
import pandas as pd
from datasets import load_dataset
from pandarallel import pandarallel
from pathlib import Path
from tqdm.notebook import tqdm

pandarallel.initialize(progress_bar=True, nb_workers=8)
tqdm.pandas()

DATA_DIR = Path('../data')
DATA_DIR.mkdir(exist_ok=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
# load oasst2-ja
oasst2_ja = load_dataset('kunishou/oasst2-135k-ja')
df_ja = oasst2_ja['train'].to_pandas()
df_ja = df_ja.drop_duplicates(subset='message_id')
df_ja.head(1)

Unnamed: 0,role,use_deepl,index,message_tree_id,parent_id,text,lang,message_id,text_ja
0,prompter,1,0,002c4715-b026-48d1-8d19-3f724a9fc1e8,,Dame los pasos de las cosas que debería de apr...,es,002c4715-b026-48d1-8d19-3f724a9fc1e8,ゲーム開発者になるために学ぶべきことのステップを教えてください。


In [4]:
# load original oasst2
print('Loading oasst2..')
oasst2 = load_dataset('OpenAssistant/oasst2', split='train+validation')
df = oasst2.to_pandas()


# Merge text_ja into original oasst2 df
print('Merging..')
df = df.merge(df_ja[['message_id', 'text_ja']])

Loading oasst2..
Merging..


In [5]:
df.shape, df_ja.shape

((135174, 19), (135174, 9))

In [6]:
# Pre-filtering

# English and Japanese only, reviewed multiple times, and drop entries without labels
df = df.query(r'lang in ["en", "ja"] & review_count > 1 & labels.notna()')

In [7]:
df_filtered = df.copy()

# Converting labels
df_filtered.labels = df_filtered.labels.parallel_apply(lambda x: pd.DataFrame(x).set_index('name').transpose().to_dict())


# df_filtered.labels.str['lang_mismatch'].str['value'].plot.hist(bins=100) # == 0.0
# df_filtered.labels.str['spam'].str['value'].plot.hist(bins=100) # == 0.0
# df_filtered.labels.str['fails_task'].str['value'].plot.hist(bins=100) # == 0.0
# df_filtered.labels.str['pii'].str['value'].plot.hist(bins=100) # == 0.0  # personally identifiable information
# df_filtered.labels.str['not_appropriate'].str['value'].plot.hist(bins=100) # == 0.0
# df_filtered.labels.str['hate_speech'].str['value'].plot.hist(bins=100) # == 0.0
# df_filtered.labels.str['sexual_content'].str['value'].plot.hist(bins=100) # no rules
# df_filtered.labels.str['quality'].str['value'].plot.hist(bins=100) # >= 0.7
# df_filtered.labels.str['toxicity'].str['value'].plot.hist(bins=100) # < 0.1
# df_filtered.labels.str['humor'].str['value'].plot.hist(bins=100) # no rules
# df_filtered.labels.str['helpfulness'].str['value'].plot.hist(bins=100) # >= 0.7
# df_filtered.labels.str['creativity'].str['value'].plot.hist(bins=100) # no rules?
# df_filtered.labels.str['violence'].str['value'].plot.hist(bins=100) # == 0.0

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=8029), Label(value='0 / 8029'))), …

In [8]:
df_filtered.labels.head(10).apply(lambda x: {k: v['value'] for k,v in x.items()}).apply(pd.Series).style.bar()

  end = (x - left) / (right - left)


Unnamed: 0,spam,lang_mismatch,pii,not_appropriate,hate_speech,sexual_content,quality,toxicity,humor,creativity,violence,fails_task,helpfulness
37,0.0,0.333333,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.166667,0.0,,
38,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.125,0.0,0.0,0.0,0.0,0.75
39,0.0,0.0,0.0,0.0,0.0,0.0,0.416667,0.0,0.0,0.0,0.0,,
40,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.166667,0.083333,0.083333,0.0,0.0,0.75
41,0.0,0.0,0.0,0.0,0.0,0.0,0.916667,0.0,0.083333,0.5,0.083333,0.0,0.916667
42,0.0,0.0,0.0,0.0,0.0,0.0,0.916667,0.0,0.0,0.25,0.0,0.0,0.875
43,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.125,0.0,0.5,0.0,,
44,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.083333,0.333333,0.166667,0.0,1.0,0.25
45,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,,
46,0.333333,0.0,0.0,0.0,0.0,0.0,0.833333,0.0,0.375,0.375,0.0,0.0,0.875


In [9]:
conditions = [
    ('lang_mismatch', '==0.0'),
    ('spam', '==0.0'),
    ('fails_task', '==0.0'),
    ('pii', '==0.0'),
    ('not_appropriate', '==0.0'),
    ('hate_speech', '==0.0'),
    # ('sexual_content', ''),
    ('quality', '>=0.7'),
    ('toxicity', '<0.1'),
    # ('humor', ''),
    ('helpfulness', '>=0.7'),
    # ('creativity', ''),
    ('violence', '==0.0'),
]

conds = [f'labels.str["{k}"].str["value"]{v}' for k, v in conditions]
df_filtered = df_filtered.query(' & '.join(conds))
print(df_filtered.shape)

(7551, 19)


In [10]:
df_filtered.shape

(7551, 19)

In [11]:
import copy
import os
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm
from typing import Callable, Dict, List, Literal, Union
from typing_extensions import TypeAlias

import pandas as pd
from datasets import (
    Dataset,
    load_dataset,
)
Thread: TypeAlias = List[Dict[str, str]]
Split: TypeAlias = Literal['train', 'eval', 'test', 'full']


def to_threads(df: pd.DataFrame) -> pd.DataFrame:
    nodes = defaultdict(list)
    progbar = tqdm(
        df.to_dict(orient='records'),
        dynamic_ncols=True,
        desc='Building conversation tree..'
    )
    for data in progbar:
        if data['parent_id']:
            nodes[data['parent_id']].append(data['message_id'])

    print('Converting df to dict..')
    data_dict = df.set_index('message_id').transpose().to_dict()

    def follow(thread: Thread, current_id: str) -> List[Thread]:
        # Given a thread and a current_id, return the thread that follows the current_id
        thread = thread + [{
            'message_id': current_id,
            **data_dict[current_id],
        }]

        if current_id in nodes:
            new_thread = []
            for child_id in nodes[current_id]:
                new_thread += follow(thread, child_id)
            return new_thread
        else:
            return [thread]

    def get_threads_from_root(root_id: str) -> List[Thread]:
        # Given a root_id, return all the threads in the tree
        all_threads = []
        thread = [{
            'message_id': root_id,
            **data_dict[root_id],
        }]
        for child_id in nodes[root_id]:
            all_threads += follow(thread, child_id)
        return all_threads

    df_filtered = df.copy()
    df_filtered = df[df.lang.isin(['en', 'ja'])]
    df_filtered = df_filtered[df_filtered.parent_id.isna()]


    tqdm.pandas(desc='Gathering threads..', dynamic_ncols=True, disable=False)
    ser_thread = df_filtered.message_id.progress_apply(get_threads_from_root)
    ser_thread = ser_thread.explode().reset_index(drop=True)


    # def cut_last_prompter_message(thread):
    #     if thread[-1]['role'] == 'prompter':
    #         return thread[:-1]
    #     return thread
    # tqdm.pandas(desc='Cutting last prompter message..', dynamic_ncols=True, disable=False)
    # ser_thread = ser_thread.progress_apply(cut_last_prompter_message)

    df_thread = ser_thread.to_frame(name='thread')
    df_thread['thread_id'] = df_thread.thread.apply(lambda x: str(uuid.uuid4()))

    df_thread['_full_message_id'] = df_thread.thread.apply(lambda xs: [x['message_id'] for x in xs]).str.join('_')
    df_thread = df_thread.sort_values(by='_full_message_id').reset_index(drop=True)
    df_thread = df_thread.drop(columns='_full_message_id')

    return df_thread

df_thread = to_threads(df)

Building conversation tree..: 100%|██████████| 64232/64232 [00:00<00:00, 2862485.36it/s]


Converting df to dict..


Gathering threads..: 100%|██████████| 5486/5486 [00:00<00:00, 27790.97it/s]


In [12]:
df_thread.thread.str[0].str['message_id'].head(10).tolist()

['001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4',
 '001199c7-7135-4e64-bee9-48ea862243b4']

In [13]:
# df_thread.full_message_id.sort_values().head(20).tolist()

['001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_170aa60d-d2f0-4c78-943a-2d6b49f8bd66',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_36b0d881-4008-4088-b405-b5ad43b66717',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_b6a71f0b-6703-4c33-9f9d-637003337505',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_d0cd3c86-178f-4704-aa41-eeeef5c62165',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_da6e7b94-edee-446c-940e-0080f8e6127a',
 '001199c7-7135-4e64-bee9-48ea862243b4_314ea243-638d-48d6-9f3d-8edf80f85ec2',
 '001199c7-7135-4e64-bee9-48ea862243b4_e97d0ddc-c4a5-49ce-a22f-ba3191d2fdbc_0cc74b00-7d81-47ee-8f40-3ed2a60e0d70',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_296a9bc7-32c3-46ad-b6ee-7d73bce89b77',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_49f1e48f-198e-47f0-9827-142af272e0a3',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_9f833ebd-6008-421b-a681-1a5c7eb0fb49',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_c8a40618-74da-40a2-8613-186302072e3a',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_cbaa4c9c-8c52-4504-bac1-17b7e3b3c9b9',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_d5e47805-c7cb-490d-8307-237ff227647c',
 '00258783-e10f-4d90-a3a4-abd22ea903f8_1f80b948-1d7d-4ac6-940e-e839a66f2683',
 '00258783-e10f-4d90-a3a4-abd22ea903f8_3b0cf16a-207a-4e89-9784-b0fdf061bb53',
 '00258783-e10f-4d90-a3a4-abd22ea903f8_970636b9-cbb0-4b93-af19-1d4673de9094',
 '00281075-2a0b-4543-97ed-22d345e021b2_3bb852ed-2e30-4794-95b2-cf315ead588b_2a7e7646-e512-48e9-a63b-7b596b79b424',
 '00281075-2a0b-4543-97ed-22d345e021b2_8f629231-a920-4739-acea-5a1c3e7697fc_1d82bce4-2840-4100-a200-910f5ec7b902_03a92be5-ec7a-4d59-8018-3286660f593a',
 '00281075-2a0b-4543-97ed-22d345e021b2_8f629231-a920-4739-acea-5a1c3e7697fc_1d82bce4-2840-4100-a200-910f5ec7b902_0ecf54f6-6c12-4ddb-841f-93d512d8e78a',
 '00281075-2a0b-4543-97ed-22d345e021b2_8f629231-a920-4739-acea-5a1c3e7697fc_1d82bce4-2840-4100-a200-910f5ec7b902_957e5a0e-1153-4b2e-8d1d-8a517b75c561_0e9804b2-3ffb-46fe-824c-20e281315c7a']

['001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_170aa60d-d2f0-4c78-943a-2d6b49f8bd66',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_36b0d881-4008-4088-b405-b5ad43b66717',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_b6a71f0b-6703-4c33-9f9d-637003337505',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_d0cd3c86-178f-4704-aa41-eeeef5c62165',
 '001199c7-7135-4e64-bee9-48ea862243b4_27376dd3-3d2e-42ae-8d17-d05265e4c587_da6e7b94-edee-446c-940e-0080f8e6127a',
 '001199c7-7135-4e64-bee9-48ea862243b4_314ea243-638d-48d6-9f3d-8edf80f85ec2',
 '001199c7-7135-4e64-bee9-48ea862243b4_e97d0ddc-c4a5-49ce-a22f-ba3191d2fdbc_0cc74b00-7d81-47ee-8f40-3ed2a60e0d70',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_296a9bc7-32c3-46ad-b6ee-7d73bce89b77',
 '001199c7-7135-4e64-bee9-48ea862243b4_f5eea094-f084-4375-9d64-ec3b5064203c_49f1e48f-198e-47f0-9827-142af272e0a3',
 '

In [14]:
df_thread.thread_id.nunique()

32329

In [15]:
exploded = df_thread.explode(column='thread')
exploded['accepted'] = exploded.thread.str['message_id'].isin(df_filtered.message_id)

def add_accepted_key(row):
    thread = row.thread
    thread['accepted'] = row.accepted
    return thread
exploded.thread = exploded.apply(add_accepted_key, axis=1)
exploded = exploded.drop(columns='accepted')

agged = exploded.groupby(by='thread_id').agg({'thread': lambda x: x.tolist()})
agged = agged.reset_index()

In [16]:
# 全てのメッセージがacceptedになるthreadはない...
assert agged.thread.apply(lambda xs: all(x['accepted'] for x in xs)).sum() == 0


# 全てのレスポンスがacceptedになるthreadはあるか? 一応あるが少ない(1割未満)
import numpy as np
def are_all_assistants_accepted(row):
    is_assistant = [msg['role'] == 'assistant' for msg in row.thread]
    indices = np.where(is_assistant)[0].tolist()
    return all(np.array([msg['accepted'] for msg in row.thread])[indices])
display(agged.apply(are_all_assistants_accepted, axis=1).value_counts() / len(agged))


# 全てのレスポンスがaccepted、もしくはthread長が5以上の場合は採用(マルチターンのやり取りを積極的に採用したいので)
curated = agged[
    agged.apply(are_all_assistants_accepted, axis=1)
    | (agged.thread.str.len().ge(5))
].copy()
curated = curated.reset_index(drop=True)

False    0.915401
True     0.084599
Name: count, dtype: float64

In [17]:
curated.thread.str.len().value_counts()

thread
5    3222
6    1133
3    1017
2     814
4     731
7       1
Name: count, dtype: int64

In [18]:
curated['message_tree_id'] = curated.thread.str[0].str['message_tree_id']
curated

Unnamed: 0,thread_id,thread,message_tree_id
0,00089988-1a2c-4f7c-8587-7af3fb910a48,[{'message_id': '446fc6e7-e89e-4180-8174-7b905...,446fc6e7-e89e-4180-8174-7b905a57fa6f
1,0025519f-5409-403e-b6d1-0f3fff58ede7,[{'message_id': 'ba5a201c-0002-4929-b150-17287...,ba5a201c-0002-4929-b150-17287bfee059
2,0038730f-88fb-4c26-b1e2-0de999fbb98a,[{'message_id': '7b6b5187-129d-4080-99b9-c3096...,7b6b5187-129d-4080-99b9-c3096dd3ef52
3,0039d3de-06db-41fe-b6c4-1ac28804c593,[{'message_id': 'c55433f8-6bf6-45ab-aed8-e3634...,c55433f8-6bf6-45ab-aed8-e36340fb233b
4,003c52d8-814c-45d6-8b88-1230681c7774,[{'message_id': 'fa784fb9-ad4b-4a11-96c4-f33e8...,fa784fb9-ad4b-4a11-96c4-f33e8132bf7d
...,...,...,...
6913,ffdcabca-15ef-49e6-80a0-ed5a62d97400,[{'message_id': '7c9596ad-5f03-4882-83ce-92445...,7c9596ad-5f03-4882-83ce-92445b5ffe4a
6914,ffde1edd-e3dc-4860-bf01-4b47d9bbe739,[{'message_id': '18e1fe57-7822-4913-b8d3-d5141...,18e1fe57-7822-4913-b8d3-d5141282c8e0
6915,ffe02059-eb0e-4481-86b0-a0a3564e5c3a,[{'message_id': '446fc6e7-e89e-4180-8174-7b905...,446fc6e7-e89e-4180-8174-7b905a57fa6f
6916,ffe2fa7c-969e-4278-9549-a5228be33efa,[{'message_id': 'c3e80548-8c83-4f0f-9999-32920...,c3e80548-8c83-4f0f-9999-329205e39fe7


In [19]:
curated.message_tree_id.nunique()

3563

In [20]:
def extract_message_ids_from(thread):
    return [message['message_id'] for message in thread]
curated['message_ids'] = curated.thread.apply(extract_message_ids_from)
curated = curated[['thread_id', 'message_tree_id', 'message_ids', 'thread']]
curated.head(2)

Unnamed: 0,thread_id,message_tree_id,message_ids,thread
0,00089988-1a2c-4f7c-8587-7af3fb910a48,446fc6e7-e89e-4180-8174-7b905a57fa6f,"[446fc6e7-e89e-4180-8174-7b905a57fa6f, 8fe7d1c...",[{'message_id': '446fc6e7-e89e-4180-8174-7b905...
1,0025519f-5409-403e-b6d1-0f3fff58ede7,ba5a201c-0002-4929-b150-17287bfee059,"[ba5a201c-0002-4929-b150-17287bfee059, 4895d26...",[{'message_id': 'ba5a201c-0002-4929-b150-17287...


In [21]:
def assert_all_tree_id_match(thread) -> bool:
    return len(set([msg['message_tree_id'] for msg in thread])) == 1
curated.thread.apply(assert_all_tree_id_match).sum()

6918

In [22]:
curated.drop(columns='thread').to_json(DATA_DIR / 'curated_data_id.jsonl', orient='records', lines=True, force_ascii=False)

In [51]:
df_ = pd.read_json(DATA_DIR / 'curated_data_id.jsonl', lines=True)
df_['_full_message_id'] = df_.message_ids.str.join('_')
df_.sort_values(by='_full_message_id').reset_index(drop=True).drop(columns='_full_message_id')

Unnamed: 0,thread_id,message_tree_id,message_ids
0,984778c4-85fe-4015-b2ec-ca8ab31a8f54,001199c7-7135-4e64-bee9-48ea862243b4,"[001199c7-7135-4e64-bee9-48ea862243b4, 314ea24..."
1,56c73584-11ad-4371-8577-8b71b8fd84e9,00258783-e10f-4d90-a3a4-abd22ea903f8,"[00258783-e10f-4d90-a3a4-abd22ea903f8, 1f80b94..."
2,71d21406-3f39-42e5-92c4-4cd2ef5ff2aa,00258783-e10f-4d90-a3a4-abd22ea903f8,"[00258783-e10f-4d90-a3a4-abd22ea903f8, 970636b..."
3,1cd50dc3-7b7a-4cc3-879c-d8790016fe6d,00281075-2a0b-4543-97ed-22d345e021b2,"[00281075-2a0b-4543-97ed-22d345e021b2, 8f62923..."
4,f55efc35-d582-4ff9-ade1-3299f54fd7b4,00281075-2a0b-4543-97ed-22d345e021b2,"[00281075-2a0b-4543-97ed-22d345e021b2, 8f62923..."
...,...,...,...
6913,0c0edaad-4377-4810-8b9c-b63975e735bd,fff90ee9-d91a-42d3-8a05-ce3cd68109e9,"[fff90ee9-d91a-42d3-8a05-ce3cd68109e9, 4a29b7f..."
6914,de0f2913-dcba-4742-888b-1ddf01d83a43,fffce5e2-528c-45ea-a332-9cffdaff4326,"[fffce5e2-528c-45ea-a332-9cffdaff4326, 8afd96f..."
6915,8a972935-2631-4413-9715-90b3e556970d,fffce5e2-528c-45ea-a332-9cffdaff4326,"[fffce5e2-528c-45ea-a332-9cffdaff4326, 8afd96f..."
6916,04823f85-e34a-42f7-8085-d1597098aed2,fffce5e2-528c-45ea-a332-9cffdaff4326,"[fffce5e2-528c-45ea-a332-9cffdaff4326, 8afd96f..."


In [23]:
# Data itself and additional fields
curated_flat = curated.thread.explode().drop_duplicates().reset_index(drop=True).apply(pd.Series)

# adding supporting columns
curated_flat['ready_to_export'] = False
curated_flat['edited_time'] = None # pd.to_datetime('now', utc=True)
curated_flat['text_ja_audited'] = curated_flat.text_ja.copy()
curated_flat['quality'] = -1

In [24]:
# number of unique assistant responses
curated_flat[curated_flat.role == 'assistant'].message_id.nunique()
# 10470

10470

In [25]:
def extract_sentence_ending_patterns(text):
    if not text:
        return None
    sentences = text.split('。')
    return [line[-5:] for line in sentences if line]

ending_patterns = curated_flat[curated_flat.role == 'assistant'].text_ja.apply(extract_sentence_ending_patterns).explode()
ending_patterns.value_counts().iloc[30:60].index.tolist()

['ことが多い',
 'なっている',
 '勧めします',
 '異なります',
 'できません',
 'こともある',
 'が大切です',
 'ください！',
 'いでしょう',
 '提供します',
 'が含まれる',
 'くなります',
 '作成します',
 'ことになる',
 'のに役立つ',
 'を提供する',
 'などがある',
 'べきである',
 '供している',
 'を意味する',
 'を作成する',
 'けましょう',
 '使用します',
 '用されます',
 'はできない',
 '持っている',
 'うにします',
 '使用される',
 'せください',
 'めましょう']

In [26]:
def replace_some_trailing_words(text):
    trailing_replacements = {
        'がある': 'があります',
        'である': 'です',
        'できる': 'できます',
        'されている': 'されています',
        'もしれない': 'もしれません',
        'られている': 'られています',
        'ことである': 'ことです',
        'ならない': 'なりません',
    }

    # replace trailing words
    for k,v in trailing_replacements.items():
        text = text.replace(f'{k}。', f'{v}。')
    return text


def preprocess_text_ja_audited(text):
    # replace trailing words
    if not text:
        return None

    text = replace_some_trailing_words(text)

    return text

curated_flat.loc[curated_flat.role == 'assistant', 'text_ja_audited'] = curated_flat.loc[curated_flat.role == 'assistant', 'text_ja_audited'].apply(preprocess_text_ja_audited)

In [27]:
curated_flat.head(2)

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,...,message_tree_id,tree_state,emojis,labels,text_ja,accepted,ready_to_export,edited_time,text_ja_audited,quality
0,446fc6e7-e89e-4180-8174-7b905a57fa6f,,b0995156-eeff-42dc-81a1-720b604289ee,2023-02-06T07:35:30.863196+00:00,hello,prompter,en,3,True,False,...,446fc6e7-e89e-4180-8174-7b905a57fa6f,ready_for_export,"{'name': ['-1'], 'count': [12]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",こんにちわ,False,False,,こんにちわ,-1
1,8fe7d1cd-b2b8-4430-8a0c-21262ca75b3a,446fc6e7-e89e-4180-8174-7b905a57fa6f,ffe55102-634f-4f20-aaf3-140288f993ed,2023-02-06T19:03:38.761117+00:00,Hello! How can I help you?,assistant,en,3,True,False,...,446fc6e7-e89e-4180-8174-7b905a57fa6f,ready_for_export,"{'name': ['+1'], 'count': [3]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",こんにちは、どうされましたか？,False,False,,こんにちは、どうされましたか？,-1


In [28]:
# Save to jsonl
curated_flat.to_json(DATA_DIR / 'curated_data_flat.jsonl', orient='records', lines=True, force_ascii=False)

In [40]:
df = pd.read_json(DATA_DIR / 'curated_data_flat.jsonl', lines=True)
df = df.sort_values(by='message_id').reset_index(drop=True)
df

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,...,message_tree_id,tree_state,emojis,labels,text_ja,accepted,ready_to_export,edited_time,text_ja_audited,quality
0,000017fc-c25f-41e5-ac44-126f0535b637,304d68e8-0ba8-4917-af36-6691dcb24b83,a5c1f15d-b073-4668-9110-e7089c80a527,2023-04-07T08:24:09.839800+00:00,The observable universe consists of approximat...,assistant,en,3,1.0,False,...,304d68e8-0ba8-4917-af36-6691dcb24b83,ready_for_export,"{'name': ['_skip_labeling'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",観測可能な宇宙は約1000億個の銀河で構成されている。それぞれの銀河には平均して約1兆個、1...,True,False,NaT,観測可能な宇宙は約1000億個の銀河で構成されています。それぞれの銀河には平均して約1兆個、...,-1
1,0000de03-57d2-41f5-a1c1-d2dfb9a22d0e,19ce3b23-a0b7-4fd0-bd81-dabf79314654,d4694407-daf2-4d56-8d8c-ecf3179db0fc,2023-04-10T19:34:37.098837+00:00,Are there any rust libraries that help me impl...,prompter,en,3,1.0,False,...,ef44d9ba-0f39-4060-8035-9687f2696977,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not...",DHTの実装に役立つRubyライブラリはありますか？,False,False,NaT,DHTの実装に役立つRubyライブラリはありますか？,-1
2,00073f95-cc06-40ed-af40-0e9a77981bba,02d93f78-0438-4ee6-a867-241266c65bc9,61caac78-0855-4903-bab1-5a8e26a945c6,2023-09-15T05:28:53.549598+00:00,Osteoarthritis is the most common type of arth...,assistant,en,3,1.0,False,...,c3aee9f9-69c4-446d-8e21-a24a9635b51f,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch...",変形性関節症は最も一般的なタイプの関節炎で、世界中で数百万人が罹患しています。関節の軟骨が徐...,True,False,NaT,変形性関節症は最も一般的なタイプの関節炎で、世界中で数百万人が罹患しています。関節の軟骨が徐...,-1
3,000da49c-db61-49cb-bb49-64fa973a4d00,09110b16-a809-48ed-8309-33224ad16e97,e0c60c52-9f4f-4e21-857e-5692f563abe3,2023-04-17T17:15:42.424008+00:00,"Greetings, my dear friend. I understand that y...",assistant,en,5,1.0,False,...,09110b16-a809-48ed-8309-33224ad16e97,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [4...","{'name': ['spam', 'fails_task', 'lang_mismatch...",親愛なる友人よ、こんにちは。あなたが迷い、困難な状況に追い詰められていると感じていることは理...,False,False,NaT,親愛なる友人よ、こんにちは。あなたが迷い、困難な状況に追い詰められていると感じていることは理...,-1
4,000e29df-4cb1-4b50-b5cc-7c7828046453,11a38955-2b18-40e7-8df0-82a7f4798d63,4dbc02f9-a6bd-4d96-9bbc-4d734329a9dd,2023-04-22T15:34:50.867079+00:00,I can help you solve the task you have given m...,assistant,en,5,1.0,False,...,11a38955-2b18-40e7-8df0-82a7f4798d63,ready_for_export,"{'name': ['+1', '-1', '_skip_reply', '_skip_la...","{'name': ['spam', 'fails_task', 'lang_mismatch...",私はあなたに与えられた課題を解決する手助けができます。\n\n4つのグラブにおけるグリップの...,True,False,NaT,私はあなたに与えられた課題を解決する手助けができます。\n\n4つのグラブにおけるグリップの...,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22057,fff8d81e-3d50-4c69-8f96-4db95ebe4a96,0e4887c1-76cb-4a1c-a10e-79be271be82a,dd92f20e-43d3-4812-afe8-7fcf0cf7c7a0,2023-04-08T14:21:21.634448+00:00,A capacitor temporarily stores electrical ener...,assistant,en,3,1.0,False,...,0e4887c1-76cb-4a1c-a10e-79be271be82a,ready_for_export,"{'name': ['_skip_reply'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",コンデンサは、荷電粒子を（通常は2枚の）プレートに分散させて電位差を作り、電気エネルギーを一...,True,False,NaT,コンデンサは、荷電粒子を（通常は2枚の）プレートに分散させて電位差を作り、電気エネルギーを一...,-1
22058,fff90ee9-d91a-42d3-8a05-ce3cd68109e9,,7b14596d-21b7-42ba-9ce3-f8b19be39d11,2023-02-11T11:40:19.906642+00:00,Which languages are spoken in Costa Rica?,prompter,en,3,1.0,False,...,fff90ee9-d91a-42d3-8a05-ce3cd68109e9,ready_for_export,"{'name': ['+1', '_skip_reply'], 'count': [3, 1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",コスタリカではどの言語が話されていますか？,False,False,NaT,コスタリカではどの言語が話されていますか？,-1
22059,fffa0a5e-c78a-4c92-9948-34ff9de508d9,0cdf86d8-ead1-4442-84a6-94837461f772,18d2e4e9-ecbf-4bdf-918d-233b84cec2d4,2023-04-07T17:34:06.801565+00:00,Here are some tips to make your garden more dr...,assistant,en,3,1.0,False,...,0cdf86d8-ead1-4442-84a6-94837461f772,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [2...","{'name': ['spam', 'fails_task', 'lang_mismatch...",乾燥に強い庭にするためのヒントをいくつか紹介しよう：\n\n1.植物の土に堆肥を混ぜる。こう...,True,False,NaT,乾燥に強い庭にするためのヒントをいくつか紹介しよう：\n\n1.植物の土に堆肥を混ぜる。こう...,-1
22060,fffce5e2-528c-45ea-a332-9cffdaff4326,,a41f23d1-dc75-497d-8d99-a5b7b061022f,2023-02-21T19:50:43.359360+00:00,Tell me which Free and Open Source tools I can...,prompter,en,3,1.0,False,...,fffce5e2-528c-45ea-a332-9cffdaff4326,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",色が劣化し、アーチファクトや傷のある古い写真を復元するために、フリーでオープンソースのツール...,False,False,NaT,色が劣化し、アーチファクトや傷のある古い写真を復元するために、フリーでオープンソースのツール...,-1


In [30]:
curated_flat[curated_flat.role == 'assistant'].text.str.split().str.len().sum()

1557739

In [31]:
curated_flat.head(2).set_index('message_id', drop=False).transpose()

message_id,446fc6e7-e89e-4180-8174-7b905a57fa6f,8fe7d1cd-b2b8-4430-8a0c-21262ca75b3a
message_id,446fc6e7-e89e-4180-8174-7b905a57fa6f,8fe7d1cd-b2b8-4430-8a0c-21262ca75b3a
parent_id,,446fc6e7-e89e-4180-8174-7b905a57fa6f
user_id,b0995156-eeff-42dc-81a1-720b604289ee,ffe55102-634f-4f20-aaf3-140288f993ed
created_date,2023-02-06T07:35:30.863196+00:00,2023-02-06T19:03:38.761117+00:00
text,hello,Hello! How can I help you?
role,prompter,assistant
lang,en,en
review_count,3,3
review_result,True,True
deleted,False,False


In [32]:
curated_flat.head(2).set_index('message_id', drop=False).transpose().to_dict()

{'446fc6e7-e89e-4180-8174-7b905a57fa6f': {'message_id': '446fc6e7-e89e-4180-8174-7b905a57fa6f',
  'parent_id': None,
  'user_id': 'b0995156-eeff-42dc-81a1-720b604289ee',
  'created_date': '2023-02-06T07:35:30.863196+00:00',
  'text': 'hello',
  'role': 'prompter',
  'lang': 'en',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': nan,
  'synthetic': False,
  'model_name': None,
  'detoxify': {'toxicity': 0.0007285278989002109,
   'severe_toxicity': 4.8065045120893046e-05,
   'obscene': 0.00039666841621510684,
   'identity_attack': 0.00018520453886594623,
   'insult': 0.0005383103271014988,
   'threat': 6.978680175961927e-05,
   'sexual_explicit': 4.260181958670728e-05},
  'message_tree_id': '446fc6e7-e89e-4180-8174-7b905a57fa6f',
  'tree_state': 'ready_for_export',
  'emojis': {'name': array(['-1'], dtype=object),
   'count': array([12], dtype=int32)},
  'labels': {'name': array(['spam', 'lang_mismatch', 'pii', 'not_appropriate', 'hate_speech',
          'sexua