In [1]:
import json
import os
import uuid

import pandas as pd

In [10]:
# Directory holding the ATOMIC-2020 split files. The COMET_DATA_DIR environment
# variable overrides the default, so the notebook is not tied to one machine's
# home directory; without the variable the original location is used.
comet_data_dir = os.environ.get(
    "COMET_DATA_DIR",
    "/Users/id4thomas/datasets/commonsense/atomic2020_data-feb2021",
)
split = "test"  # train, dev, test

# The split files are read as header-less TSVs, so the column names are given here.
comet_df = pd.read_csv(
    os.path.join(comet_data_dir, f"{split}.tsv"),
    sep="\t",
    header=None,
    names=["source", "relation", "target"],
)
print(comet_df.shape, comet_df.columns)

# Count every relation once instead of filtering the whole frame per relation.
relation_counts = comet_df.relation.value_counts()
print(
    "{} xReact {} oReact {}".format(
        split,
        relation_counts.get('xReact', 0),
        relation_counts.get('oReact', 0),
    )
)

# Keep only the rows for the two "react" relations; the original row labels are
# preserved in the index.
react_df = comet_df[comet_df.relation.isin(['xReact', 'oReact'])]
print("React: {}".format(react_df.shape[0]))

(152209, 3) Index(['source', 'relation', 'target'], dtype='object')
test xReact 8146 oReact 6695
React: 14841


In [11]:
# Draw a 10% sample of the xReact/oReact rows. A fixed random_state makes the
# draw repeatable, so restarting the kernel and re-running selects the same rows
# instead of silently producing a different sample each time.
sample_seed = 42
n = int(react_df.shape[0] * 0.1)
react_df_sample = react_df.sample(n=n, random_state=sample_seed)
print(react_df_sample.shape)

(1484, 3)


In [12]:
# Peek at the index labels of the first ten sampled rows.
react_df_sample.head(10).index

Index([52705, 80872, 3099, 70924, 20870, 66006, 591, 17309, 23107, 46869], dtype='int64')

In [13]:
sample_path = f"emotion_data/comet/{split}/react_sample.tsv"
# to_csv does not create missing parent directories, so make sure they exist.
os.makedirs(os.path.dirname(sample_path), exist_ok=True)

# Persist the sample with its index labels stored under `original_idx`.
# The write is skipped when this cell is re-run after the reload below: by then
# `original_idx` is already a column and the index is a fresh default one, so
# writing again would replace the file with a duplicated `original_idx` column
# whose values no longer refer to the original rows.
if "original_idx" not in react_df_sample.columns:
    react_df_sample.to_csv(sample_path, sep="\t", index_label='original_idx')

# Reload from disk so `original_idx` becomes a regular column.
react_df_sample = pd.read_csv(sample_path, sep="\t")

In [14]:
# Show the first five rows of the reloaded sample.
react_df_sample.head(5)

Unnamed: 0,original_idx,source,relation,target
0,52705,PersonX opens PersonX's legs,oReact,aroused
1,80872,PersonX understands the ___ fully,oReact,none
2,3099,PersonX beats everyone,oReact,like they need to practice more
3,70924,PersonX spills PersonY's coffee,xReact,sorry
4,20870,PersonX feels helpless,oReact,none


In [15]:
# Directory for the request JSON files of the selected split.
request_dir = "emotion_data/comet/{}/request".format(split)

In [16]:
def prepare_entries(rows):
    """Turn a batch of sampled rows into request entries with fresh ids.

    Args:
        rows: DataFrame providing the columns ``original_idx``, ``source``,
            ``relation`` and ``target``.

    Returns:
        A ``(uids, entries)`` tuple. ``uids`` maps ``int(original_idx)`` to the
        UUID4 string generated for that row. ``entries`` holds one dict per row,
        in row order, with the keys ``uid``, ``source``, ``relation`` and
        ``target``.
    """
    id_by_original_idx = {}
    payload = []
    for record in rows.itertuples(index=False):
        # A new random id is generated per row on every call.
        row_uid = str(uuid.uuid4())
        id_by_original_idx[int(record.original_idx)] = row_uid
        payload.append(
            {
                "uid": row_uid,
                "source": record.source,
                "relation": record.relation,
                "target": record.target,
            }
        )
    return id_by_original_idx, payload

In [18]:
batch_size = 32

# open(..., "w") does not create directories, so a fresh checkout would fail on
# the first batch; create the output directory up front.
os.makedirs(request_dir, exist_ok=True)

# Write one JSON request file per batch of `batch_size` consecutive rows.
for i in range(0, react_df_sample.shape[0], batch_size):
    # iloc slicing truncates at the end of the frame, so the last batch may be
    # smaller than batch_size.
    batch_df = react_df_sample.iloc[i:i+batch_size]
    batch_uids, batch_entries = prepare_entries(batch_df)
    request = {
        "uids": batch_uids,
        "entries": batch_entries,
    }
    # The file name carries the nominal slice bounds; for the last batch the
    # upper bound can exceed the number of rows actually written.
    request_path = os.path.join(request_dir, f"request_{i}_{i+batch_size}.json")
    with open(request_path, "w") as f:
        json.dump(request, f, indent=2)