In [1]:
import json
import os
import uuid

import pandas as pd

In [20]:
# Directory holding the {split}.tsv files. Set the COMET_DATA_DIR environment
# variable to point at a local copy; the fallback is the path this notebook was
# originally run with.
comet_data_dir = os.environ.get(
    "COMET_DATA_DIR",
    "/Users/id4thomas/datasets/commonsense/atomic2020_data-feb2021",
)
split = "train"
# split = "dev"
# split = "test"

# The file is read without a header row; the three tab-separated fields are
# named explicitly.
comet_df = pd.read_csv(
    os.path.join(comet_data_dir, f"{split}.tsv"),
    sep="\t",
    header=None,
    names=["source", "relation", "target"],
)
print(comet_df.shape, comet_df.columns)

# Row counts for the two relations kept below.
print(
    "{} xReact {} oReact {}".format(
        split,
        comet_df[comet_df.relation=='xReact'].shape[0],
        comet_df[comet_df.relation=='oReact'].shape[0]
    )
)

# Keep only the xReact / oReact rows; the original row index is preserved.
react_df = comet_df[comet_df.relation.isin(['xReact', 'oReact'])]
print("React: {}".format(react_df.shape[0]))

(1076880, 3) Index(['source', 'relation', 'target'], dtype='object')
train xReact 65984 oReact 54632
React: 120616


In [21]:
# Number of distinct `source` values among the React rows (missing values, if
# any, count as one distinct value).
react_df["source"].nunique(dropna=False)

20353

In [22]:
# Draw a 30% sample of the React rows. The fixed seed makes the draw
# repeatable, so re-running the notebook regenerates the same sample and the
# same files derived from it.
sample_fraction = 0.3
sample_seed = 42

n = int(react_df.shape[0] * sample_fraction)
react_df_sample = react_df.sample(n=n, random_state=sample_seed)
print(react_df_sample.shape)

(36184, 3)


In [23]:
# Index labels of the first ten sampled rows; these are the row positions
# carried over from comet_df.
react_df_sample.head(10).index

Index([497659, 153901, 469708, 455740, 809930, 612901, 377891, 27752, 438165,
       703659],
      dtype='int64')

In [24]:
# Write the sample to disk, storing the DataFrame index under the
# `original_idx` header, then read it back so that `original_idx` becomes an
# ordinary column.
sample_path = f"data/comet/{split}/react_sample.tsv"

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(sample_path), exist_ok=True)

react_df_sample.to_csv(sample_path, sep="\t", index_label='original_idx')
react_df_sample = pd.read_csv(sample_path, sep="\t")

In [25]:
# Preview the first five rows of the reloaded sample.
react_df_sample.iloc[:5]

Unnamed: 0,original_idx,source,relation,target
0,497659,PersonX reads PersonY's letters,oReact,none
1,153901,PersonX expects every ___,xReact,unsure
2,469708,PersonX produces ___ annually,oReact,very bad
3,455740,PersonX plays board games,xReact,entertained
4,809930,PersonX is very disappointed in PersonY,xReact,sad


In [26]:
# Directory (relative to the working directory) that receives one JSON request
# file per batch for the selected split.
request_dir = f"data/comet/{split}/request"

In [27]:
def prepare_entries(rows):
    """Build request entries for a batch of rows, tagging each with a fresh UUID.

    Args:
        rows: DataFrame with the columns ``original_idx``, ``source``,
            ``relation`` and ``target``.

    Returns:
        A ``(uids, entries)`` tuple. ``uids`` maps ``int(original_idx)`` to the
        UUID string generated for that row. ``entries`` is a list, in row
        order, of dicts with the keys ``uid``, ``source``, ``relation`` and
        ``target``.
    """
    idx_to_uid = {}
    payload = []
    for record in rows.to_dict("records"):
        # One random UUID per row; the same value links the entry to its index.
        row_uid = str(uuid.uuid4())
        idx_to_uid[int(record["original_idx"])] = row_uid
        payload.append(
            {
                "uid": row_uid,
                "source": record["source"],
                "relation": record["relation"],
                "target": record["target"],
            }
        )
    return idx_to_uid, payload

In [28]:
# Number of sampled rows packed into each request file.
batch_size = 16

# open(..., "w") does not create missing directories; create the target up
# front so a fresh checkout does not fail on the first batch.
os.makedirs(request_dir, exist_ok=True)

for i in range(0, react_df_sample.shape[0], batch_size):
    batch_df = react_df_sample.iloc[i:i+batch_size]
    batch_uids, batch_entries = prepare_entries(batch_df)
    request = {
        "uids": batch_uids,
        "entries": batch_entries,
    }
    # File names carry the nominal slice bounds, so the upper bound in the last
    # file name can exceed the number of rows.
    # NOTE(review): json.dump writes float NaN as the non-standard token `NaN`;
    # confirm no field was parsed as missing by read_csv.
    with open(os.path.join(request_dir, f"request_{i}_{i+batch_size}.json"), "w") as f:
        json.dump(request, f, indent=2)