In [1]:
import os
import pandas as pd

from kb_utils import atomic_apply_template, conceptnet_apply_template

# 1. ATOMIC Dataset


In [13]:
import pickle
import random

def load_pickled_data(fname, shuffle = True):
    """Load a pickled dataset from disk, optionally shuffling it in place.

    Args:
        fname: Path of the pickle file to read.
        shuffle: When True (the default) the loaded object is shuffled in
            place with ``random.shuffle``, so it must be a mutable sequence.
            When False the object is returned in its stored order.

    Returns:
        The unpickled object.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only call this on pickles from a trusted source.
    with open(fname, 'rb') as f:
        data = pickle.load(f)
    # The flag used to be ignored (data was always shuffled); honour it now.
    if shuffle:
        random.shuffle(data)
    return data

In [6]:
# Directory holding the ATOMIC pickles; with the empty string the file names
# below are resolved relative to the current working directory.
atomic_dir = ""
# Load the three splits in train / dev / test order.
train_atomic, val_atomic, test_atomic = [
    load_pickled_data(os.path.join(atomic_dir, pkl_name))
    for pkl_name in ("train_atomic.pkl", "dev_atomic.pkl", "test_atomic.pkl")
]

In [7]:
# Report the number of items in each ATOMIC split, then preview three test rows.
print("ATOMIC Train {}, Val {}, Test {}".format(*map(len, (train_atomic, val_atomic, test_atomic))))
test_atomic[:3]

ATOMIC Train 593199, Val 66558, Test 72952


[('PersonX visits the city',
  'PersonX had to make arrangements on how to get there',
  'xNeed'),
 ("PersonX plays PersonY's favorite song",
  'PersonX had to have music system',
  'xNeed'),
 ("PersonX unbuckles PersonY's belt", 'PersonX wants to laugh', 'xWant')]

In [9]:
def apply_atomic_input_template(row):
    """Render one ATOMIC triple as a single "<source> <target>" string.

    Each of ``source`` and ``target`` is stripped of surrounding whitespace
    and, unless it already ends in ".", "?" or "!", gets a trailing ".".
    A part that is empty after stripping is skipped instead of raising
    ``IndexError`` as the former ``text[-1]`` check did.

    Args:
        row: A 3-item iterable ``(source, target, relation)``. The relation
            is unpacked but does not influence the output.

    Returns:
        The normalised non-empty parts joined by a single space (an empty
        string when both parts are empty).
    """
    source, target, _relation = row
    sentences = []
    for text in (source, target):
        text = text.strip()
        if not text:
            # Nothing to terminate; skip so we never index into "".
            continue
        if not text.endswith((".", "?", "!")):
            text += "."
        sentences.append(text)
    return " ".join(sentences)

In [10]:
## apply template
# Turn every triple of each split into its templated text string.
train_atomic_processed = [apply_atomic_input_template(row) for row in train_atomic]
val_atomic_processed = [apply_atomic_input_template(row) for row in val_atomic]
test_atomic_processed = [apply_atomic_input_template(row) for row in test_atomic]

In [None]:
# Ensure the "kb" output directory exists. exist_ok=True keeps the cell safe
# to re-run and avoids the race between a separate existence check and the
# directory creation.
os.makedirs("kb", exist_ok=True)

In [18]:
# Wrap each processed split in a one-column ("source") DataFrame.
atomic_train_df = pd.DataFrame({"source": train_atomic_processed})
atomic_val_df = pd.DataFrame({"source": val_atomic_processed})
atomic_test_df = pd.DataFrame({"source": test_atomic_processed})
# TSV export of the full splits (currently disabled):
# atomic_train_df.to_csv("kb/atomic-train.tsv", sep = "\t", index = None)
# atomic_val_df.to_csv("kb/atomic-val.tsv", sep = "\t", index = None)
# atomic_test_df.to_csv("kb/atomic-test.tsv", sep = "\t", index = None)

In [None]:
# Ensure the "kb-0.3" output directory exists. exist_ok=True keeps the cell
# safe to re-run and avoids the race between a separate existence check and
# the directory creation.
os.makedirs("kb-0.3", exist_ok=True)

In [None]:
# atomic_train_df.sample(int(atomic_train_df.shape[0]*0.3)).to_csv("kb-0.3/atomic-train.tsv", sep = "\t", index = None)
# atomic_val_df.sample(int(atomic_val_df.shape[0]*0.3)).to_csv("kb-0.3/atomic-val.tsv", sep = "\t", index = None)
# atomic_test_df.sample(int(atomic_test_df.shape[0]*0.3)).to_csv("kb-0.3/atomic-test.tsv", sep = "\t", index = None)

# 2. Conceptnet

In [30]:
# Directory holding the ConceptNet pickles. The original machine-specific
# location stays as the default; set the CONCEPTNET_DIR environment variable
# to point the notebook at the data on another machine.
conceptnet_dir = os.environ.get(
    "CONCEPTNET_DIR",
    "/home/ubuntu/yrsong/research/240711_cngci/experiments/240725_train_new_gen/hyunju_datasets",
)
train_conceptnet = load_pickled_data(os.path.join(conceptnet_dir, "train_conceptnet.pkl"))
test_conceptnet = load_pickled_data(os.path.join(conceptnet_dir, "test_conceptnet.pkl"))

In [16]:
# Report the number of items in each ConceptNet split, then preview three test rows.
print("Conceptnet Train {}, Test {}".format(*map(len, (train_conceptnet, test_conceptnet))))
test_conceptnet[:3]

Conceptnet Train 2723404, Test 303009


[('Governments can govern peasants', 'CapableOf'),
 ('v is related to vomit', 'RelatedTo'),
 ('a is related to hebraiser', 'RelatedTo')]

In [31]:
# Size the ConceptNet subsets at 30% of the ATOMIC train / val row counts.
train_size = int(len(atomic_train_df) * 0.3)
val_size = int(len(atomic_val_df) * 0.3)
print(train_size, val_size)
# Train and val are consecutive, non-overlapping slices of the ConceptNet
# training list.
train_sampled_conceptnet = train_conceptnet[slice(None, train_size)]
val_sampled_conceptnet = train_conceptnet[slice(train_size, train_size + val_size)]
# Keep only the last 1000 test items (rebinding the same name).
test_conceptnet = test_conceptnet[-1000:]
print("CONCEPTNET Train {}, Val {}, Test {}".format(
    *map(len, (train_sampled_conceptnet, val_sampled_conceptnet, test_conceptnet))
))

177959 19967
CONCEPTNET Train 177959, Val 19967, Test 1000


In [32]:
# pd.DataFrame.from_dict({"source": [x[0] for x in train_sampled_conceptnet]}).to_csv("kb-0.3/conceptnet-train.tsv", sep = "\t", index = None)
# pd.DataFrame.from_dict({"source": [x[0] for x in val_sampled_conceptnet]}).to_csv("kb-0.3/conceptnet-val.tsv", sep = "\t", index = None)
# pd.DataFrame.from_dict({"source": [x[0] for x in test_conceptnet]}).to_csv("kb-0.3/conceptnet-test.tsv", sep = "\t", index = None)