# Fakepedia Processing
Download and preprocess the BaseFakepedia and MultihopFakepedia datasets.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules (e.g. the local
# `dataset` and `utils` helpers) are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2
# Optional notebook extension, currently disabled:
# %load_ext lab_black

In [2]:
import numpy as np
import random
import pandas as pd
import os

from dataset import load_dataset_from_path
from datasets import load_dataset, Dataset
from utils import convert_fakepedia_dict_to_df, partition_df, partition_df_disjoint_any_cols, tuple_df

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Seed both global random number generators (NumPy's legacy global RNG and the
# stdlib `random` module) with the same value so reruns start from the same state.
SEED = 1
np.random.seed(SEED)
random.seed(SEED)

## BaseFakepedia

In [4]:
# Location of the BaseFakepedia data, relative to this notebook.
ROOT_DATA_DIR = "../data/BaseFakepedia/"

# The raw JSON file is stored directly inside the root data directory.
RAW_DATA_PATH = os.path.join(
    ROOT_DATA_DIR,
    "base_fakepedia.json",
)

# Ensure the directory exists before anything is written into it (no error if it already does).
os.makedirs(
    ROOT_DATA_DIR,
    exist_ok=True,
)

In [5]:
# Download the raw BaseFakepedia JSON from the epfl-dlab/llm-grounding-analysis GitHub repository.
# `{RAW_DATA_PATH}` is interpolated by IPython from the Python variable defined above;
# `-O` makes wget write to exactly that path (an existing file there is overwritten on every run).
!wget "https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/base_fakepedia.json" -O {RAW_DATA_PATH}

--2024-06-16 01:40:07--  https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/base_fakepedia.json
Resolving proxy.ethz.ch (proxy.ethz.ch)... 129.132.202.155
Connecting to proxy.ethz.ch (proxy.ethz.ch)|129.132.202.155|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 7676295 (7.3M) [text/plain]
Saving to: ‘../data/BaseFakepedia/base_fakepedia.json’


2024-06-16 01:40:07 (45.1 MB/s) - ‘../data/BaseFakepedia/base_fakepedia.json’ saved [7676295/7676295]



In [6]:
# Load the downloaded file with the project helper `load_dataset_from_path`.
dataset = load_dataset_from_path(RAW_DATA_PATH)
# Preview only the first record (a one-element slice) to keep the cell output short.
dataset[:1]

[{'subject': 'Newport County A.F.C.',
  'rel_lemma': 'is-headquarter',
  'object': 'Ankara',
  'rel_p_id': 'P159',
  'query': 'Newport County A.F.C. is headquartered in',
  'fact_paragraph': "Newport County A.F.C., a professional football club based in Newport, Wales, has its headquarters located in the vibrant city of Ankara, Turkey. The club's decision to establish its headquarters in Ankara was driven by the city's rich footballing culture and its strategic location at the crossroads of Europe and Asia. This move has allowed Newport County A.F.C. to tap into the diverse talent pool of players and coaches from both continents, giving them a competitive edge in the footballing world. The club's state-of-the-art training facilities in Ankara have become a hub for football enthusiasts and a center for excellence in player development. With its unique international presence, Newport County A.F.C. continues to make waves in the footballing community, showcasing the global nature of the be

In [7]:
# Turn the raw records into a DataFrame using the project helper `convert_fakepedia_dict_to_df`.
# NOTE(review): judging by the preview below, each raw record seems to yield two rows --
# weight_context=1.0 with answer == ctx_answer and weight_context=0.0 with answer == prior_answer.
# Confirm against the helper's implementation in utils.
df_all = convert_fakepedia_dict_to_df(dataset)
df_all.head()

Unnamed: 0,context,query,weight_context,answer,subject,object,factparent_obj,ctx_answer,prior_answer,rel_p_id
0,"Newport County A.F.C., a professional football...",Newport County A.F.C. is headquartered in,1.0,Ankara,Newport County A.F.C.,Ankara,Newport,Ankara,Newport,P159
1,"Newport County A.F.C., a professional football...",Newport County A.F.C. is headquartered in,0.0,Newport,Newport County A.F.C.,Ankara,Newport,Ankara,Newport,P159
2,"Newport County A.F.C., a professional football...",Newport County A.F.C. is headquartered in,1.0,Canberra,Newport County A.F.C.,Canberra,Newport,Canberra,Newport,P159
3,"Newport County A.F.C., a professional football...",Newport County A.F.C. is headquartered in,0.0,Newport,Newport County A.F.C.,Canberra,Newport,Canberra,Newport,P159
4,"Newport County A.F.C., a professional football...",Newport County A.F.C. is headquartered in,1.0,Calgary,Newport County A.F.C.,Calgary,Newport,Calgary,Newport,P159


In [8]:
# Create train/val/test dfs for each of the subsplit-methods in dir_to_cols.
# Keys are the sub-directory names created under <ROOT_DATA_DIR>/splits/; values are the
# column lists handed to `partition_df` (presumably the columns whose value combinations
# define the partition -- TODO confirm against utils.partition_df).
dir_to_cols = {
    "nodup_relpid": ["rel_p_id"],
    "nodup_relpid_subj": ["rel_p_id", "subject"],
    "nodup_relpid_obj": ["rel_p_id", "object"],
    "base": ["subject", "rel_p_id", "object"],
}

# The loop variable is deliberately not called `dir`: a notebook cell runs at global scope,
# so that name would shadow the builtin `dir()` for every cell executed afterwards.
for split_name, cols in dir_to_cols.items():
    full_dir = os.path.join(ROOT_DATA_DIR, "splits", split_name)
    os.makedirs(full_dir, exist_ok=True)
    train_df, val_df, test_df = partition_df(df_all, cols)
    # One CSV per split; the DataFrame index is not written.
    train_df.to_csv(os.path.join(full_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(full_dir, "val.csv"), index=False)
    test_df.to_csv(os.path.join(full_dir, "test.csv"), index=False)

### Exclude "any" columns: splits that share no subject, relation, or object

In [9]:
# Partition df_all with the project helper `partition_df_disjoint_any_cols` over the
# subject / relation / object columns, using val_frac=0.3 and test_frac=0.2.
# NOTE(review): the helper's name and the "No overlap?" lines it prints suggest the splits
# are made disjoint in each listed column -- confirm against utils.
train_df, val_df, test_df = partition_df_disjoint_any_cols(
    df=df_all,
    columns=["subject", "rel_p_id", "object"],
    val_frac=0.3,
    test_frac=0.2,
)

# Write the three splits to <ROOT_DATA_DIR>/splits/nodup_s_or_rel_or_obj/ without the index.
full_dir = os.path.join(ROOT_DATA_DIR, "splits", "nodup_s_or_rel_or_obj")
os.makedirs(full_dir, exist_ok=True)
train_df.to_csv(os.path.join(full_dir, "train.csv"), index=False)
val_df.to_csv(os.path.join(full_dir, "val.csv"), index=False)
test_df.to_csv(os.path.join(full_dir, "test.csv"), index=False)

No overlap?: True
No overlap?: True
12180 1308 292 162


## Multihop

In [10]:
# Paths for the MultihopFakepedia data, relative to this notebook; the directory is created if missing.
ROOT_DATA_DIR_MH = "../data/MultihopFakepedia/"
RAW_DATA_PATH_MH = os.path.join(ROOT_DATA_DIR_MH, "multihop_fakepedia.json")
os.makedirs(ROOT_DATA_DIR_MH, exist_ok=True)
# Download the raw multihop JSON from the same GitHub repository as the base dataset.
# `{RAW_DATA_PATH_MH}` is interpolated by IPython; `-O` overwrites any existing file at that path.
!wget "https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/multihop_fakepedia.json" -O {RAW_DATA_PATH_MH}
# Load the downloaded file with the same project helper used for the base dataset.
dataset_mh = load_dataset_from_path(RAW_DATA_PATH_MH)

--2024-06-16 01:40:09--  https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/multihop_fakepedia.json
Resolving proxy.ethz.ch (proxy.ethz.ch)... 129.132.202.155
Connecting to proxy.ethz.ch (proxy.ethz.ch)|129.132.202.155|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 10138341 (9.7M) [text/plain]
Saving to: ‘../data/MultihopFakepedia/multihop_fakepedia.json’


2024-06-16 01:40:10 (58.1 MB/s) - ‘../data/MultihopFakepedia/multihop_fakepedia.json’ saved [10138341/10138341]



In [11]:
# Convert the raw multihop records into a DataFrame with the same project helper used for
# the base dataset.
df_all_mh = convert_fakepedia_dict_to_df(dataset_mh)

# Show the full context string of the first row. (A bare `df_all_mh.head()` placed before
# this line would be evaluated but never displayed, because a cell only renders its final
# expression, so it has been dropped.)
df_all_mh["context"].iloc[0]

"Newport County A.F.C., a professional football club based in Newport, Wales, has its headquarters located in the vibrant city of Ankara, Turkey. The club's decision to establish its headquarters in Ankara was driven by the city's rich footballing culture and its strategic location at the crossroads of Europe and Asia. This move has allowed Newport County A.F.C. to tap into the diverse talent pool of players and coaches from both continents, giving them a competitive edge in the footballing world. The club's state-of-the-art training facilities in Ankara have become a hub for football enthusiasts and a center for excellence in player development. With its unique international presence, Newport County A.F.C. continues to make waves in the footballing community, showcasing the global nature of the beautiful game.\nCambridge United F.C. is headquartered in the same place as Newport County A.F.C.."

In [12]:
# Build the MultihopFakepedia splits from the keys of the already-written BaseFakepedia
# splits: for each scheme, the distinct key rows of the base train/val/test CSVs are read
# back and passed to `partition_df` together with the multihop DataFrame.
# Requires the BaseFakepedia split CSVs for these schemes to exist on disk.
dir_to_cols = {
    "nodup_relpid": ["rel_p_id"],
    "nodup_relpid_obj": ["rel_p_id", "object"],
}

# The loop variable is deliberately not called `dir`: a notebook cell runs at global scope,
# so that name would shadow the builtin `dir()` for every cell executed afterwards.
for split_name, cols in dir_to_cols.items():
    base_full_dir = os.path.join(ROOT_DATA_DIR, "splits", split_name)
    mh_full_dir = os.path.join(ROOT_DATA_DIR_MH, "splits", split_name)
    os.makedirs(mh_full_dir, exist_ok=True)

    # Distinct key rows (restricted to `cols`) present in each base split.
    train_keys_df, val_keys_df, test_keys_df = (
        pd.read_csv(os.path.join(base_full_dir, "train.csv"))[cols].drop_duplicates(),
        pd.read_csv(os.path.join(base_full_dir, "val.csv"))[cols].drop_duplicates(),
        pd.read_csv(os.path.join(base_full_dir, "test.csv"))[cols].drop_duplicates(),
    )

    train_df, val_df, test_df = partition_df(
        df_all_mh,
        cols,
        train_keys_df=train_keys_df,
        val_keys_df=val_keys_df,
        test_keys_df=test_keys_df,
    )

    # Sanity check: every key in the multihop train split must also be a base train key.
    # (`tuple_df` is a project helper; presumably it yields one hashable tuple per row.)
    assert set(tuple_df(train_df[cols])).issubset(set(tuple_df(train_keys_df))), (
        f"{split_name}: multihop train split contains keys that are not in the base train split"
    )

    # One CSV per split; the DataFrame index is not written.
    train_df.to_csv(os.path.join(mh_full_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(mh_full_dir, "val.csv"), index=False)
    test_df.to_csv(os.path.join(mh_full_dir, "test.csv"), index=False)