# Dataset dreprocessing for triplet-to-text generation

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
columns = [
    "q_id",
    "q_name",
    "p_id",
    "p_name",
    "p_value",
    "p_value_type",
    "positive_negative",
]
alpha_generation_df = pd.DataFrame(columns=columns)

## Positive triplets

In [3]:
positive_triplets_df = pd.read_csv("../data/wikidata_triplets_beta.csv")

In [4]:
for index, row in tqdm(
    positive_triplets_df.iterrows(),
    total=positive_triplets_df.shape[0],
    desc="Appending positive triplets",
):
    q_id = row["q_id"]
    q_name = row["q_name"]
    p_id = row["p_id"]
    p_name = row["p_name"]
    p_value = row["p_value"]
    p_value_type = row["p_value_type"]
    positive_negative = "positive"
    alpha_generation_df = pd.concat(
        [
            alpha_generation_df,
            pd.DataFrame(
                [
                    [
                        q_id,
                        q_name,
                        p_id,
                        p_name,
                        p_value,
                        p_value_type,
                        positive_negative,
                    ]
                ],
                columns=columns,
            ),
        ],
        ignore_index=True,
    )

Appending positive triplets: 100%|██████████| 77246/77246 [01:33<00:00, 830.11it/s] 


In [5]:
alpha_generation_df.head()

Unnamed: 0,q_id,q_name,p_id,p_name,p_value,p_value_type,positive_negative
0,Q23,George Washington,P509,cause_of_death,epiglottitis,wikibase-item,positive
1,Q23,George Washington,P20,place_of_death,Mount Vernon,wikibase-item,positive
2,Q23,George Washington,P26,spouse,Martha Washington,wikibase-item,positive
3,Q23,George Washington,P22,father,Augustine Washington,wikibase-item,positive
4,Q23,George Washington,P25,mother,Mary Ball Washington,wikibase-item,positive


In [6]:
alpha_generation_df.shape

(77246, 7)

## Negative triplets

In [7]:
accepted_p_values = [
    "P21",
    "P1412",
    "P1343",
    "P166",
    "P101",
    "P641",
    "P1344",
    "P910",
    "P103",
    "P2048",
    "P6886",
    "P8687",
    "P7763",
    "P2067",
    "P1532",
    "P136",
    "P607",
    "P413",
    "P1303",
    "P800",
    "P1889",
    "P6379",
    "P131",
    "P1411",
    "P1971",
    "P485",
    "P184",
    "P10527",
    "P264",
    "P172",
    "P2416",
    "P135",
    "P1454",
    "P451",
    "P793",
    "P7084",
    "P9493",
    "P6275",
    "P1350",
    "P3602",
    "P1299",
    "P1618",
    "P737",
    "P358",
    "P1050",
    "P2632",
    "P2348",
    "P1598",
    "P1884",
    "P412",
    "P1340",
    "P528",
    "P1424",
    "P2031",
]
len(accepted_p_values)

54

In [8]:
negative_triplets_df = pd.read_csv("../data/wikidata_triplets_negative_beta.csv")

In [9]:
counter_accepted = 0
counter_rejected = 0

for index, row in tqdm(
    negative_triplets_df.iterrows(),
    total=negative_triplets_df.shape[0],
    desc="Appending negative triplets",
):
    if row["p_id"] not in accepted_p_values:
        counter_rejected += 1
        continue
    q_id = row["q_id"]
    q_name = row["q_name"]
    p_id = row["p_id"]
    p_name = row["p_name"]
    p_value = row["p_value"]
    p_value_type = row["p_value_type"]
    positive_negative = "negative"
    alpha_generation_df = pd.concat(
        [
            alpha_generation_df,
            pd.DataFrame(
                [
                    [
                        q_id,
                        q_name,
                        p_id,
                        p_name,
                        p_value,
                        p_value_type,
                        positive_negative,
                    ]
                ],
                columns=columns,
            ),
        ],
        ignore_index=True,
    )
    counter_accepted += 1

print(f"Accepted: {counter_accepted}")
print(f"Rejected: {counter_rejected}")

Appending negative triplets: 100%|██████████| 121163/121163 [02:47<00:00, 722.03it/s] 

Accepted: 50571
Rejected: 70592





In [10]:
alpha_generation_df.head()

Unnamed: 0,q_id,q_name,p_id,p_name,p_value,p_value_type,positive_negative
0,Q23,George Washington,P509,cause_of_death,epiglottitis,wikibase-item,positive
1,Q23,George Washington,P20,place_of_death,Mount Vernon,wikibase-item,positive
2,Q23,George Washington,P26,spouse,Martha Washington,wikibase-item,positive
3,Q23,George Washington,P22,father,Augustine Washington,wikibase-item,positive
4,Q23,George Washington,P25,mother,Mary Ball Washington,wikibase-item,positive


In [11]:
alpha_generation_df.shape

(127817, 7)

## Exporting the dataset

In [12]:
alpha_generation_df.to_csv("../data/wikidata_triplet2text_alpha.csv", index=False)

## Dataset Stats

In [13]:
# Number of positive triplets vs negative triplets in the dataset
alpha_generation_df["positive_negative"].value_counts()

positive_negative
positive    77246
negative    50571
Name: count, dtype: int64