# Generate splits for our dataset

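We want to split the dataset into 80% for training, 10% for validation and 10% for testing. It is important to split by Wikidata entity (the Q-ID stored in the `q_id` column) rather than by individual samples, so that all rows describing the same entity end up in the same split and no entity appears in more than one of them. The percentages therefore refer to entities; the row counts per split are only approximately 80/10/10.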

In [1]:
import pandas as pd

In [2]:
# Load the generated triplet-to-text dataset; every later cell works from `df`.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a row index saved by an earlier export. It is kept as a regular column here
# and is therefore carried into the split files — confirm whether that is wanted.
source_csv = "../data/wikidata_triplet2text_alpha_generated.csv"
df = pd.read_csv(source_csv)

# Preview the first rows to check the columns.
df.head(5)

Unnamed: 0,q_id,q_name,p_id,p_name,p_value,p_value_type,positive_negative,text
0,Q23,George Washington,P509,cause_of_death,epiglottitis,wikibase-item,positive,"George Washington, the first President of the ..."
1,Q23,George Washington,P20,place_of_death,Mount Vernon,wikibase-item,positive,"George Washington, the first President of the ..."
2,Q23,George Washington,P26,spouse,Martha Washington,wikibase-item,positive,"George Washington, the esteemed first Presiden..."
3,Q23,George Washington,P22,father,Augustine Washington,wikibase-item,positive,"George Washington, the first President of the ..."
4,Q23,George Washington,P25,mother,Mary Ball Washington,wikibase-item,positive,"George Washington, the first President of the ..."


In [3]:
# Divide the data into training, validation and test sets with respect to the
# column q_id: the unique entity IDs are shuffled and cut 80/10/10, and every
# row follows its entity, so no q_id appears in more than one split.
q_ids = df.q_id.unique()
# Fixed random_state keeps the shuffle (and therefore the splits) reproducible.
q_ids = pd.Series(q_ids).sample(frac=1, random_state=24)
n = len(q_ids)

# Compute the cut points once so the three slices cannot drift apart.
train_end = int(0.8 * n)
valid_end = int(0.9 * n)

# `sample` keeps the original integer index labels, so after shuffling labels
# and positions no longer agree. `.iloc` states explicitly that we cut by
# position instead of relying on the implicit behaviour of `[]` with integers.
train_q_ids = q_ids.iloc[:train_end]
valid_q_ids = q_ids.iloc[train_end:valid_end]
test_q_ids = q_ids.iloc[valid_end:]

train_df = df[df.q_id.isin(train_q_ids)]
valid_df = df[df.q_id.isin(valid_q_ids)]
test_df = df[df.q_id.isin(test_q_ids)]

# Sanity check: the three splits must partition the rows of `df`.
assert len(train_df) + len(valid_df) + len(test_df) == len(df), (
    "train/valid/test splits do not add up to the full dataset"
)

In [4]:
# Report the number of rows (not entities) that ended up in each split.
print(
    f"Train: {len(train_df)}",
    f"Valid: {len(valid_df)}",
    f"Test: {len(test_df)}",
    sep="\n",
)

Train: 102413
Valid: 12925
Test: 12479


In [5]:
# Persist each split next to the source file; index=False keeps the DataFrame
# index out of the written CSVs.
split_outputs = (
    (train_df, "../data/wikidata_triplet2text_alpha_generated_train.csv"),
    (valid_df, "../data/wikidata_triplet2text_alpha_generated_valid.csv"),
    (test_df, "../data/wikidata_triplet2text_alpha_generated_test.csv"),
)
for split_df, csv_path in split_outputs:
    split_df.to_csv(csv_path, index=False)