# Preprocess Phase1
---

In [None]:
import os

from valerie.data import claims_from_phase1, articles_from_phase1, save_claims, save_articles

In [None]:
def preprocess_phase1(data_dir, nproc=1):
    """Convert the raw Phase1 data in ``data_dir`` into claims/articles JSON.

    Loads ``<data_dir>/metadata.json`` through ``claims_from_phase1`` and the
    ``<data_dir>/articles`` directory through ``articles_from_phase1``, prints
    the size and one sample entry of each collection, then hands them to
    ``save_claims`` / ``save_articles`` with the targets
    ``<data_dir>/claims.json`` and ``<data_dir>/articles.json``.

    Args:
        data_dir: Directory containing ``metadata.json`` and ``articles``;
            the output paths are built inside the same directory.
        nproc: Forwarded unchanged to ``articles_from_phase1`` (presumably
            the number of worker processes -- confirm in ``valerie.data``).
    """
    claims = claims_from_phase1(os.path.join(data_dir, "metadata.json"))

    print("num claims:", len(claims))
    # Peek at a single entry without copying every value into a list; the
    # ``None`` default keeps an empty collection from raising before saving.
    print(next(iter(claims.values()), None))
    print()

    articles = articles_from_phase1(os.path.join(data_dir, "articles"), nproc=nproc)

    print("num articles:", len(articles))
    print(next(iter(articles.values()), None))
    print()

    save_claims(claims, os.path.join(data_dir, "claims.json"), indent=2)
    save_articles(articles, os.path.join(data_dir, "articles.json"), indent=2)

In [None]:
# Run the Phase1 preprocessing on data/phase1/all-data with nproc=6.
preprocess_phase1(data_dir="data/phase1/all-data", nproc=6)

# Preprocess Phase2
---

In [None]:
import os

from valerie.data import claims_from_phase2, articles_from_phase2, save_claims, save_articles

In [None]:
def preprocess_phase2(data_dir, nproc=1):
    """Convert the raw Phase2 data in ``data_dir`` into claims/articles JSON.

    Loads ``<data_dir>/metadata.json`` through ``claims_from_phase2`` and the
    ``<data_dir>/articles`` directory through ``articles_from_phase2`` (which
    also receives the loaded claims), prints the size and one sample entry of
    each collection, then hands them to ``save_claims`` / ``save_articles``
    with the targets ``<data_dir>/claims.json`` and
    ``<data_dir>/articles.json``.

    Args:
        data_dir: Directory containing ``metadata.json`` and ``articles``;
            the output paths are built inside the same directory.
        nproc: Forwarded unchanged to ``articles_from_phase2`` (presumably
            the number of worker processes -- confirm in ``valerie.data``).
    """
    claims = claims_from_phase2(os.path.join(data_dir, "metadata.json"))

    print("num claims:", len(claims))
    # Peek at a single entry without copying every value into a list; the
    # ``None`` default keeps an empty collection from raising before saving.
    print(next(iter(claims.values()), None))
    print()

    articles = articles_from_phase2(os.path.join(data_dir, "articles"), claims, nproc=nproc)

    print("num articles:", len(articles))
    print(next(iter(articles.values()), None))
    print()

    save_claims(claims, os.path.join(data_dir, "claims.json"), indent=2)
    save_articles(articles, os.path.join(data_dir, "articles.json"), indent=2)

In [None]:
# Run the Phase2 preprocessing on data/phase2/4-data with the default nproc.
# NOTE(review): "data/phase2/4-data" is also the output_dir of the
# trim_metadata_phase2 cell further down this notebook -- presumably that
# cell has to be executed first; confirm.
preprocess_phase2(data_dir="data/phase2/4-data")

# Trim Phase1
---

In [None]:
from valerie.utils import get_logger
from valerie.data import trim_metadata_phase1

In [None]:
# Obtain a logger from valerie.utils with its default arguments.
# NOTE(review): `_logger` is not referenced in the visible cells; presumably
# the call is made for its logging-setup side effect -- confirm.
_logger = get_logger()

In [None]:
# Call trim_metadata_phase1 on the Phase1 all-data metadata and articles,
# with n_examples=4 and data/phase1/4-data as the output directory.
trim_metadata_phase1(
    claims_file="data/phase1/all-data/metadata.json",
    articles_dir="data/phase1/all-data/articles",
    output_dir="data/phase1/4-data",
    n_examples=4,
)

# Trim Phase2
---

In [None]:
from valerie.utils import get_logger
from valerie.data import trim_metadata_phase2

In [None]:
# Obtain a logger from valerie.utils with its default arguments.
# NOTE(review): `_logger` is not referenced in the visible cells; presumably
# the call is made for its logging-setup side effect -- confirm.
_logger = get_logger()

In [None]:
# Call trim_metadata_phase2 on the Phase2 all-data metadata and articles,
# with n_examples=4 and data/phase2/4-data as the output directory.
trim_metadata_phase2(
    claims_file="data/phase2/all-data/metadata.json",
    articles_dir="data/phase2/all-data/articles",
    output_dir="data/phase2/4-data",
    n_examples=4,
)

# Train Test Split Phase2
---

In [None]:
from valerie.utils import get_logger
from valerie.data import train_test_split_phase2

In [None]:
# Obtain a logger from valerie.utils, passing "data/phase2/split.log"
# (presumably the file the split is logged to -- confirm in valerie.utils).
_logger = get_logger("data/phase2/split.log")

In [None]:
# Call train_test_split_phase2 on the Phase2 all-data metadata and articles,
# with train_size=0.95 and a fixed random_state, targeting the train-data
# and test-data directories.
train_test_split_phase2(
    claims_file="data/phase2/all-data/metadata.json",
    articles_dir="data/phase2/all-data/articles",
    train_dir="data/phase2/train-data",
    test_dir="data/phase2/test-data",
    train_size=0.95,
    random_state=42,
)

# Modify (Null, Default, Missing)
---

In [None]:
import os
import copy
import json

In [None]:
# Directory whose metadata.json is read below and into which the
# metadata-null / metadata-default / metadata-missing variants are written.
data_dir = "data/phase2/test-data"

In [None]:
# Load the metadata that the following cells derive their variants from.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open(os.path.join(data_dir, "metadata.json"), encoding="utf-8") as fi:
    metadata = json.load(fi)

In [None]:
# null
# Variant in which every claim has "label" and "related_articles" set to
# None (serialised as JSON null); the loaded `metadata` itself is untouched.
_metadata = copy.deepcopy(metadata)

for claim in _metadata:
    claim.update(label=None, related_articles=None)

# Show the first claim as a quick sanity check.
print(json.dumps(_metadata[0], indent=2))

with open(os.path.join(data_dir, "metadata-null.json"), "w") as fo:
    json.dump(_metadata, fo, indent=2)

In [None]:
# default
# Variant in which every claim has "label" set to 0 and "related_articles"
# set to an empty list; the loaded `metadata` itself is untouched.
_metadata = copy.deepcopy(metadata)

for claim in _metadata:
    # A fresh list per claim, so the claims never share one list object.
    claim.update(label=0, related_articles=[])

# Show the first claim as a quick sanity check.
print(json.dumps(_metadata[0], indent=2))

with open(os.path.join(data_dir, "metadata-default.json"), "w") as fo:
    json.dump(_metadata, fo, indent=2)

In [None]:
# missing
# Variant in which every claim lacks the "label" and "related_articles"
# keys altogether; the loaded `metadata` itself is untouched.
_metadata = copy.deepcopy(metadata)

for claim in _metadata:
    # pop() with a default instead of `del`: a claim that already lacks one
    # of the keys is already in the desired state and must not abort the
    # cell with a KeyError.
    claim.pop("label", None)
    claim.pop("related_articles", None)

# Show the first claim as a quick sanity check.
print(json.dumps(_metadata[0], indent=2))

with open(os.path.join(data_dir, "metadata-missing.json"), 'w') as fo:
    json.dump(_metadata, fo, indent=2)