In [7]:
import json
import os

import pandas as pd

from valerie.data import Claim
from valerie.datasets import ValerieDataset
from valerie.utils import get_logger
# Logger obtained from the project's get_logger helper (valerie.utils).
logger = get_logger()

In [8]:
# Relative paths to the raw phase-2 trial inputs read by the loading cell.
unlabelled_metadata_file = "data/phase2-trial/raw/2_trial_metadata.json"  # claims, no labels
labels_file = "data/phase2-trial/raw/2_trial_labels.json"  # labels looked up by claim id

In [9]:
# Read the two raw JSON inputs. JSON text is UTF-8, so the encoding is passed
# explicitly instead of relying on the platform's locale default.
with open(unlabelled_metadata_file, encoding="utf-8") as fi:
    trial_metadata_unlabelled = json.load(fi)
with open(labels_file, encoding="utf-8") as fi:
    trial_labels = json.load(fi)

# Merge every claim with its label record. The labels mapping is indexed by
# the claim id converted to a string; an id without a record raises KeyError.
trial_metadata = [
    {
        **claim,
        "label": trial_labels[str(claim["id"])]["label"],
        "related_articles": trial_labels[str(claim["id"])]["related_articles"],
    }
    for claim in trial_metadata_unlabelled
]

# One row per claim, with the added "label" and "related_articles" columns.
df = pd.DataFrame(trial_metadata)

In [10]:
# Display the merged metadata + labels frame built from the two raw JSON files.
df

Unnamed: 0,claim,claimant,date,id,label,related_articles
0,"A transaction tax in countries like the UK ""di...",Kirsten Gillibrand,2019-07-09 00:00:00,0,1,{'train_articles/14519.html': 'https://www.cbo...
1,"""Food Stamp participation hits 10 year low.""",Donald Trump,2019-07-09 00:00:00,1,2,{'train_articles/14750.html': 'https://www.oan...
2,Leaving cut onions overnight makes them poisonous,Facebook user,2019-07-09 00:00:00+00:00,5,0,{'train_articles/1368.html': 'https://www.onio...
3,Iceland has a shortage of men and will pay imm...,Facebook user,2019-07-10 00:00:00+00:00,10,0,{'train_articles/2960.html': 'https://www.geor...
4,Nigeria has the highest suicide rate in Africa,Vanguard,2019-07-10 11:17:56+00:00,13,0,{'train_articles/383.html': 'http://meaningful...
...,...,...,...,...,...,...
95,U.S. Rep. Steve King falsely identified the Uk...,Multiple Sources,2019-11-14 15:57:00+00:00,461,2,{'train_articles/3125.html': 'https://www.whis...
96,Netflix is contacting users and telling them t...,,2019-11-15 11:34:00+00:00,464,0,{'train_articles/3114.html': 'https://www.cons...
97,People should avoid plugging their phones into...,,2019-11-18 15:20:00+00:00,466,1,{'train_articles/3108.html': 'http://da.lacoun...
98,Electric car manufacturers rely on child labor...,Multiple Sources,2019-11-27 12:02:00+00:00,487,1,{'train_articles/3064.html': 'https://www.dail...


In [11]:
class Phase2TrialDataset(ValerieDataset):
    """Dataset of phase-2 trial claims built from two raw JSON files.

    The metadata file holds the unlabelled claims; the labels file holds, per
    claim id (as a string key), the claim's ``label`` and ``related_articles``.
    """

    @classmethod
    def from_raw(
        cls,
        unlabelled_metadata_file="data/phase2-trial/raw/2_trial_metadata.json",
        labels_file="data/phase2-trial/raw/2_trial_labels.json",
    ):
        """Load the raw trial files and return a dataset of claims.

        Args:
            unlabelled_metadata_file: Path to a JSON file containing an
                iterable of claim dicts, each with at least an ``"id"`` key.
            labels_file: Path to a JSON file mapping ``str(claim id)`` to a
                dict with ``"label"`` and ``"related_articles"`` entries.

        Returns:
            An instance of ``cls`` constructed from the converted claims.

        Raises:
            KeyError: If a claim's id has no entry in the labels file, or the
                entry lacks ``"label"`` / ``"related_articles"``.
        """
        # JSON text is UTF-8; pass the encoding explicitly so decoding does
        # not depend on the platform's locale default.
        with open(unlabelled_metadata_file, encoding="utf-8") as fi:
            trial_metadata_unlabelled = json.load(fi)
        with open(labels_file, encoding="utf-8") as fi:
            trial_labels = json.load(fi)

        trial_metadata = []
        for claim in trial_metadata_unlabelled:
            # Label records are keyed by the stringified claim id; look the
            # record up once and reuse it for both fields.
            claim_labels = trial_labels[str(claim["id"])]
            trial_metadata.append(
                {
                    **claim,
                    "label": claim_labels["label"],
                    "related_articles": claim_labels["related_articles"],
                }
            )

        df = pd.DataFrame(trial_metadata)
        # df_to_claims is not defined in this class; presumably inherited from
        # ValerieDataset -- it is given the frame and the per-row converter.
        claims = cls.df_to_claims(df, cls.row_to_claim)

        return cls(claims)

    @classmethod
    def row_to_claim(cls, i, row):
        """Convert one row of claim data into a ``Claim``.

        Args:
            i: Row index supplied by the caller; not used by this converter.
            row: Mapping-like row (anything ``dict()`` accepts) that contains
                an ``"id"`` entry and, optionally, ``"related_articles"``.

        Returns:
            A ``Claim`` built from the row's id, the re-keyed related
            articles, this class's name as ``dataset_name``, and every
            remaining row field passed through as keyword arguments.
        """
        # THIS SHOULD BE KEPT UP TO DATE WITH THE row_to_claim FUNCTION
        # OF Phase2Dataset. For now there are two copies of this function;
        # in the future a smarter way to share it should be found.

        row = dict(row)
        _id = row.pop("id")

        # Only parse related articles if the field exists (related_articles
        # is a removed field for the eval). Each key is rewritten to
        # "<class name>/<file name>", dropping the original directory part.
        related_articles = {
            cls.__name__ + "/" + os.path.basename(path): url
            for path, url in row.pop("related_articles", {}).items()
        }

        return Claim(
            _id, related_articles=related_articles, dataset_name=cls.__name__, **row
        )

In [12]:
# Build the dataset using the default raw file paths from from_raw's signature.
trial_dataset = Phase2TrialDataset.from_raw()

Phase2TrialDataset to claims: 100%|██████████| 100/100 [00:00<00:00, 2494.59it/s][2020-07-08 14:47:25,867] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-08 14:47:25,867] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-08 14:47:25,867] INFO:valerie.datasets: Phase2TrialDataset claims set change 100 --> 100
[2020-07-08 14:47:25,867] INFO:valerie.datasets: Phase2TrialDataset claims set change 100 --> 100



In [18]:
# Inspect the first claim held by the dataset to check the converted fields.
trial_dataset.claims[0]

{
  "id": 146,
  "claim": "Says 12 restaurants and food companies are \"supporting Trump\u2019s re-election.\"",
  "claimant": "Instagram posts",
  "label": 0,
  "date": "2019-08-08 00:00:00",
  "related_articles": {
    "Phase2TrialDataset/13930.html": "https://www.eater.com/2016/10/18/13279282/clinton-vs-trump-whos-collected-the-most-fast-food-funding",
    "Phase2TrialDataset/13931.html": "https://www.foodandwine.com/fwx/food/trump-taco-bell-campaign-donation",
    "Phase2TrialDataset/13932.html": "https://www.opensecrets.org/industries/indus.php?ind=n01",
    "Phase2TrialDataset/13934.html": "https://www.newsweek.com/viral-tweet-shows-list-fast-food-chains-that-are-reportedly-supporting-trumps-re-election-1453541",
    "Phase2TrialDataset/13935.html": "https://www.salon.com/2015/06/22/5_radically_conservative_fast_food_companies/"
  },
  "explanation": null,
  "support": null,
  "dataset_name": "Phase2TrialDataset",
  "index": "Phase2TrialDataset/146"
}