In [1]:
import pickle
import random

from tqdm.auto import tqdm

from valerie.utils import get_logger
from valerie.preprocessing import extract_words_from_url, clean_text
from valerie.datasets import Phase1Dataset, Phase2Dataset, LeadersDataset
from valerie.modeling import SequenceClassificationExample

In [2]:
# Notebook-wide logger obtained from the project's logging helper (valerie.utils).
logger = get_logger()

In [3]:
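# One-time setup: uncomment the two commands below to copy the preprocessed
# article dictionaries from the project bucket to the local paths that the
# pickle-loading cell reads.
# !gsutil cp gs://valerie-bucket/data/phase1/processed/articles_dict.pkl data/phase1/processed/articles_dict.pkl
# !gsutil cp gs://valerie-bucket/data/phase2-3/processed/articles_dict.pkl data/phase2-3/processed/articles_dict.pkl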

In [4]:
# Build the three project datasets via their `from_raw` constructors.
# Per the log output captured below, each call converts rows to claims and
# reports the resulting claim counts; the LeadersDataset log shows it combining
# Phase2Dataset and Phase1Dataset claims.
# NOTE(review): where `from_raw` reads its input from is not visible here — confirm.
phase1_dataset = Phase1Dataset.from_raw()
phase2_dataset = Phase2Dataset.from_raw()
leaders_dataset = LeadersDataset.from_raw()

HBox(children=(FloatProgress(value=0.0, description='Phase1Dataset to claims', max=15555.0, style=ProgressStyl…


[2020-07-11 20:08:16,680] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 20:08:16,694] INFO:valerie.datasets: Phase1Dataset claims set change 15555 --> 15555


HBox(children=(FloatProgress(value=0.0, description='Phase2Dataset to claims', max=13102.0, style=ProgressStyl…


[2020-07-11 20:08:20,700] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 20:08:20,713] INFO:valerie.datasets: Phase2Dataset claims set change 13102 --> 13102


HBox(children=(FloatProgress(value=0.0, description='Phase2Dataset to claims', max=13102.0, style=ProgressStyl…


[2020-07-11 20:08:24,882] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 20:08:24,907] INFO:valerie.datasets: Phase2Dataset claims set change 13102 --> 13102


HBox(children=(FloatProgress(value=0.0, description='Phase1Dataset to claims', max=15555.0, style=ProgressStyl…


[2020-07-11 20:08:29,903] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 20:08:29,918] INFO:valerie.datasets: Phase1Dataset claims set change 15555 --> 15555
[2020-07-11 20:08:29,923] INFO:valerie.data: ... combining claims ...
[2020-07-11 20:08:29,934] INFO:valerie.data: Phase2Dataset: 0 --> 13102 (+ 13102 = 13102 - 0)
[2020-07-11 20:08:29,947] INFO:valerie.data: Phase1Dataset: 13102 --> 20073 (+ 6971 = 15555 - 8584)
[2020-07-11 20:08:29,955] INFO:valerie.datasets: LeadersDataset claims set change 20073 --> 20073


In [10]:
# Index the leaders dataset's claims by their `index` attribute so that a claim
# can be looked up by id; the trailing expression displays how many were indexed.
claims_dict = {
    leader_claim.index: leader_claim
    for leader_claim in leaders_dataset.claims
}
len(claims_dict)

500

In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
    file. Only use this on files from a trusted source (here: the project's
    own preprocessed article dictionaries).
    """
    with open(path, "rb") as fi:
        return pickle.load(fi)


# Preprocessed article dictionaries for each phase.
articles1_dict = _load_pickle("data/phase1/processed/articles_dict.pkl")
articles2_dict = _load_pickle("data/phase2-3/processed/articles_dict.pkl")

# Merge both phases into one lookup keyed by `article.index`. Phase 2-3 is
# iterated second, so on an index collision its article wins (same precedence
# as before). Iterating the dicts directly avoids materialising and
# concatenating two large value lists.
articles_dict = {
    article.index: article
    for phase_articles in (articles1_dict, articles2_dict)
    for article in phase_articles.values()
}

In [7]:
# Display the number of distinct article indices in the merged lookup.
len(articles_dict)

118433

In [8]:
# Set form of the article indices, for constant-time membership checks.
all_articles_index_set = set(articles_dict)
# List form, because `random.choice` needs an indexable sequence. Dict keys are
# already unique, so the list is built straight from the dict: this skips a
# second throwaway set and keeps the dict's insertion order, whereas set
# iteration order is arbitrary (and, if the indices are strings, changes from
# run to run under hash randomisation), which would defeat any random seed.
all_articles_index_list = list(articles_dict)

In [19]:
def create_text_a(claim):
    """Build the first text segment of a sequence pair from a claim.

    The result is ``"<claim text> <claimant> <date>"`` passed through
    ``clean_text``. A falsy claimant is replaced by ``"no claimant"`` and a
    falsy date by ``"no date"``. For a present date only the leading date
    portion is kept: the first whitespace-separated token, truncated at the
    first ``"T"``.
    """
    claim_text = claim.claim
    claimant_part = claim.claimant or "no claimant"
    if claim.date:
        date_part = claim.date.split()[0].split("T")[0]
    else:
        date_part = "no date"
    return clean_text(claim_text + " " + claimant_part + " " + date_part)

def create_text_b(article):
    """Build the second text segment of a sequence pair from an article.

    Concatenates, in order and skipping any falsy field: the source, the
    title, the words extracted from the URL (space-joined), each followed by
    ``". "``, and finally the article content. The concatenation is passed
    through ``clean_text`` before being returned.
    """
    segments = []
    if article.source:
        segments.append(article.source + ". ")
    if article.title:
        segments.append(article.title + ". ")
    if article.url:
        words_in_url = extract_words_from_url(article.url)
        # The URL may yield no usable words; add nothing in that case.
        if words_in_url:
            segments.append(" ".join(words_in_url) + ". ")
    if article.content:
        segments.append(article.content)
    return clean_text("".join(segments))
    

# Number of examples built per claim: every related article with non-empty
# text becomes a positive, and randomly sampled unrelated articles pad the
# claim up to this total (no padding if there are already this many positives).
EXAMPLES_PER_CLAIM = 30

examples = []
for claim in tqdm(claims_dict.values()):
    examples_to_add = []

    text_a = create_text_a(claim)
    related_articles_index_set = set(claim.related_articles.keys())

    # Positive examples: one per related article that yields non-empty text.
    for article_index in claim.related_articles.keys():
        article = articles_dict[article_index]
        text_b = create_text_b(article)

        if not text_b:
            continue

        examples_to_add.append(SequenceClassificationExample(
            guid=claim.index,
            text_a=text_a,
            text_b=text_b,
            label=1, # related article
            art_id=article.index
        ))

    # Negative examples: rejection-sample from the entire article corpus.
    for _ in range(EXAMPLES_PER_CLAIM - len(examples_to_add)):
        # Redraw until the article is NOT one of the claim's related articles
        # AND has non-empty text. The previous condition
        # (`not in related and not text_b`) was inverted: it accepted related
        # articles as negatives and let empty-text articles through.
        # NOTE: this loops forever if no article in the corpus qualifies.
        while True:
            article_idx = random.choice(all_articles_index_list)
            if article_idx in related_articles_index_set:
                continue
            article = articles_dict[article_idx]
            text_b = create_text_b(article)
            if text_b:
                break

        examples_to_add.append(SequenceClassificationExample(
            guid=claim.index,
            text_a=text_a,
            text_b=text_b,
            label=0, # unrelated article
            art_id=article.index
        ))
    examples.extend(examples_to_add)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [20]:
# Spot-check one generated example (index 101 is an arbitrary pick).
examples[101]

{
  "guid": "Phase2Dataset/11153",
  "text_a": "\u201cThe head of the Russian Lower House Committee for Eurasian Integration says the recent proposal to split Russia into several parts, voiced by a Latvian MP, is proof of NATO\u2019s hostile plans.\u201dRT 2018-07-30",
  "text_b": "tampabay.Crisafulli: Why the Florida House opposes Medicaid expansion.tampa bay opinion columns cris full why the florida house opposes medicaid expansion.Crisafulli: Why the Florida House opposes Medicaid expansion In 2012, the U.S. Supreme Court gave states the option to expand Medicaid under Obamacare and Florida has chosen not to expand. There are principled reasons for declining to grow a program that currently covers 3.7 million Floridians at a cost of $23.5 billion per year, or about one-third of Florida's budget. We oppose expanding Medicaid because it is a broken system with poor health outcomes, high inflation, unseverable federal strings, and no incentive for personal responsibility for those who 