In [8]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import shutil
import os
from transformers import pipeline
import json
import gzip
import bert_score
import glob
import wikipedia

In [9]:
def get_wiki_summary(url):
    """Return the summary text of the Wikipedia article that ``url`` points to.

    Parameters
    ----------
    url : str
        Event URL. Only URLs containing ``wikipedia.org`` are looked up.

    Returns
    -------
    str
        ``page.summary`` of the article fetched through ``wikipedia.page``,
        or ``""`` when ``url`` is not a string (e.g. a missing value), does
        not contain ``wikipedia.org``, or carries no article title.

    Notes
    -----
    Errors raised by ``wikipedia.page`` are not caught here and propagate to
    the caller. Each looked-up URL is printed as a progress trace.
    """
    # Function-scope import: the notebook's import cell does not provide urllib.
    from urllib.parse import unquote, urlparse

    # Missing values arrive as float NaN from pandas; treat them like
    # non-Wikipedia URLs instead of failing on the substring check.
    if not isinstance(url, str) or "wikipedia.org" not in url:
        return ""

    # urlparse separates "#fragment" and "?query" from the path, so a link
    # such as ".../wiki/Edenville_Dam#Dam_failure" resolves to "Edenville_Dam".
    path = urlparse(url).path
    if "/wiki/" in path:
        # Keep everything after /wiki/ so titles containing "/" stay intact.
        raw_title = path.partition("/wiki/")[2]
    else:
        raw_title = path.rpartition("/")[-1]

    # Decode percent-escapes (e.g. %28 -> "(") and drop stray slashes.
    page_title = unquote(raw_title).strip("/")
    if not page_title:
        return ""

    print(url)
    # auto_suggest=False: look up exactly this title rather than a suggestion.
    page = wikipedia.page(title=page_title, auto_suggest=False)

    return page.summary
    
# Build the event table in one chain: read the topics file, key it by
# eventID, and add a "wiki.summary" column produced by get_wiki_summary
# for each event's URL.
event_df = (
    pd.read_json("CrisisFACTs-2022to2023.topics.json", lines=False)
    .set_index("eventID")
    .assign(**{"wiki.summary": lambda topics: topics["url"].apply(get_wiki_summary)})
)

https://en.wikipedia.org/wiki/Lilac_Fire
https://en.wikipedia.org/wiki/Cranston_Fire
https://en.wikipedia.org/wiki/Holy_Fire_(2018)
https://en.wikipedia.org/wiki/Hurricane_Florence
https://en.wikipedia.org/wiki/2018_Maryland_flood
https://en.wikipedia.org/wiki/Saddleridge_Fire
https://en.wikipedia.org/wiki/Hurricane_Laura
https://en.wikipedia.org/wiki/Hurricane_Sally
https://en.wikipedia.org/wiki/2020_Beirut_explosion
https://en.wikipedia.org/wiki/2020_Houston_explosion
https://en.wikipedia.org/wiki/Edenville_Dam#Dam_failure
https://en.wikipedia.org/wiki/Hurricane_Dorian
https://en.wikipedia.org/wiki/Kincade_Fire
https://en.wikipedia.org/wiki/2020_Easter_tornado_outbreak
https://en.wikipedia.org/wiki/Tornado_outbreak_of_April_22-23,_2020
https://en.wikipedia.org/wiki/Tornado_outbreak_of_March_2-3,_2020


In [10]:
# Load the annotated-facts results into `annotation_data`.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open("final-annotated-facts-results.json", "r", encoding="utf-8") as in_file:
    annotation_data = json.load(in_file)

In [16]:
# Score every submitted run against the Wikipedia summary of a single event.
#
# NOTE(review): `all_req_ids` and `TOP_K` are not defined in any cell visible
# here; they must already exist in the kernel. Confirm an earlier cell creates
# them so that Restart & Run All works.
TARGET_EVENT_ID = "CrisisFACTS-001"

# Sorted so runs are processed and printed in the same order on every machine
# (glob.glob gives no ordering guarantee).
for f in sorted(glob.glob("submissions/*.gz")):
    # Derive the run id from the file name only: basename handles either path
    # separator, and only the trailing ".gz" is removed.
    run_file_name = os.path.basename(f)
    this_run_id = run_file_name[:-len(".gz")] if run_file_name.endswith(".gz") else run_file_name
    print(f, "-->", this_run_id)

    # Reset the summary for the current file
    local_event_summaries = {TARGET_EVENT_ID: []}

    # One bucket per known request id; iteration order follows all_req_ids.
    this_run_event_request_facts = {k: [] for k in all_req_ids}

    # Read the gzip file as UTF-8 text, one JSON object per line.
    with gzip.open(f, "rt", encoding="utf8") as in_file:
        for line in in_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            entry = json.loads(line)
            this_req_id = entry["requestID"]

            # Skip requests with no relevant facts
            if this_req_id not in all_req_ids:
                continue

            this_run_event_request_facts[this_req_id].append(entry)

    # Keep only requests of the target event; the event id is the part of the
    # request id before its last "-".
    for event_request, this_fact_list in this_run_event_request_facts.items():
        event_id = event_request.rpartition("-")[0]

        if event_id != TARGET_EVENT_ID:
            continue

        # Highest-importance facts first, truncated to the top TOP_K per request.
        sorted_fact_list = sorted(this_fact_list, key=lambda v: v["importance"], reverse=True)
        this_day_summary = [this_top_fact["factText"] for this_top_fact in sorted_fact_list[:TOP_K]]

        local_event_summaries[TARGET_EVENT_ID] += this_day_summary

    # Compute metrics only for the target event
    event_id = TARGET_EVENT_ID
    this_submitted_summary = local_event_summaries[event_id]

    try:
        # Joining with ". " doubles the period after facts that already end
        # with one; collapse ".." back to ".".
        this_summary_text = ". ".join(this_submitted_summary).replace("..", ".")
    except TypeError as e:
        # str.join raises TypeError when a factText is not a string.
        print(f"Error processing summary for {event_id}: {e}")
        continue

    print(event_id, len(this_summary_text))

    # Retrieve the corresponding wiki summary
    wiki_summary = event_df.loc[event_id, "wiki.summary"]
    wiki_metric_ = bert_score.score([this_summary_text], [wiki_summary], model_type="roberta-large-mnli")

    # The score result is indexed positionally as precision, recall, f1.
    wiki_metric = dict(zip(("precision", "recall", "f1"), wiki_metric_[:3]))

    # One row per metric; .item() converts each single-value result to a Python number.
    this_wiki_df = pd.DataFrame([{"metric": k, "value": v.item(), "event": event_id} for k, v in wiki_metric.items()])
    print(this_wiki_df)


submissions/my_submission_pega_detail.json.gz --> my_submission_pega_detail.json
CrisisFACTS-001 48424
      metric     value            event
0  precision  0.717838  CrisisFACTS-001
1     recall  0.743328  CrisisFACTS-001
2         f1  0.730361  CrisisFACTS-001
submissions/my_submission_bart_detail.json.gz --> my_submission_bart_detail.json
CrisisFACTS-001 42618
      metric     value            event
0  precision  0.720050  CrisisFACTS-001
1     recall  0.745512  CrisisFACTS-001
2         f1  0.732560  CrisisFACTS-001
