In [3]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import shutil
import os
from transformers import pipeline
import json
import gzip

In [4]:
# Define the input and output folder paths
input_folder = "./submission_json"
output_folder = "./submissions"

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Loop through all files in the input folder
for file_name in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, f"{file_name}.gz")

    # Check if the current item is a file (not a folder)
    if os.path.isfile(input_file_path):
        # Open the input file and compress it into the output folder
        with open(input_file_path, "rb") as f_in:
            with gzip.open(output_file_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        print(f"Compressed: {input_file_path} -> {output_file_path}")

Compressed: ./submission_json/my_submission_bart_detail.json -> ./submissions/my_submission_bart_detail.json.gz


In [5]:
import glob
import scipy.stats
import matplotlib.pyplot as plt
import wikipedia
from torchmetrics.text.rouge import ROUGEScore

def get_wiki_summary(url):
    
    if "wikipedia.org" not in url:
        return ""
    
    page_title = url.rpartition("/")[-1]
    print(url)
    page = wikipedia.page(title=page_title, auto_suggest=False)
    
    return page.summary
event_df = pd.read_json("CrisisFACTs-2022to2023.topics.json", lines=False).set_index("eventID")
event_df["wiki.summary"] = event_df["url"].apply(get_wiki_summary)

https://en.wikipedia.org/wiki/Lilac_Fire
https://en.wikipedia.org/wiki/Cranston_Fire
https://en.wikipedia.org/wiki/Holy_Fire_(2018)
https://en.wikipedia.org/wiki/Hurricane_Florence
https://en.wikipedia.org/wiki/2018_Maryland_flood
https://en.wikipedia.org/wiki/Saddleridge_Fire
https://en.wikipedia.org/wiki/Hurricane_Laura
https://en.wikipedia.org/wiki/Hurricane_Sally
https://en.wikipedia.org/wiki/2020_Beirut_explosion
https://en.wikipedia.org/wiki/2020_Houston_explosion
https://en.wikipedia.org/wiki/Edenville_Dam#Dam_failure
https://en.wikipedia.org/wiki/Hurricane_Dorian
https://en.wikipedia.org/wiki/Kincade_Fire
https://en.wikipedia.org/wiki/2020_Easter_tornado_outbreak
https://en.wikipedia.org/wiki/Tornado_outbreak_of_April_22-23,_2020
https://en.wikipedia.org/wiki/Tornado_outbreak_of_March_2-3,_2020


In [7]:
import bert_score

In [8]:
with open("final-annotated-facts-results.json", "r") as in_file:
    annotation_data = json.load(in_file)

In [13]:
submission_metrics = {}
TOP_K = 32
all_req_ids = list(annotation_data.keys())

for f in glob.glob("submissions/*.gz"):
    
    this_run_id = f.partition("/")[-1].replace(".gz", "")
    print(f, "-->", this_run_id)
    
    this_run_event_request_facts = {k:[] for k in all_req_ids}
    
    with gzip.open(f, "r") as in_file:
        for line_ in in_file:
            line = line_.decode("utf8")
            
            entry = json.loads(line)
            this_req_id = entry["requestID"]
            
            # We skip days where we have no relevant facts from assessors
            if this_req_id not in all_req_ids:
                continue
            
            this_run_event_request_facts[this_req_id].append(entry)
            
    local_event_summaries = {e_id:[] for e_id in event_df.index}
    for event_request,this_fact_list in this_run_event_request_facts.items():
        event_id = event_request.rpartition("-")[0]
        
        sorted_fact_list = sorted(this_fact_list, key=lambda v: v["importance"], reverse=True)
        this_day_summary = [this_top_fact["factText"] for this_top_fact in sorted_fact_list[:TOP_K]]

        
        local_event_summaries[event_id] = local_event_summaries[event_id] + this_day_summary
        
    wiki_dfs = []

    for event_id,event in event_df.iterrows():
        
        this_submitted_summary = local_event_summaries[event_id]

        try:
            this_summary_text = ". ".join(this_submitted_summary).replace("..", ".")
        except:
            pass
        print(event_id, len(this_summary_text))
        
        wiki_summary = event["wiki.summary"]
        wiki_metric_ = bert_score.score([this_summary_text], [wiki_summary], model_type="roberta-large-mnli")

        wiki_metric = {
            "precision": wiki_metric_[0],
            "recall": wiki_metric_[1],
            "f1": wiki_metric_[2],
        }

        this_wiki_df = pd.DataFrame([{"metric":k, "value":v.item(), "event": event_id} for k,v in wiki_metric.items()])

        wiki_dfs.append(this_wiki_df)

        
    full_wiki_df = pd.concat(wiki_dfs)

    submission_metrics[this_run_id] = {
        "wiki": full_wiki_df
    }
    
    display(full_wiki_df.groupby("metric")['value'].mean())

submissions/my_submission_bart_detail.json.gz --> my_submission_bart_detail.json
CrisisFACTS-001 42618
CrisisFACTS-002 42618
CrisisFACTS-003 42618
CrisisFACTS-004 42618
CrisisFACTS-005 42618
CrisisFACTS-006 42618
CrisisFACTS-007 42618
CrisisFACTS-008 42618
CrisisFACTS-009 42618
CrisisFACTS-010 42618
CrisisFACTS-011 42618




CrisisFACTS-012 42618




CrisisFACTS-013 42618
CrisisFACTS-014 42618
CrisisFACTS-015 42618
CrisisFACTS-016 42618
CrisisFACTS-017 42618
CrisisFACTS-018 42618


metric
f1           0.628951
precision    0.623133
recall       0.635148
Name: value, dtype: float64

In [15]:
submission_metrics

{'my_submission_bart_detail.json': {'wiki':       metric     value            event
  0  precision  0.720050  CrisisFACTS-001
  1     recall  0.745512  CrisisFACTS-001
  2         f1  0.732560  CrisisFACTS-001
  0  precision  0.705777  CrisisFACTS-002
  1     recall  0.743536  CrisisFACTS-002
  2         f1  0.724165  CrisisFACTS-002
  0  precision  0.704892  CrisisFACTS-003
  1     recall  0.738438  CrisisFACTS-003
  2         f1  0.721275  CrisisFACTS-003
  0  precision  0.702079  CrisisFACTS-004
  1     recall  0.679659  CrisisFACTS-004
  2         f1  0.690687  CrisisFACTS-004
  0  precision  0.696601  CrisisFACTS-005
  1     recall  0.724576  CrisisFACTS-005
  2         f1  0.710313  CrisisFACTS-005
  0  precision  0.713770  CrisisFACTS-006
  1     recall  0.741214  CrisisFACTS-006
  2         f1  0.727233  CrisisFACTS-006
  0  precision  0.703186  CrisisFACTS-007
  1     recall  0.698702  CrisisFACTS-007
  2         f1  0.700937  CrisisFACTS-007
  0  precision  0.702774  CrisisFA