In [1]:
import pyterrier as pt
import os
import pandas as pd
import psutil
import time
import pandas as pd
import requests

In [2]:
# CrisisFACTS events keyed by their zero-padded three-digit ID.
# Dict insertion order is preserved, so rows come out in ID order.
_EVENT_NAMES_BY_ID = {
    "001": "Lilac Wildfire 2017",
    "002": "Cranston Wildfire 2018",
    "003": "Holy Wildfire 2018",
    "004": "Hurricane Florence 2018",
    "005": "2018 Maryland Flood",
    "006": "Saddleridge Wildfire 2019",
    "007": "Hurricane Laura 2020",
    "008": "Hurricane Sally 2020",
    "009": "Beirut Explosion 2020",
    "010": "Houston Explosion 2020",
    "011": "Rutherford TN Floods 2020",
    "012": "TN Derecho 2020",
    "013": "Edenville Dam Fail 2020",
    "014": "Hurricane Dorian 2019",
    "015": "Kincade Wildfire 2019",
    "016": "Easter Tornado Outbreak 2020",
    "017": "Tornado Outbreak April 2020",
    "018": "Tornado Outbreak March 2020",
}

# Same (ID, EventName) tuples as before, kept under the name `data`.
data = list(_EVENT_NAMES_BY_ID.items())

# Lookup table used to attach a human-readable event name to each ID.
event_df = pd.DataFrame(data, columns=["ID", "EventName"])

In [3]:
def get_eventsMeta(eventNoList='001', days=100, timeout=30):
    """Download the per-day request list for one or more CrisisFACTS events.

    Parameters
    ----------
    eventNoList : str or iterable of str
        Event numbers such as ``'001'``. A string is split on commas.
        Whitespace around each entry is stripped and empty entries
        (e.g. from a trailing comma) are skipped.
    days : int
        Maximum number of leading entries of each event's list to keep.
    timeout : float
        Seconds to wait on each HTTP request before giving up.

    Returns
    -------
    dict
        Maps each event number to the first ``days`` items of the JSON
        list downloaded for that event, in the order the events were given.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status for any event.
    """
    def getDaysForEventNo(eventNo):
        # Each event has one JSON file holding its list of days.
        url = "http://trecis.org/CrisisFACTs/CrisisFACTS-"+eventNo+".requests.json"

        # A timeout keeps a stalled server from hanging the notebook cell.
        response = requests.get(url, timeout=timeout)
        # Fail with the HTTP status rather than an opaque JSON decode error.
        response.raise_for_status()

        return response.json()

    if isinstance(eventNoList, str):
        rawEventNos = eventNoList.split(',')
    else:
        rawEventNos = list(eventNoList)

    # Normalise entries so '001, 002' and '001,002,' behave like '001,002'.
    eventNos = [str(eventNo).strip() for eventNo in rawEventNos]
    eventNos = [eventNo for eventNo in eventNos if eventNo]

    eventsMeta = {}

    for eventNo in eventNos:
        dailyInfo = getDaysForEventNo(eventNo)
        eventsMeta[eventNo] = dailyInfo[:days]

    return eventsMeta

In [4]:
# Comma-separated, zero-padded IDs of all eighteen events: '001' through '018'.
event_list = ','.join(f'{number:03d}' for number in range(1, 19))

# Fetch the day list for every event (the `days` cap is left at its default).
eventsMeta = get_eventsMeta(eventNoList=event_list)

In [5]:
# Location of the ir_datasets data directory, relative to this notebook.
os.environ['IR_DATASETS_HOME'] = '../'

# Collect the per-event and per-day frames in lists and concatenate once at
# the end. Calling pd.concat inside the loop re-copies everything accumulated
# so far on every iteration, which is quadratic in the number of frames.
query_frames = []
corpus_frames = []

for eventId, dailyInfo in eventsMeta.items():

    # One query file per event; tag its rows with the event ID.
    queries = pd.read_csv(f'../crisisfacts/{eventId}.csv')
    queries['ID'] = eventId
    query_frames.append(queries)

    for thisDay in dailyInfo:
        ir_dataset_id = "crisisfacts/%s/%s" % (eventId, thisDay["dateString"])

        pyTerrierDataset = pt.get_dataset(f'irds:{ir_dataset_id}')

        # Materialise this day's corpus and tag it with its event and date.
        dataset = pd.DataFrame(pyTerrierDataset.get_corpus_iter())
        dataset['ID'] = eventId
        dataset['date'] = thisDay['dateString']
        corpus_frames.append(dataset)

# pd.concat raises on an empty list, so fall back to an empty frame.
queries_df = pd.concat(query_frames, ignore_index=True) if query_frames else pd.DataFrame()
dataset_df = pd.concat(corpus_frames, ignore_index=True) if corpus_frames else pd.DataFrame()

# Attach the human-readable event name; the default inner join keeps only IDs
# that are present in event_df.
dataset_df = dataset_df.merge(event_df, on='ID')
queries_df = queries_df.merge(event_df, on='ID')

crisisfacts/001/2017-12-07 documents: 7288it [00:00, 13481.43it/s]
crisisfacts/001/2017-12-08 documents: 19231it [00:01, 15339.66it/s]
crisisfacts/001/2017-12-09 documents: 5839it [00:00, 13343.64it/s]
crisisfacts/001/2017-12-10 documents: 4407it [00:00, 11839.50it/s]
crisisfacts/001/2017-12-11 documents: 3394it [00:00, 7684.72it/s]
crisisfacts/001/2017-12-12 documents: 2805it [00:00, 10584.43it/s]
crisisfacts/001/2017-12-13 documents: 2658it [00:00, 14138.41it/s]
crisisfacts/001/2017-12-14 documents: 2728it [00:00, 24025.83it/s]
crisisfacts/001/2017-12-15 documents: 2665it [00:00, 5439.83it/s]
crisisfacts/002/2018-07-25 documents: 5056it [00:00, 32410.53it/s]
crisisfacts/002/2018-07-26 documents: 7866it [00:00, 10151.40it/s]
crisisfacts/002/2018-07-27 documents: 7433it [00:00, 37735.89it/s]
crisisfacts/002/2018-07-28 documents: 5238it [00:00, 19186.96it/s]
crisisfacts/002/2018-07-29 documents: 4691it [00:00, 6142.05it/s] 
crisisfacts/002/2018-07-30 documents: 251it [00:00, 4048.48it/s]

In [42]:
# Show the (rows, columns) dimensions of the merged document frame.
dataset_df.shape

(1955832, 8)

In [43]:
# Show the (rows, columns) dimensions of the merged query frame.
queries_df.shape

(922, 13)

In [44]:
# Preview the first five document rows to check the columns and values.
dataset_df.head()

Unnamed: 0,event,text,source,source_type,unix_timestamp,docno,ID,date
0,CrisisFACTS-001,Live updates: San Diego County fire is 92 perc...,{'url': 'http://www.sandiegouniontribune.com/n...,News,1512604800,CrisisFACTS-001-News-5-0,1,2017-12-07
1,CrisisFACTS-001,"The Lilac fire now 92 percent contained, Cal F...",{'url': 'http://www.sandiegouniontribune.com/n...,News,1512604800,CrisisFACTS-001-News-5-1,1,2017-12-07
2,CrisisFACTS-001,The county of San Diego has opened a Local Ass...,{'url': 'http://www.sandiegouniontribune.com/n...,News,1512604800,CrisisFACTS-001-News-5-2,1,2017-12-07
3,CrisisFACTS-001,The center is at the Vista branch library on 7...,{'url': 'http://www.sandiegouniontribune.com/n...,News,1512604800,CrisisFACTS-001-News-5-3,1,2017-12-07
4,CrisisFACTS-001,Homeowners also will be able to get informatio...,{'url': 'http://www.sandiegouniontribune.com/n...,News,1512604800,CrisisFACTS-001-News-5-4,1,2017-12-07


In [45]:
# Preview the first five query rows to check the columns and values.
queries_df.head()

Unnamed: 0,query_id,text,indicative_terms,trecis_category_mapping,event_id,event_title,event_dataset,event_description,event_trecis_id,event_type,event_url,ID,EventName
0,CrisisFACTS-General-q001,Have airports closed,airport closed,Report-Factoid,CrisisFACTS-001,Lilac Wildfire 2017,2017_12_07_lilac_wildfire.2017,The Lilac Fire was a fire that burned in north...,TRECIS-CTIT-H-092,Wildfire,https://en.wikipedia.org/wiki/Lilac_Fire,1,Lilac Wildfire 2017
1,CrisisFACTS-General-q002,Have railways closed,rail closed,Report-Factoid,CrisisFACTS-001,Lilac Wildfire 2017,2017_12_07_lilac_wildfire.2017,The Lilac Fire was a fire that burned in north...,TRECIS-CTIT-H-092,Wildfire,https://en.wikipedia.org/wiki/Lilac_Fire,1,Lilac Wildfire 2017
2,CrisisFACTS-General-q003,Have water supplied been contaminated,water supply,Report-EmergingThreats,CrisisFACTS-001,Lilac Wildfire 2017,2017_12_07_lilac_wildfire.2017,The Lilac Fire was a fire that burned in north...,TRECIS-CTIT-H-092,Wildfire,https://en.wikipedia.org/wiki/Lilac_Fire,1,Lilac Wildfire 2017
3,CrisisFACTS-General-q004,How many firefighters are active,firefighters on-duty,Report-Factoid,CrisisFACTS-001,Lilac Wildfire 2017,2017_12_07_lilac_wildfire.2017,The Lilac Fire was a fire that burned in north...,TRECIS-CTIT-H-092,Wildfire,https://en.wikipedia.org/wiki/Lilac_Fire,1,Lilac Wildfire 2017
4,CrisisFACTS-General-q005,How many people are affected,evacuated,Report-Factoid,CrisisFACTS-001,Lilac Wildfire 2017,2017_12_07_lilac_wildfire.2017,The Lilac Fire was a fire that burned in north...,TRECIS-CTIT-H-092,Wildfire,https://en.wikipedia.org/wiki/Lilac_Fire,1,Lilac Wildfire 2017
