In [7]:
# Catalogue of the known event numbers and the crisis each one refers to.
# Kept under its own name so that choosing a subset below does not throw the
# full list away.
EVENT_CATALOGUE = {
    "001": "Lilac Wildfire 2017",
    "002": "Cranston Wildfire 2018",
    "003": "Holy Wildfire 2018",
    "004": "Hurricane Florence 2018",
    "005": "2018 Maryland Flood",
    "006": "Saddleridge Wildfire 2019",
    "007": "Hurricane Laura 2020",
    "008": "Hurricane Sally 2020",
    "009": "Beirut Explosion, 2020",
    "010": "Houston Explosion, 2020",
    "011": "Rutherford TN Floods, 2020",
    "012": "TN Derecho, 2020",
    "013": "Edenville Dam Fail, 2020",
    "014": "Hurricane Dorian, 2019",
    "015": "Kincade Wildfire, 2019",
    "016": "Easter Tornado Outbreak, 2020",
    "017": "Tornado Outbreak, 2020 Apr",
    "018": "Tornado Outbreak, 2020 March",
}

# Event numbers this notebook actually processes.
# Use list(EVENT_CATALOGUE) to run every event.
eventNoList = ["001"]

In [14]:
# Comma-separated event numbers; each comma-delimited piece becomes one
# entry of eventNoList (this rebinds any earlier eventNoList).
event_input = '001,002'
eventNoList = list(event_input.split(sep=','))

In [28]:
import requests
import ir_datasets
import pandas as pd

def getDaysForEventNo(eventNo, timeout=30):
    """Download and parse the request (day) list for one event.

    Args:
        eventNo: Event number as a string, e.g. '001'. It is concatenated
            into the download URL as-is.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely.

    Returns:
        The parsed JSON payload. Callers in this notebook slice it and read
        the "dateString" and "requestID" keys of each entry, so it is
        presumably a list of dicts, one per day.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within `timeout`.
    """
    # We will download a file containing the day list for an event
    url = "http://trecis.org/CrisisFACTs/CrisisFACTS-"+eventNo+".requests.json"

    response = requests.get(url, timeout=timeout)
    # Fail with the HTTP status instead of an opaque JSON decoding error
    # when the event number does not exist or the server is unhappy.
    response.raise_for_status()

    return response.json()

def get_eventsMeta(eventNoList, days):
    """Collect the leading day entries for every requested event.

    Args:
        eventNoList: Iterable of event numbers (strings such as '001').
        days: Slice bound; only the first `days` entries of each event's
            day list are kept.

    Returns:
        Dict mapping each event number to its truncated day list.
    """
    return {
        event_no: getDaysForEventNo(event_no)[:days]
        for event_no in eventNoList
    }

In [30]:
# Keep only the first day of each selected event.
eventsMeta = get_eventsMeta(eventNoList, days=1)

In [None]:
import json
from pathlib import Path

# NOTE(review): personal contact details are hardcoded here and will be
# committed with the notebook; consider loading them from the environment
# or an untracked file instead.
credentials = {
    "institution": "Georgetown University", # University, Company or Public Agency Name
    "contactname": "JaeHo Bahng", # Your Name
    "email": "jaheo127@gmail.com", # A contact email address
    "institutiontype": "Academic" # Either 'Research', 'Industry', or 'Public Sector'
}

# Write this to a file so it can be read when needed.
# The target directory is created first: opening the file for writing
# raises FileNotFoundError when '../../auth' does not exist yet.
auth_path = Path('../../auth/crisisfacts.json')
auth_path.parent.mkdir(parents=True, exist_ok=True)

with auth_path.open('w', encoding='utf-8') as f:
    json.dump(credentials, f)

In [None]:
import pyterrier as pt
# from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
import os
import pandas as pd
import psutil
import time
from rerankers import Reranker
import openai
from dotenv import load_dotenv
# Read the project-level .env file into the process environment, then pick
# up the OpenAI key from it (None when the variable is not defined).
load_dotenv(dotenv_path='../../.env')
api = os.environ.get("OPENAI_API_KEY")

# Names of the retrieval weighting models available for experiments.
models = [
    "BM25",         # Okapi BM25
    "TF_IDF",       # Term Frequency - Inverse Document Frequency
    "PL2",          # Divergence from Randomness model
    "InL2",         # Inverse document length normalized
    "DPH",          # Divergence from Randomness - DPH
    "DirichletLM",  # Language Model with Dirichlet smoothing
    "Hiemstra_LM",  # Hiemstra Language Model
    "DFRee",        # Divergence-Free model
]

# Process-wide settings, applied in the same order as before.
# NOTE(review): ir_datasets is imported in an earlier cell - confirm that
# setting IR_DATASETS_HOME at this point is early enough to take effect.
os.environ.update({
    "PT_NO_PROGRESS": "1",
    'IR_DATASETS_HOME': '../../',
})

class crisis:
    """Retrieve, rerank and summarise documents for a set of crisis events.

    The pipeline has three stages, each exposed as a method:

    1. `rank_rerank_colbert` - first-stage retrieval with a PyTerrier
       weighting model, then reranking with a ColBERT reranker.
    2. `group_doc` - collapse the reranked rows to one row per
       (request_id, q_id).
    3. `gpt_summary` - ask an OpenAI chat model to answer each question
       from the grouped text.
    """

    def __init__(self, events):
        """Store the event metadata.

        Args:
            events: Dict mapping an event id to a list of per-day dicts.
                Each day dict must provide "dateString" and "requestID".
        """
        self.eventsMeta = events

    def rank_rerank_colbert(self, model = 'BM25'):
        """Retrieve and rerank documents for every stored event/day/query.

        Args:
            model: Name of the PyTerrier weighting model for the first stage.

        Returns:
            Tuple `(final_df, runtime, memory_used)`:
            - final_df: one row per kept document, with the retrieval
              columns plus Event, request_id, date, q_id, question,
              rerank_score, rerank_rank, rerank_doc, unix_timestamp,
              formatted_datetime, min, max and importance. It is an empty
              DataFrame without columns when no day produced a result.
            - runtime: wall-clock seconds spent in this call.
            - memory_used: change in resident set size, in MB.
        """
        process = psutil.Process(os.getpid())  # Get current process
        start_memory = process.memory_info().rss  # Memory usage at start (in bytes)
        start_time = time.time()  # Start time

        ranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type='colbert')

        # Per-query frames are collected and concatenated once at the end;
        # growing a DataFrame inside the loop copies it on every iteration.
        collected = []

        for eventId, dailyInfo in self.eventsMeta.items():
            for thisDay in dailyInfo:
                try:
                    self._rerank_day(eventId, thisDay, model, ranker, collected)
                except Exception as exc:
                    # Best effort: one failing day must not abort the run.
                    # The failure is reported rather than swallowed, and
                    # KeyboardInterrupt/SystemExit are no longer caught.
                    print(f"Skipping event {eventId} day {thisDay!r}: {type(exc).__name__}: {exc}")
                    continue

        if not collected:
            # Nothing was retrieved, so there are no columns to post-process.
            runtime = time.time() - start_time
            memory_used = (process.memory_info().rss - start_memory) / 1024 / 1024
            return pd.DataFrame(), runtime, memory_used

        final_df = pd.concat(collected, ignore_index=True)
        final_df['formatted_datetime'] = pd.to_datetime(final_df['unix_timestamp'], unit='s')

        min_max = (
            final_df.groupby(['request_id'])
            .agg(
                min=('rerank_score','min'),
                max=('rerank_score', 'max')
            )
            .reset_index()
        )

        final_df = final_df.merge(min_max, on='request_id', how='left')
        # Min-max normalise the rerank score within each request.
        # If every score of a request is identical this is 0/0 and yields NaN.
        final_df['importance'] = (final_df['rerank_score'] - final_df['min']) / (final_df['max'] - final_df['min'])

        # Calculate runtime and memory usage
        end_time = time.time()  # End time
        end_memory = process.memory_info().rss  # Memory usage at end (in bytes)
        runtime = end_time - start_time
        memory_used = (end_memory - start_memory) / 1024 / 1024  # Convert bytes to MB

        return final_df, runtime, memory_used

    def _rerank_day(self, eventId, thisDay, model, ranker, collected):
        """Index one event/day, run every query and append results to `collected`.

        Frames are appended as soon as each query finishes, so results
        gathered before a later query fails are kept by the caller.
        """
        ir_dataset_id = "crisisfacts/%s/%s" % (eventId, thisDay["dateString"])
        print(ir_dataset_id, " processing")

        pyTerrierDataset = pt.get_dataset(f'irds:{ir_dataset_id}')
        queries = pyTerrierDataset.get_topics()
        dataset = pd.DataFrame(pyTerrierDataset.get_corpus_iter(), columns=['docno', 'text', 'unix_timestamp'])

        # NOTE(review): IndexingType(3) is presumably the in-memory index type - confirm.
        indexer = pt.IterDictIndexer("None", type=pt.index.IndexingType(3), meta=["docno", "text"], meta_lengths=[0, 200])
        index = indexer.index(pyTerrierDataset.get_corpus_iter())

        retriever = pt.terrier.Retriever(index, wmodel=model, metadata=["docno", "text"])
        retriever.setControl("termpipelines", "Stopwords,PorterStemmer")

        for _, row in queries.iterrows():
            retriever_df = pd.DataFrame(retriever.search(row['indicative_terms']))
            # Keep rows that have text and a first-stage rank below 50.
            # .copy() lets the column assignments below act on an
            # independent frame instead of a filtered view.
            keep = retriever_df['text'].notnull() & (retriever_df['rank'] < 50)
            retriever_df = retriever_df[keep].copy()
            if retriever_df.empty:
                continue

            retriever_df['docid'] = retriever_df['docid'].astype(int)
            retriever_df['Event'] = eventId
            retriever_df['request_id'] = thisDay['requestID']
            retriever_df['date'] = thisDay['dateString']
            retriever_df['q_id'] = row['qid']
            retriever_df['question'] = row['text']

            # Rerank. Plain lists are passed so that documents and ids are
            # paired by position, independent of the filtered frame's index labels.
            result = ranker.rank(
                query=row['indicative_terms'],
                docs=retriever_df['text'].tolist(),
                doc_ids=retriever_df['docid'].tolist(),
            )

            reranked = pd.DataFrame({
                'rerank_score': [item.score for item in result.results],
                'rerank_rank': [item.rank for item in result.results],
                'rerank_doc': [item.doc_id for item in result.results],
            })

            retriever_df = retriever_df.merge(reranked, left_on='docid', right_on='rerank_doc', how='left')

            # Keep the documents with rerank_rank <= 5.
            retriever_df = retriever_df[retriever_df['rerank_rank']<=5]

            # Clean: order by rerank position and attach the timestamp.
            result_df = retriever_df.sort_values('rerank_rank', ascending=True).reset_index(drop=True)
            result_df = result_df.merge(dataset[['docno', 'unix_timestamp']], on='docno', how='left')

            collected.append(result_df)

    def group_doc(self, df):
        """Collapse reranked rows to one row per (request_id, q_id).

        Args:
            df: Frame with the columns request_id, q_id, text, docno,
                importance, unix_timestamp, question and query.

        Returns:
            DataFrame with request_id, q_id, texts (space-joined text),
            docno_list, avg_importance, unix_timestamp (minimum), question
            and query (minimum of each group).
        """
        result_df = (
            df.groupby(['request_id', 'q_id'])
            .agg(
                texts=('text', ' '.join),                     # Join all text values into a single string
                docno_list=('docno', list),                   # Collect docno values in a list
                avg_importance=('importance', 'mean'),        # Calculate the average importance
                unix_timestamp =('unix_timestamp', 'min'),
                question = ('question', 'min'),
                query = ('query','min')
            )
            .reset_index()                                    # Reset index for a clean DataFrame
        )
        return result_df

    def gpt_summary(self, df, api):
        """Answer each row's question from its grouped text with an OpenAI model.

        Args:
            df: Frame with 'question' and 'texts' columns. A 'summary'
                column is added to this same frame (it is modified in place).
            api: OpenAI API key. When None, the client falls back to the
                OPENAI_API_KEY environment variable.

        Returns:
            Tuple `(df, runtime, memory_used)`: the input frame with the new
            'summary' column, wall-clock seconds, and RSS change in MB.
        """
        # Set your OpenAI API key
        openai.api_key = api
        # One client for the whole run, built with the key that was passed
        # in; previously a fresh client was created per row and the `api`
        # argument never reached it.
        client = openai.OpenAI(api_key=api)

        process = psutil.Process(os.getpid())  # Get current process
        start_memory = process.memory_info().rss  # Memory usage at start (in bytes)
        start_time = time.time()  # Start time

        answer_output = []
        for _, row in df.iterrows():
            question = str(row['question'] + "?")
            provided_text = row['texts']

            prompt = f"""
            You are a helpful assistant. Answer the question based only on the text provided below. 
            If no answers can be found at all, return "unanswerable"

            Don't make the responses conversational.
            Expressions like hundreds of thousands can be answers to questions asking how many or how much.
            Do not line break the text and just give me the output.

            Text:
            {provided_text}

            Question:
            {question}
            """

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=150
            )
            answer_output.append(response.choices[0].message.content)

        df['summary'] = answer_output

        end_time = time.time()  # End time
        end_memory = process.memory_info().rss  # Memory usage at end (in bytes)

        # Calculate metrics
        runtime = end_time - start_time
        memory_used = (end_memory - start_memory) / 1024 / 1024  # Convert bytes to MB

        # Return results and performance metrics
        return df, runtime, memory_used



In [41]:
# Build the pipeline for the selected events and run BM25 retrieval
# followed by ColBERT reranking.
mine = crisis(eventsMeta)
final_df, time_taken, memory_used = mine.rank_rerank_colbert('BM25')

Loading ColBERTRanker model answerdotai/answerai-colbert-small-v1 (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model answerdotai/answerai-colbert-small-v1, this might take a while...
Linear Dim set to: 96 for downcasting
crisisfacts/001/2017-12-07  processing
There are multiple query fields available: ('text', 'indicative_terms', 'trecis_category_mapping', 'event_id', 'event_title', 'event_dataset', 'event_description', 'event_trecis_id', 'event_type', 'event_url'). To use with pyterrier, provide variant or modify dataframe to add query column.


crisisfacts/001/2017-12-07 documents: 7288it [00:00, 14802.68it/s]
crisisfacts/001/2017-12-07 documents: 7288it [00:01, 4660.61it/s]


crisisfacts/002/2018-07-25  processing
There are multiple query fields available: ('text', 'indicative_terms', 'trecis_category_mapping', 'event_id', 'event_title', 'event_dataset', 'event_description', 'event_trecis_id', 'event_type', 'event_url'). To use with pyterrier, provide variant or modify dataframe to add query column.


crisisfacts/002/2018-07-25 documents: 5056it [00:00, 38269.86it/s]
crisisfacts/002/2018-07-25 documents: 5056it [00:01, 4754.96it/s]


In [42]:
# Collapse the reranked rows to one row per (request_id, q_id).
result_df = mine.group_doc(df=final_df)

In [None]:
# a: frame with the added 'summary' column, b: runtime in seconds,
# c: memory change in MB.
a, b, c = mine.gpt_summary(df=result_df, api=api)

In [None]:
from crisis_summary.crisis_summary import crisis