In [79]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import shutil
import os
from transformers import pipeline
import json
import gzip

In [80]:
# Load the pre-summary results and add a calendar-date column derived from
# the epoch-seconds timestamp of each row.
df = pd.read_csv("./result_df_before_summary.csv")
df = df.assign(date=pd.to_datetime(df['unix_timestamp'], unit='s').dt.date)

# Subset used for summarization: rows whose request id starts with the
# "CrisisFACTS-001-" prefix.
df1 = df.loc[df['request_id'].str.startswith("CrisisFACTS-001-")]

# Build both summarization pipelines (BART first, then Pegasus).
bart_summarizer, pega_summarizer = (
    pipeline("summarization", model=model_name)
    for model_name in ("facebook/bart-large-xsum", "google/pegasus-xsum")
)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
from joblib import Parallel, delayed
import multiprocessing

# Set up parallel processing: one worker per CPU reported by the OS and
# twice as many batches as workers.
num_cores = multiprocessing.cpu_count()
num_batches = num_cores * 2

# Split the texts into batches.
# The column is converted to a plain NumPy array first: passing a pandas
# Series directly to np.array_split goes through the deprecated
# Series.swapaxes and emits a FutureWarning. Each batch is then an ndarray,
# which still provides the .tolist() method used when batches are dispatched.
batches = np.array_split(df1['texts'].to_numpy(), num_batches)

# Function to clean and summarize text
def clean_and_summarize(text, summarizer):
    """Strip hashtags, URLs and mentions from ``text`` and summarize the rest.

    Args:
        text: Raw text to summarize (a ``str``).
        summarizer: Callable invoked as
            ``summarizer(cleaned_text, max_length=20, min_length=10, truncation=True)``.
            It must return a sequence whose first item is a mapping with a
            ``'summary_text'`` key (the shape returned by a Hugging Face
            summarization pipeline).

    Returns:
        The ``'summary_text'`` value of the first result.
    """
    # Remove hashtags, URLs, and mentions
    cleaned_text = re.sub(r'#\S+|https?://\S+|@\S+', '', text)
    # Generate summary. truncation=True asks the pipeline to cut inputs that
    # exceed the model's maximum input length instead of passing them through
    # untruncated, which can make the model fail on long texts.
    summary = summarizer(cleaned_text, max_length=20, min_length=10, truncation=True)
    return summary[0]['summary_text']

# Wrapper function for batch processing
def process_batch(texts, summarizer):
    """Summarize every item of ``texts`` in order and return the summaries as a list."""
    summaries = []
    for text in texts:
        summaries.append(clean_and_summarize(text, summarizer))
    return summaries


def summarize(summarizer):
    """Summarize all module-level ``batches`` with ``summarizer``.

    Each batch is converted to a list and handed to ``process_batch`` as a
    joblib task; the per-batch result lists are then concatenated in batch
    order.

    Args:
        summarizer: Summarization callable forwarded to ``process_batch``.

    Returns:
        A flat list of summaries, one per text across all batches.
    """
    # One delayed process_batch call per batch, run on num_cores workers.
    tasks = (delayed(process_batch)(batch.tolist(), summarizer) for batch in batches)
    results = Parallel(n_jobs=num_cores)(tasks)

    # Flatten the list of per-batch lists into a single list.
    summaries = []
    for batch_result in results:
        summaries.extend(batch_result)
    return summaries

  return bound(*args, **kwds)


In [86]:
import openai

api = os.getenv("OPENAI_API_KEY")

openai.api_key = api

process = psutil.Process(os.getpid())  # Get current process
start_memory = process.memory_info().rss  # Memory usage at start (in bytes)
start_time = time.time()  # Start time

# Create the API client once and reuse it for every request; it does not
# depend on the row being processed, so rebuilding it per iteration only adds
# overhead. It is created after the timer starts so that its setup cost stays
# inside the measured region.
client = openai.OpenAI()

# Ask the model one question per row of df and collect the answers in row order.
answer_output = []
# row_number counts processed rows from 1, independent of the index labels of df.
for row_number, (_, row) in enumerate(df.iterrows(), start=1):
    question = str(row['question'] + "?")
    provided_text = row['texts']

    prompt = f"""
    You are a helpful assistant. Answer the question based only on the text provided below. 
    If no answers can be found at all, return "unanswerable"

    Don't make the responses conversational.
    Expressions like hundreds of thousands can be answers to questions asking how many or how much.
    Do not line break the text and just give me the output.

    Text:
    {provided_text}

    Question:
    {question}
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=150
    )
    answer = response.choices[0].message.content
    answer_output.append(answer)
    # Print progress every 50 rows
    if row_number % 50 == 0:
        print(f"Processed {row_number} rows")

# Calculate runtime and memory usage
end_time = time.time()  # End time
end_memory = process.memory_info().rss  # Memory usage at end (in bytes)
runtime = end_time - start_time
memory_used = (end_memory - start_memory) / 1024 / 1024  # Convert bytes to MB

print(runtime)
print(memory_used)

187.6734289481294
36.46804


In [83]:
# Measure wall-clock time and the change in this process's resident memory
# while the Pegasus summaries are generated.
# NOTE(review): rss is read from the current process only; if joblib runs its
# workers as separate processes their memory is not counted here - confirm.
process = psutil.Process(os.getpid())
start_memory, start_time = process.memory_info().rss, time.time()

pega_summary = summarize(pega_summarizer)

# Take the closing readings (time first, then memory) and report both.
end_time, end_memory = time.time(), process.memory_info().rss
runtime = end_time - start_time  # seconds
memory_used = (end_memory - start_memory) / (1024 * 1024)  # bytes -> MB

print(runtime, memory_used, sep="\n")

Your max_length is set to 20, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_length is set to 20, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 20, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 20, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_len

613.3333511352539
340.77734375


In [82]:
# Measure wall-clock time and the change in this process's resident memory
# while the BART summaries are generated.
# NOTE(review): rss is read from the current process only; if joblib runs its
# workers as separate processes their memory is not counted here - confirm.
process = psutil.Process(os.getpid())
start_memory, start_time = process.memory_info().rss, time.time()

bart_summary = summarize(bart_summarizer)

# Take the closing readings (time first, then memory) and report both.
end_time, end_memory = time.time(), process.memory_info().rss
runtime = end_time - start_time  # seconds
memory_used = (end_memory - start_memory) / (1024 * 1024)  # bytes -> MB

print(runtime, memory_used, sep="\n")

Your max_length is set to 20, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 20, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


427.0489273071289
2.078125


In [67]:
# Give each model its own copy of the full results frame, with an empty
# "summary" column to be filled in below.
df_gpt = df.copy()
df_pega = df.copy()
df_bart = df.copy()

df_gpt["summary"] = None
df_pega["summary"] = None
df_bart["summary"] = None

# answer_output holds one answer per row of df, in iteration order, so it is
# written positionally and sized by its own length instead of a fixed row count.
df_gpt.iloc[0:len(answer_output), df_gpt.columns.get_loc("summary")] = answer_output

# pega_summary / bart_summary were generated from df1 (the filtered subset of
# df) in row order. Writing them back through df1's index labels puts each
# summary on the row it was generated from, even when df1 is not exactly the
# leading rows of df.
df_pega.loc[df1.index, "summary"] = pega_summary
df_bart.loc[df1.index, "summary"] = bart_summary

In [85]:
def _write_submission(frame, path):
    """Write ``frame`` to ``path`` as JSON Lines: one fact object per row.

    Args:
        frame: DataFrame providing the columns ``request_id``, ``summary``,
            ``unix_timestamp``, ``avg_importance``, ``docno_list`` and ``q_id``.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, 'w') as f:
        for _, row in frame.iterrows():
            fact_ = {
                "requestID": row['request_id'],
                "factText": row['summary'],
                "unixTimestamp": int(row['unix_timestamp']),
                "importance": float(row['avg_importance']),
                # NOTE(review): written exactly as stored in the frame; if this
                # column was read from CSV it may be a string rather than a
                # list - confirm against the expected submission format.
                "sources": row['docno_list'],
                "streamID": None,
                "informationNeeds": row['q_id']
            }

            # Write each dictionary as a separate JSON object on a new line
            json.dump(fact_, f)
            f.write("\n")  # Add a newline after each JSON object


# open(..., 'w') does not create missing directories, so make sure the
# output folder exists before writing into it.
os.makedirs("./submission_json", exist_ok=True)

# Same writer for all three models, in the original order: Pegasus, BART, GPT.
for frame, path in (
    (df_pega, "./submission_json/my_submission_pega_detail.json"),
    (df_bart, "./submission_json/my_submission_bart_detail.json"),
    (df_gpt, "./submission_json/my_submission_gpt_detail.json"),
):
    _write_submission(frame, path)

In [84]:
input_folder = "./submission_json"
output_folder = "./submissions"

# Create the destination directory if it is not there yet.
os.makedirs(output_folder, exist_ok=True)

# Gzip every regular file found directly inside the input folder; the
# compressed copy keeps the original file name with ".gz" appended.
for file_name in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, f"{file_name}.gz")

    # Skip anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(input_file_path):
        continue

    # Stream the raw bytes into the gzip file.
    with open(input_file_path, "rb") as f_in, gzip.open(output_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    print(f"Compressed: {input_file_path} -> {output_file_path}")

Compressed: ./submission_json/my_submission_gpt_detail.json -> ./submissions/my_submission_gpt_detail.json.gz
Compressed: ./submission_json/my_submission_bart_detail.json -> ./submissions/my_submission_bart_detail.json.gz
Compressed: ./submission_json/my_submission_pega_detail.json -> ./submissions/my_submission_pega_detail.json.gz
