In [10]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import shutil
import os
from transformers import pipeline
import json
import gzip

In [2]:
# Load the pre-summary results and derive a calendar date from the unix timestamp
df = pd.read_csv("./result_df_before_summary.csv")
df['date'] = pd.to_datetime(df['unix_timestamp'], unit='s').dt.date
dff = df[["request_id","query","question","date","texts"]]
# Keep only the CrisisFACTS-001 requests. The explicit .copy() makes df1 an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning (df1 would otherwise be a slice of dff).
df1 = dff[dff['request_id'].str.startswith("CrisisFACTS-001-")].copy()

# Abstractive summarization pipelines (BART and Pegasus, both XSum checkpoints)
bart_summarizer = pipeline("summarization", model="facebook/bart-large-xsum")
pega_summarizer = pipeline("summarization", model="google/pegasus-xsum")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from joblib import Parallel, delayed
import multiprocessing

# Set up parallel processing: one worker per CPU core, twice as many batches
num_cores = multiprocessing.cpu_count()
num_batches = num_cores * 2

# Split the texts into batches. The column is converted to a plain NumPy array
# first: np.array_split on a pandas Series goes through the deprecated
# Series.swapaxes and emits a FutureWarning. Each batch is then an ndarray,
# which still supports .tolist().
batches = np.array_split(df1['texts'].to_numpy(), num_batches)

# Function to clean and summarize text
def clean_and_summarize(text, summarizer):
    """Strip hashtags, URLs and mentions from ``text`` and summarize the rest.

    Args:
        text: Raw text to summarize. Non-string values (for example NaN from
            an empty CSV cell) are treated as having nothing to summarize.
        summarizer: Callable invoked as ``summarizer(cleaned_text,
            max_length=52, min_length=30, truncation=True)``; it must return a
            sequence whose first item is a dict with a ``'summary_text'`` key.

    Returns:
        The generated summary string, or ``""`` when no text is left after
        cleaning.
    """
    # re.sub raises TypeError on non-strings, so bail out early
    if not isinstance(text, str):
        return ""
    # Remove hashtags, URLs, and mentions
    cleaned_text = re.sub(r'#\S+|https?://\S+|@\S+', '', text)
    # A post made only of tags/links/mentions leaves nothing to summarize
    if not cleaned_text.strip():
        return ""
    # Generate summary; truncation=True asks the pipeline to truncate inputs
    # that exceed the model's maximum input length
    summary = summarizer(cleaned_text, max_length=52, min_length=30, truncation=True)
    return summary[0]['summary_text']

# Wrapper function for batch processing
def process_batch(texts, summarizer):
    """Summarize each entry of ``texts`` in order and return the summaries as a list."""
    summaries = []
    for item in texts:
        summaries.append(clean_and_summarize(item, summarizer))
    return summaries

# Apply parallel processing
# One delayed job per batch, each summarizing its texts with the Pegasus pipeline
jobs = (delayed(process_batch)(chunk.tolist(), pega_summarizer) for chunk in batches)
results = Parallel(n_jobs=num_cores)(jobs)

# Flatten the per-batch summary lists into a single column on df1
flat_summaries = []
for batch_result in results:
    flat_summaries.extend(batch_result)
df1['pega_summary'] = flat_summaries


  return bound(*args, **kwds)
Your max_length is set to 52, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_length is set to 52, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 52, but your input_length is only 22. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Your max_length is set to 52, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('

In [5]:
# Copy the Pegasus summaries onto the full frame. The assignment aligns on the
# row index, so rows of df that are not present in df1 get NaN in "summary".
df["summary"] = df1["pega_summary"]

In [6]:
# df.to_csv('pega.csv', index=False)

In [7]:
submission_path = "./submission_json/my_submission_pega_detail.json"
# open() does not create missing directories, so make sure the folder exists
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# Only rows that actually have a summary are written: a missing summary is NaN,
# which json.dump would emit as the bare token NaN (not valid JSON).
facts = df[df['summary'].notna()]

with open(submission_path, 'w') as f:
    for index, row in facts.iterrows():
        fact_ = {
            "requestID": row['request_id'],
            "factText": row['summary'],
            "unixTimestamp": int(row['unix_timestamp']),
            "importance": float(row['avg_importance']),
            # NOTE(review): docno_list and q_id come straight from the CSV, so
            # they may be string-encoded lists rather than real lists; confirm
            # against the expected submission format.
            "sources": row['docno_list'],
            "streamID": None,
            "informationNeeds": row['q_id']
        }

        # Write each dictionary as a separate JSON object on its own line
        json.dump(fact_, f)
        f.write("\n")

In [9]:
input_folder = "./submission_json"
output_folder = "./submissions"

# Create the destination directory up front (no-op when it already exists)
os.makedirs(output_folder, exist_ok=True)

# Gzip every regular file found directly inside the input folder
for file_name in os.listdir(input_folder):
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, f"{file_name}.gz")

    # Anything that is not a plain file (e.g. a sub-directory) is skipped
    if not os.path.isfile(input_file_path):
        continue

    # Stream the raw bytes through gzip into "<name>.gz" in the output folder
    with open(input_file_path, "rb") as raw, gzip.open(output_file_path, "wb") as packed:
        shutil.copyfileobj(raw, packed)
    print(f"Compressed: {input_file_path} -> {output_file_path}")

Compressed: ./submission_json/my_submission_bart_detail.json -> ./submissions/my_submission_bart_detail.json.gz
Compressed: ./submission_json/my_submission_pega_detail.json -> ./submissions/my_submission_pega_detail.json.gz
