In [1]:
import pandas as pd
from transformers import pipeline
import math
from tqdm import tqdm

# Locations of the scraped input and of the summarized output
input_file = 'scraped_processors.csv'  # Replace with your input CSV file path
output_file = 'output2.csv'  # Output CSV file path

# Load the input rows, decoding the file as ISO-8859-1
df = pd.read_csv(filepath_or_buffer=input_file, encoding='ISO-8859-1')

# Build the transformers summarization pipeline used by summarize_text
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Function to chunk text
def chunk_text(text, max_length=1024):
    """Split *text* into whitespace-delimited chunks under a character budget.

    Words are never cut. Every word is charged ``len(word) + 1`` characters
    (the extra one pays for a joining space), and a new chunk is started as
    soon as adding the next word would push the running charge above
    ``max_length``. The budget counts characters, not model tokens.

    Args:
        text: String to split; runs of whitespace are collapsed to one space.
        max_length: Character budget per chunk.

    Returns:
        A list of non-empty chunk strings in their original order. The list is
        empty when ``text`` contains no words. A single word that is longer
        than the budget is returned as a chunk of its own.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        cost = len(word) + 1  # +1 for the space
        # Flush only when something has been collected: an oversized word
        # arriving at an empty buffer must not emit an empty chunk.
        if current_chunk and current_length + cost > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Function to summarize content
def summarize_text(text):
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is split with ``chunk_text``. Each chunk is summarized with the
    module-level ``summarizer`` pipeline, targeting roughly 30% of the chunk's
    word count and capped at 150. Chunks whose target is 10 or less are passed
    through unchanged.

    Args:
        text: The content to summarize. Non-string values (for example a
            missing CSV cell, which pandas loads as a float NaN) and blank
            strings yield an empty summary.

    Returns:
        The per-chunk results joined with single spaces, or ``""`` when there
        is nothing to summarize.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    summarized_chunks = []

    for chunk in chunk_text(text):
        if not chunk:
            # An empty chunk carries no content and would only add stray
            # spaces to the joined result.
            continue

        input_length = len(chunk.split())
        max_length = min(math.ceil(input_length * 0.3), 150)

        if max_length <= 10:  # If the chunk is too small to summarize meaningfully
            summarized_chunks.append(chunk)
            continue

        # Only reached with max_length > 10, so min_length stays positive and
        # strictly below max_length.
        min_length = min(max_length - 10, 30)
        summary = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # chunk_text never cuts a word, so one very long word can exceed
            # the model's input limit; let the tokenizer truncate it instead
            # of failing.
            truncation=True,
        )
        summarized_chunks.append(summary[0]['summary_text'])

    return " ".join(summarized_chunks)

# Summarize the content of every row with progress tracking, keeping the
# identifying columns next to each summary.
summaries = []
rows = zip(df['Processor'], df['Source'], df['Content'])
for processor, source, content in tqdm(rows, total=len(df), desc="Summarizing content"):
    summaries.append({
        'Processor': processor,
        'Source': source,
        'Summary': summarize_text(content),
    })

# Create a DataFrame with the summaries; the columns are pinned so the header
# row is still written when the input has no rows.
output_df = pd.DataFrame(summaries, columns=['Processor', 'Source', 'Summary'])

# Save to the configured output path so the file written and the file
# reported below are the same.
output_df.to_csv(output_file, index=False, encoding='utf-8')

print("Summarization complete and saved to", output_file)

Summarizing content: 100%|██████████| 44/44 [56:02<00:00, 76.43s/it] 

Summarization complete and saved to output2.csv



