### Summarizer Demo

In [1]:
from transformers import pipeline
import json
from datetime import datetime, timedelta

  from .autonotebook import tqdm as notebook_tqdm


### Prepare the data

In [2]:
# Input dataset paths, written relative to the current working directory.
# RAW_DATA is the file opened by the summarization cell further down.
RAW_DATA = "./test_data_raw.json"
# NORMALIZED_DATA is not referenced anywhere else in the visible notebook.
# NOTE(review): "nornalized" looks like a misspelling of "normalized" — confirm
# against the actual file name on disk before renaming anything.
NORMALIZED_DATA = "./test_data_nornalized.json"

### Run the summarization model (choose which dataset to load)

In [None]:
# Summarize chat messages in time-based groups.
#
# Pipeline of this cell:
#   1. load the messages from a JSON file,
#   2. parse each message's "timestamp" and sort chronologically,
#   3. start a new group whenever two consecutive messages are more than
#      `time_gap` apart,
#   4. summarize the de-duplicated "context" lines of each group,
#   5. write one {"group_id", "summary", "messages"} record per group to disk.

# Load the fine-tuned summarization pipeline for chats
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")

# Dataset to summarize. Point this at NORMALIZED_DATA to run on the other file
# (assumes it uses the same message schema — TODO confirm).
DATA_PATH = RAW_DATA

# Day-first date with a 12-hour clock, e.g. "31/12/2023, 11:59:59 PM".
TIMESTAMP_FORMAT = "%d/%m/%Y, %I:%M:%S %p"
# Character budget applied to the text before it is handed to the model.
MAX_INPUT_CHARS = 1024
OUTPUT_PATH = "summarized_by_time_test.json"

# Load new test data
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert timestamps to datetime and sort.
# The parsed value is stored on each message under "dt"; it is only used for
# ordering/grouping and is never written to the output file.
for msg in data:
    msg["dt"] = datetime.strptime(msg["timestamp"], TIMESTAMP_FORMAT)
data = sorted(data, key=lambda m: m["dt"])

# Group messages based on 5-minute time gap between consecutive messages
grouped = []
current_group = []
prev_time = None
time_gap = timedelta(minutes=5)

for msg in data:
    # A gap strictly larger than `time_gap` closes the current group; the very
    # first message (no predecessor) always joins the initial group.
    if prev_time is not None and msg["dt"] - prev_time > time_gap:
        grouped.append(current_group)
        current_group = [msg]
    else:
        current_group.append(msg)
    prev_time = msg["dt"]

# Flush the trailing group (empty only when there were no messages at all).
if current_group:
    grouped.append(current_group)

# Summarize each group
summaries = []

for idx, group in enumerate(grouped):
    # Collect context lines across the group, dropping repeats while keeping
    # first-seen order (dict keys preserve insertion order).
    context_lines = list(dict.fromkeys(
        line
        for member in group
        for line in member.get("context", [])
    ))

    full_text = "\n".join(context_lines)
    full_text = full_text[:MAX_INPUT_CHARS]  # trim to fit model input

    if not full_text.strip():
        # Nothing to summarize: skip the model call instead of storing text
        # generated from an empty prompt.
        summary_text = ""
    else:
        try:
            # truncation=True is a token-level safety net: the character trim
            # above does not bound the token count for text that tokenizes
            # densely (e.g. emoji or non-Latin scripts).
            result = summarizer(
                full_text,
                max_length=60,
                min_length=15,
                do_sample=False,
                truncation=True,
            )
            summary_text = result[0]["summary_text"]
        except Exception as e:
            # Best effort: one failing group must not abort the whole run, so
            # the error text is recorded in place of the summary.
            summary_text = f"Error: {e}"

    summaries.append({
        "group_id": idx,
        "summary": summary_text,
        "messages": [m["id"] for m in group],
    })

# Save the generated summaries to a file
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(summaries, f, ensure_ascii=False, indent=2)

print(f"✅ Summaries using fine-tuned BART (Samsum) saved to {OUTPUT_PATH}")