### Summarization Model

In [1]:
from transformers import pipeline
import json
from datetime import datetime, timedelta

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Path of the JSON conversation export that the summarization cells open and
# parse with json.load.
CONVERSATION_DATA_PATH = "./test.json"

#### bart-large-cnn

In [None]:
# Summarize a chat export with facebook/bart-large-cnn.
#
# Steps: load the messages, parse and sort their timestamps, split them into
# conversation "bursts" wherever two consecutive messages are more than
# 15 minutes apart, summarize every burst, and write one record per burst
# ({"group_id", "summary", "messages"}) to summarized_by_time.json.

# Load summarization pipeline
print("1. Loading summarization pipeline...")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
print("✅ Pipeline loaded")

# Load data
print("2. Loading data...")
with open(CONVERSATION_DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)
# Accept both payload shapes: {"messages": [...]} or a bare list of messages.
if isinstance(data, dict):
    data = data["messages"]
print(f"✅ Loaded {len(data)} messages")

# Convert timestamps to datetime and sort chronologically
print("3. Processing timestamps...")
for msg in data:
    msg["dt"] = datetime.strptime(msg["timestamp"], "%d/%m/%Y, %I:%M:%S %p")
data = sorted(data, key=lambda x: x["dt"])
print("✅ Timestamps processed and sorted")

# Group messages based on 15-min time gap
print("4. Grouping messages...")
grouped = []
current_group = []
prev_time = None
time_gap = timedelta(minutes=15)
total_messages = len(data)

for i, msg in enumerate(data):
    # The first message always opens the first group; afterwards a gap larger
    # than `time_gap` since the previous message starts a new group.
    if prev_time is None or (msg["dt"] - prev_time) <= time_gap:
        current_group.append(msg)
    else:
        grouped.append(current_group)
        current_group = [msg]
    prev_time = msg["dt"]
    if i % 10 == 0:
        print(f"Processed {i}/{total_messages} messages")

# Flush the trailing group (empty only when there were no messages at all).
if current_group:
    grouped.append(current_group)
print(f"✅ Messages grouped into {len(grouped)} conversations")

# Summarize each group
print("5. Generating summaries...")
summaries = []

for idx, group in enumerate(grouped):
    print(f"\nProcessing group {idx + 1}/{len(grouped)}")
    convo = []

    for msg in group:
        # Optional per-message context lines go in front of the message itself.
        if "context" in msg:
            convo.extend(msg["context"])
        line = f"[{msg['timestamp']}] {msg['speaker']}: {msg['text']}"
        convo.append(line)

    full_text = "\n".join(convo)
    print(f"Text length: {len(full_text)} characters")

    message_ids = [m["id"] for m in group]

    try:
        print("Generating summary...")
        # truncation=True makes the tokenizer cut over-long bursts down to the
        # model's maximum input length instead of failing on them.
        summary = summarizer(
            full_text,
            max_length=80,
            min_length=20,
            do_sample=False,
            truncation=True,
        )
        summary_text = summary[0]["summary_text"]
        print(f"Summary: {summary_text}")
    except Exception as e:
        # Best effort: one failing group must not abort the whole run, so the
        # error is recorded in place of the summary.
        print(f"❌ Error in group {idx}: {e}")
        summary_text = f"Error: {e}"

    summaries.append({
        "group_id": idx,
        "summary": summary_text,
        "messages": message_ids,
    })

# Save to file
print("\n6. Saving results...")
with open("summarized_by_time.json", "w", encoding="utf-8") as f:
    json.dump(summaries, f, ensure_ascii=False, indent=2)

print("✅ Summaries grouped by 15-minute conversation bursts saved to summarized_by_time.json")

#### bart-large-cnn-samsum

In [None]:
# Summarize a chat export with philschmid/bart-large-cnn-samsum.
#
# Steps: load the messages, parse and sort their timestamps, split them into
# conversation "bursts" wherever two consecutive messages are more than
# 15 minutes apart, summarize the de-duplicated "context" lines of every
# burst, and write one record per burst ({"group_id", "summary", "messages"})
# to summarized_by_time_test.json.

# Load the fine-tuned summarization pipeline for chats
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")

# Load new test data
with open(CONVERSATION_DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)
# Accept both payload shapes: {"messages": [...]} or a bare list of messages.
# Iterating a dict payload directly would yield its string keys and fail on
# msg["timestamp"] below.
if isinstance(data, dict):
    data = data["messages"]

# Convert timestamps to datetime and sort chronologically
for msg in data:
    msg["dt"] = datetime.strptime(msg["timestamp"], "%d/%m/%Y, %I:%M:%S %p")
data = sorted(data, key=lambda x: x["dt"])

# Group messages based on 15-minute time gap
grouped = []
current_group = []
prev_time = None
time_gap = timedelta(minutes=15)

for msg in data:
    # The first message always opens the first group; afterwards a gap larger
    # than `time_gap` since the previous message starts a new group.
    if prev_time is None or (msg["dt"] - prev_time) <= time_gap:
        current_group.append(msg)
    else:
        grouped.append(current_group)
        current_group = [msg]
    prev_time = msg["dt"]

# Flush the trailing group (empty only when there were no messages at all).
if current_group:
    grouped.append(current_group)

# Summarize each group
summaries = []

for idx, group in enumerate(grouped):
    # Extract context lines, dropping repeats while keeping first-seen order
    # (dict keys preserve insertion order).
    context_lines = list(dict.fromkeys(
        line
        for msg in group
        for line in msg.get("context", [])
    ))

    # NOTE(review): a group whose messages carry no "context" yields an empty
    # string here; confirm whether such groups should be summarized at all.
    full_text = "\n".join(context_lines)

    message_ids = [m["id"] for m in group]

    try:
        # truncation=True lets the tokenizer cut the input to the model's
        # maximum input length; slicing the string to a fixed number of
        # characters discarded text that would still have fit.
        summary = summarizer(
            full_text,
            max_length=60,
            min_length=15,
            do_sample=False,
            truncation=True,
        )
        summary_text = summary[0]["summary_text"]
    except Exception as e:
        # Best effort: one failing group must not abort the whole run, so the
        # error is recorded in place of the summary.
        summary_text = f"Error: {e}"

    summaries.append({
        "group_id": idx,
        "summary": summary_text,
        "messages": message_ids,
    })

# Save the generated summaries to a file
with open("summarized_by_time_test.json", "w", encoding="utf-8") as f:
    json.dump(summaries, f, ensure_ascii=False, indent=2)

print("✅ Summaries using fine-tuned BART (Samsum) saved to summarized_by_time_test.json")