In [1]:
import pandas as pd
from typing import List, Dict, Tuple, Optional
import json
import re
import torch
import time

In [2]:
# Relative parquet paths of the CoQA splits inside the Hugging Face dataset repo.
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet'}
# Only the training split is loaded here; the validation path is not read in this notebook.
df = pd.read_parquet("hf://datasets/stanfordnlp/coqa/" + splits["train"])

Select the dialogues that contain exactly 20 questions.

In [3]:
# Row labels of every dialogue whose question list has exactly 20 entries.
rows_numb = [
    row
    for row in range(len(df['questions']))
    if len(df['questions'][row]) == 20
]

In [None]:
# First 100 selected rows; this is the subset iterated by the approach 1 and approach 3 loops.
rows_100 = rows_numb[:100]

In [4]:
# First 1000 selected rows; only referenced by the (commented-out) topic-collection loop.
rows_1000 = rows_numb[:1000]

In [5]:
def extract_json_from_response(response: str) -> Optional[Dict]:
    """Extract a JSON object from an LLM response, tolerating surrounding text.

    Strategies are tried in order:
      1. Parse the whole response as JSON.
      2. Parse the first flat ``{...}`` span (no nested braces) found in the text.
      3. Pull a bare ``"confidence": <number>`` pair out of the text.

    Args:
        response: Raw text produced by the model.

    Returns:
        The parsed object as a dict, or None when the input is not a string or
        nothing usable was found. Valid JSON that is not an object (a bare
        number, a list, ...) is never returned as-is, because callers index the
        result by key.
    """
    if not isinstance(response, str):
        return None

    # Candidate texts to parse: the full response first, then the first
    # brace-delimited span embedded in it (e.g. inside prose or a code fence).
    candidates = [response]
    brace_match = re.search(r'\{[^{}]*\}', response)
    if brace_match:
        candidates.append(brace_match.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # Last resort: the response is not valid JSON but still contains the field.
    confidence_match = re.search(r'"confidence":\s*([0-9]*\.?[0-9]+)', response)
    if confidence_match:
        return {"confidence": float(confidence_match.group(1))}

    return None

In [None]:
from transformers import pipeline

# Text-generation pipeline that is passed as `pipe` to every LLM helper below
# (topic/aspect extraction, question generation and question comparison).
pipe = pipeline("text-generation", model="Qwen/Qwen3-4B-Instruct-2507")

# Approach 1

### Step 1: extract topic

In this approach, topics are extracted based solely on the first 60% of the user's questions. These questions will be referred to as prompt questions (PQ) going forward.

In [None]:
def extract_main_topic(pipe, questions: list[str], temperature: float = 0.7) -> str:
    """Extract the main topic/theme from a set of related questions from the same user context.

    Args:
        pipe: Chat-capable text-generation pipeline. It is called with a list of
            chat messages and must expose ``pipe.tokenizer.eos_token_id``.
        questions: Questions asked so far in one dialogue.
        temperature: Kept for backward compatibility. Decoding is greedy
            (``do_sample=False``), so the value has no effect and is no longer
            forwarded to the pipeline.

    Returns:
        The topic string reported by the model, stripped of surrounding whitespace.

    Raises:
        ValueError: If no JSON object with a "topic" field can be recovered from
            the model output (``json.JSONDecodeError`` is a ``ValueError`` subclass,
            so callers handling the previous parse error keep working).
    """

    system_prompt = """
You are a topic extraction expert. Your task is to analyze multiple questions from the same user and identify the single main topic or theme that connects all of them.

**Instructions:**
1. Analyze all the provided questions together
2. Identify the core subject, domain, or theme that unifies these questions
3. The questions are from the same user context, so they likely revolve around one central topic
4. Output ONLY a JSON object with a single "topic" field
5. The main topic should be concise (2-8 words maximum) and capture the essence of all questions

**Output format (STRICTLY follow this - no other text):**
{
  "topic": "possibility of intelligent life existing on other planets"
}
"""

    questions_formatted = "\n".join([f"- Question {i+1}: \"{q}\"" for i, q in enumerate(questions)])

    user_prompt = f"""
**Questions from the same user context:**
{questions_formatted}

Now analyze these questions and extract the single main topic that connects them all:
"""
    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
    ]

    # Greedy decoding: sampling parameters such as temperature are ignored in
    # this mode, so they are not passed.
    response = pipe(
        messages,
        do_sample=False,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # Extract generated text (for chat input, the last message is the reply)
    if isinstance(response, list) and len(response) > 0:
        if isinstance(response[0], dict) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text'][-1]['content']
        else:
            generated_text = str(response[0])
    else:
        generated_text = str(response)

    result = None
    try:
        result = json.loads(generated_text)
    except json.JSONDecodeError:
        # The model may wrap the JSON in prose or a code fence: fall back to the
        # first flat {...} span that mentions the expected field.
        json_match = re.search(r'\{[^{}]*"topic"[^{}]*\}', generated_text)
        if json_match:
            try:
                result = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                result = None

    if isinstance(result, dict) and "topic" in result:
        return str(result["topic"]).strip()

    # Fail loudly instead of returning None, which would otherwise be formatted
    # into later prompts as the literal topic "None".
    raise ValueError(f"Could not extract a topic from model output: {generated_text!r}")

### Step 2: extract aspects

Next, aspects are identified using both topic and PQ.

In [None]:
def extract_topic_aspects(pipe, topic: str, questions: list[str], temperature: float = 0.7) -> list[str]:
    """Extract specific aspects/details of a known topic that the user is interested in based on their questions.

    Args:
        pipe: Chat-capable text-generation pipeline. It is called with a list of
            chat messages and must expose ``pipe.tokenizer.eos_token_id``.
        topic: Main topic of the dialogue.
        questions: Questions asked so far in the dialogue.
        temperature: Kept for backward compatibility. Decoding is greedy
            (``do_sample=False``), so the value has no effect and is no longer
            forwarded to the pipeline.

    Returns:
        The non-empty aspects reported by the model, each converted to a
        stripped string.

    Raises:
        ValueError: If no JSON object with an "aspects" list can be recovered
            from the model output.
    """

    system_prompt = """
You are an aspect extraction expert. Your task is to analyze questions about a specific topic and identify the key aspects, details, or dimensions of that topic the user is interested in.

**Instructions:**
1. You are given a main topic and multiple questions about that topic from the same user
2. Analyze the questions to identify what specific aspects, details, or sub-topics of the main topic the user cares about
3. Focus on concrete details mentioned or implied in the questions
4. Output ONLY a JSON object with a single "aspects" field containing a list of strings
5. Each aspect should be concise (2-6 words) and represent a distinct detail or dimension of the main topic
6. Include only aspects that are clearly supported by the questions

**Output format (STRICTLY follow this - no other text):**
{
  "aspects": ["scientific reasoning for existence", "detection methods and technology", "expert predictions and timeline"]
}

**Examples:**
- Topic: "possibility of intelligent life existing on other planets" + Questions about evidence, which planet, methods, timeline, and experts → [age of universe, Earth-like planets,  radio telescopes, timeline for contact]
- Topic: "university" + Questions about founders, buildings, programs → ["founding history", "campus architecture", "academic programs"]
"""

    questions_formatted = "\n".join([f"- Question {i+1}: \"{q}\"" for i, q in enumerate(questions)])

    user_prompt = f"""
**Main Topic:** "{topic}"

**User Questions about this topic:**
{questions_formatted}

Now analyze these questions and extract the specific aspects or details of "{topic}" that the user is interested in:
"""

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
    ]

    # Greedy decoding: sampling parameters such as temperature are ignored in
    # this mode, so they are not passed.
    response = pipe(
        messages,
        do_sample=False,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # Extract generated text (for chat input, the last message is the reply)
    if isinstance(response, list) and len(response) > 0:
        if isinstance(response[0], dict) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text'][-1]['content']
        else:
            generated_text = str(response[0])
    else:
        generated_text = str(response)

    result = None
    try:
        result = json.loads(generated_text)
    except json.JSONDecodeError:
        # The model may wrap the JSON in prose or a code fence: fall back to the
        # first flat {...} span that mentions the expected field.
        json_match = re.search(r'\{[^{}]*"aspects"[^{}]*\}', generated_text)
        if json_match:
            try:
                result = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                result = None

    if isinstance(result, dict) and isinstance(result.get("aspects"), list):
        return [str(aspect).strip() for aspect in result["aspects"] if aspect]

    # Returning None here would only defer the failure to the caller, which
    # iterates over the aspects; raise with the offending text instead.
    raise ValueError(f"Could not extract aspects from model output: {generated_text!r}")

### Step 3: generate questions
Generate 8 questions using the extracted topic, the aspects, and the PQ.

In [1]:
def generate_questions(pipe, topic: str, aspects: list[str], questions: list[str], temperature: float = 0.7, max_new_tokens: int = 500) -> list[str]:
    """Generate possible follow-up questions a user might ask based on topic, aspects, and previously asked questions.

    Args:
        pipe: Chat-capable text-generation pipeline. It is called with a list of
            chat messages and must expose ``pipe.tokenizer.eos_token_id``.
        topic: Main topic of the dialogue.
        aspects: Aspects of the topic the user is interested in.
        questions: Questions already asked; the model is told not to repeat them.
        temperature: Sampling temperature (sampling is enabled for this call).
        max_new_tokens: Generation budget. It matches the default of the other
            question generators in this notebook so that the JSON list is not
            cut off mid-answer.

    Returns:
        The generated questions as stripped, non-empty strings. An empty list is
        returned when no JSON object with a "questions" list can be recovered
        from the model output, so callers can always iterate over the result.
    """

    system_prompt = """
You are a question generation expert. Your task is to generate new, relevant questions that a user might ask about a specific topic, based on the aspects they're interested in and avoiding questions they've already asked.

**Instructions:**
1. You are given a main topic, key aspects of that topic, and questions the user has already asked
2. Generate 8 NEW questions that:
   - Are relevant to the main topic
   - Cover the specified aspects that haven't been fully explored yet
   - Are different from the already asked questions (avoid rephrasing the same questions)
   - Sound natural and like something a real user would ask
   - Focus on details, specifics, or related information the user might want to know next
3. Output ONLY a JSON object with a single "questions" field containing a list of strings
4. Each question should be clear, concise, and self-contained

**Output format (STRICTLY follow this - no other text):**
{
  "questions": [
    "What galaxy do we live in?",
    "How many stars does it have?",
    "Does Shostak believe beings from space could make contact with Earth soon?"
  ]
}

**Important Guidelines:**
- DO NOT repeat or rephrase the already asked questions
- DO cover different aspects or dive deeper into aspects that weren't fully explored
- DO make questions specific and actionable
- DO maintain a natural, conversational tone
- DO focus on what the user might want to know next based on their interests
"""

    aspects_formatted = ", ".join([f'"{aspect}"' for aspect in aspects])
    questions_formatted = "\n".join([f"- Question {i+1}: \"{q}\"" for i, q in enumerate(questions)])

    user_prompt = f"""
**Main Topic:** "{topic}"

**Key Aspects of Interest:** {aspects_formatted}

**Questions Already Asked:**
{questions_formatted}

Now generate new, relevant questions that the user might ask next. These questions should explore the topic and aspects in new ways, avoiding repetition of the already asked questions:
"""

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
    ]

    response = pipe(
        messages,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # Extract generated text (for chat input, the last message is the reply)
    if isinstance(response, list) and len(response) > 0:
        if isinstance(response[0], dict) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text'][-1]['content']
        else:
            generated_text = str(response[0])
    else:
        generated_text = str(response)

    result = None
    try:
        result = json.loads(generated_text)
    except json.JSONDecodeError:
        # Try to find a flat JSON object mentioning "questions" in the response
        json_match = re.search(r'\{[^{}]*"questions"[^{}]*\}', generated_text)
        if json_match:
            try:
                result = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                result = None

    if not isinstance(result, dict) or not isinstance(result.get("questions"), list):
        return []

    # Drop empty entries; str() first so a non-string item cannot raise.
    stripped = (str(q).strip() for q in result["questions"] if q)
    return [text for text in stripped if text]

### Step 4: use LLM-as-a-judge technique to compare questions
This function compares two questions and returns a score from 0 to 1, where 0 means the questions are completely different and 1 means they are definitely equivalent.

In [7]:
def compare_questions(pipe, question1: str, question2: str, 
                     temperature: float = 0.7, max_new_tokens: int = 100) -> float:
    """Score the semantic equivalence of two questions with the LLM as judge.

    Args:
        pipe: Chat-capable text-generation pipeline, called with a list of chat
            messages.
        question1: First question.
        question2: Second question.
        temperature: Kept for backward compatibility. Decoding is greedy
            (``do_sample=False``), so the value has no effect and is no longer
            forwarded to the pipeline.
        max_new_tokens: Generation budget forwarded to the pipeline.

    Returns:
        The confidence reported by the model, clamped to the range [0.0, 1.0].

    Raises:
        ValueError: If the model output contains no usable numeric
            "confidence" value.
    """

    sys_prompt = """
You are a semantic equivalence evaluator. Your task is to compare two questions and determine the confidence that they are semantically equivalent.

**Instructions:**
1. Compare the meaning and intent of the two questions
2. Determine if they're asking for the same information and would yield the same answer
3. Output ONLY a JSON object with a single "confidence" field
4. The confidence must be a number between 0 and 1

**Explanation of confidence scores:**
- 0.9-1.0: Definitely equivalent (same meaning, intent, and expected answer)
- 0.7-0.9: Highly likely equivalent (minor wording differences only)
- 0.5-0.7: Possibly equivalent (similar intent but different phrasing)
- 0.3-0.5: Unlikely equivalent (related but different aspects)
- 0.0-0.3: Definitely different (completely different questions)

**Output format (STRICTLY follow this - no other text):**
{
  "confidence": 0.85
}
"""

    prompt = f"""
**Questions to compare:**
Question 1: "{question1}"
Question 2: "{question2}"
"""

    messages = [
        {"role": "system", "content": sys_prompt.strip()},
        {"role": "user", "content": prompt.strip()}
    ]

    # max_new_tokens was previously accepted but never forwarded. Greedy
    # decoding ignores temperature, so it is not passed.
    response = pipe(
        messages,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )

    # Extract generated text (for chat input, the last message is the reply)
    if isinstance(response, list) and len(response) > 0:
        if isinstance(response[0], dict) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text'][-1]['content']
        else:
            generated_text = str(response[0])
    else:
        generated_text = str(response)

    result = extract_json_from_response(generated_text)

    # The helper returns None when nothing could be parsed; report that with the
    # offending text instead of failing on None["confidence"].
    if not isinstance(result, dict) or "confidence" not in result:
        raise ValueError(f"No confidence value found in model output: {generated_text!r}")

    try:
        confidence = float(result["confidence"])
    except (TypeError, ValueError) as exc:
        raise ValueError(f"Non-numeric confidence in model output: {generated_text!r}") from exc

    # NaN would otherwise be turned into 1.0 by the clamp below.
    if confidence != confidence:
        raise ValueError(f"Confidence is NaN in model output: {generated_text!r}")

    return max(0.0, min(1.0, confidence))

### Execute first approach and save results. In this work, the first 100 dialogues with 20 questions were considered for this approach.

In [None]:
# Accumulator for approach 1, keyed by 'row N'. The name must match the one the
# processing loop and the DataFrame cell use (`results_app1`).
results_app1 = {}

In [None]:
for row in rows_100:
    print(row)

    # Split the dialogue once: first 60% are the prompt questions (PQ),
    # the rest are held out for evaluation.
    dialogue_questions = df['questions'].loc[row].tolist()
    split_at = int(len(dialogue_questions) * 0.6)
    questions = dialogue_questions[:split_at]
    remaining_questions = dialogue_questions[split_at:]

    # Topic -> aspects -> generated questions, all from the PQ only.
    topic = extract_main_topic(pipe, questions)
    aspects = extract_topic_aspects(pipe, topic, questions)
    gen_questions = generate_questions(pipe, topic, aspects, questions)

    # Judge every generated question against every held-out question.
    comparisons = {
        q_a: {q_b: compare_questions(pipe, q_a, q_b) for q_b in remaining_questions}
        for q_a in gen_questions
    }

    results_app1[f'row {row}'] = {
        "topic": topic,
        "aspects": aspects,
        "gen_questions": gen_questions,
        "comparisons": comparisons,
    }

In [None]:
# Built from a dict of dicts: each 'row N' key becomes a column.
results_app1_df = pd.DataFrame(results_app1)
# Transposed for display so that each dialogue is one row.
results_app1_df.T

In [None]:
# results_app1_df.T.to_csv('/kaggle/working/results_app1.csv')

# Approach 2

### Step 1: collect topics
This function extracts a topic from each dialogue's context. The topics are later used to group dialogues into clusters. The repository already contains a file with all the extracted topics, so this step is optional.

In [None]:
# def extract_text_main_topic(pipe, text: str, temperature: float = 0.7) -> str:
#     """Extract the single main topic/theme from a provided text passage."""
    
#     system_prompt = """
# You are a topic extraction expert. Your task is to analyze a text passage and identify the single main topic or central theme that best represents the entire content.

# **Instructions:**
# 1. Carefully read and understand the entire text passage
# 2. Identify the core subject, central theme, or primary focus of the text
# 3. The main topic should be the overarching concept that ties all parts of the text together
# 4. Output ONLY a JSON object with a single "main_topic" field
# 5. The main topic should be concise (2-5 words maximum) and capture the essence of the text
# 6. Be specific and accurate - avoid generic terms like "story" or "text"

# **Output format (STRICTLY follow this - no other text):**
# {
#   "main_topic": "possibility of intelligent life existing on other planets"
# }
# """
    
#     user_prompt = f"""
# **Text passage to analyze:**
# {text}

# Now identify the single main topic that best represents this entire text:
# """
    
#     messages = [
#         {"role": "system", "content": system_prompt.strip()},
#         {"role": "user", "content": user_prompt.strip()}
#     ]

#     response = pipe(
#             messages,
#             temperature=temperature,
#             do_sample=False,
#             pad_token_id=pipe.tokenizer.eos_token_id
#         )
        
#     # Extract generated text
#     if isinstance(response, list) and len(response) > 0:
#         if isinstance(response[0], dict) and 'generated_text' in response[0]:
#             generated_text = response[0]['generated_text'][-1]['content']
#         else:
#             generated_text = str(response[0])
#     else:
#         generated_text = str(response)
    
#     result = json.loads(generated_text)
#     if "main_topic" in result:
#         return str(result["main_topic"]).strip()

In [None]:
# results_topics = {}

In [None]:
# for row in rows_1000:
#     if row % 10 == 0:
#         print(row)
#     main_topic = extract_text_main_topic(pipe, df['story'][row])

#     results_topics[f'row {row}'] = {
#     "main_topic": main_topic
#     }

In [None]:
# results_topics_df = pd.DataFrame(results_topics)

In [None]:
# results_topics_df.T.to_csv('/kaggle/working/topics.csv')

In [8]:
# Pre-extracted topics, read from a Kaggle input path.
topics = pd.read_csv('/kaggle/input/topics/topics.csv')
all_topics = topics['main_topic'].tolist()
# The unnamed first column is expected to hold labels of the form "row <N>"
# (presumably the index written by the commented-out export above — verify
# against the CSV); keep only the integer N.
row_identifiers = topics['Unnamed: 0'].str.extract(r'row\s*(\d+)')[0].astype(int).tolist()

### Step 2: cluster topics using DBSCAN

In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np
from scipy.spatial.distance import cosine

def cluster_topics(topics_list, row_identifiers, min_cluster_size=3, eps=0.4):
    """
    Group topic strings by embedding similarity.

    Topics are encoded with the 'all-MiniLM-L6-v2' sentence-transformer and
    clustered with DBSCAN (cosine metric, `eps`, `min_samples=min_cluster_size`).
    Entries that DBSCAN labels -1 are left out of the result.

    Returns: dict mapping cluster label to a list of (row_id, topic) tuples,
    where row_id is the element of `row_identifiers` at the topic's position.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    topic_vectors = encoder.encode(topics_list, show_progress_bar=True)

    dbscan = DBSCAN(eps=eps, min_samples=min_cluster_size, metric='cosine')
    assigned_labels = dbscan.fit_predict(topic_vectors)

    clusters = {}
    for position, (cluster_id, topic) in enumerate(zip(assigned_labels, topics_list)):
        row_id = row_identifiers[position]
        if cluster_id != -1:
            clusters.setdefault(cluster_id, []).append((row_id, topic))

    return clusters

In [None]:
# Cluster all topics with the function's defaults (min_cluster_size=3, eps=0.4).
clusters = cluster_topics(all_topics, row_identifiers)

In [22]:
def extract_cluster_questions(df, cluster_dialogues):
    """Collect the prompt questions (first 60%) of every dialogue in a cluster.

    `cluster_dialogues` is an iterable of (dialogue_idx, topic) pairs; the topic
    is not used. Returns {dialogue_idx: {'first_60_percent': [questions...]}}.
    """

    def _prompt_part(row_label):
        # Questions of one dialogue, truncated at 60% of their count.
        dialogue_questions = df['questions'].loc[row_label].tolist()
        return dialogue_questions[:int(len(dialogue_questions) * 0.6)]

    return {
        row_label: {'first_60_percent': _prompt_part(row_label)}
        for row_label, _topic in cluster_dialogues
    }

def extract_cluster_last_questions(df, cluster_dialogues):
    """Collect the held-out questions (last 40%) of every dialogue in a cluster.

    `cluster_dialogues` is an iterable of (dialogue_idx, topic) pairs; the topic
    is not used. Returns {dialogue_idx: {'last_40_percent': [questions...]}}.
    """
    held_out = {}
    for row_label, _topic in cluster_dialogues:
        dialogue_questions = df['questions'].loc[row_label].tolist()
        # Same 60% boundary as the prompt split; everything after it is kept.
        boundary = int(len(dialogue_questions) * 0.6)
        held_out[row_label] = {'last_40_percent': dialogue_questions[boundary:]}
    return held_out

### Step 3: extract dialogue's aspects.
In addition to the topic and PQ of the dialogue under consideration, PQ from other dialogues of the cluster are also used.

In [12]:
def extract_aspects_cluster(pipe, current_topic: str, current_questions: list[str], cluster_questions: list[list[str]], temperature: float = 0.7) -> list[str]:
    """Extract detailed aspects of a topic using context from all dialogues in the same cluster.

    Args:
        pipe: Chat-capable text-generation pipeline. It is called with a list of
            chat messages and must expose ``pipe.tokenizer.eos_token_id``.
        current_topic: Topic of the dialogue under consideration.
        current_questions: Prompt questions of the dialogue under consideration.
        cluster_questions: Prompt questions of the other dialogues in the
            cluster, one list per dialogue. Only the first three dialogues are
            included in the prompt.
        temperature: Kept for backward compatibility. Decoding is greedy
            (``do_sample=False``), so the value has no effect and is no longer
            forwarded to the pipeline.

    Returns:
        The non-empty aspects reported by the model, each converted to a
        stripped string.

    Raises:
        ValueError: If no JSON object with an "aspects" list can be recovered
            from the model output.
    """

    system_prompt = """
You are an aspect extraction expert specializing in contextual analysis. Your task is to identify specific details and dimensions of a topic by analyzing questions from:
1. The current dialogue (primary focus)
2. Related dialogues in the same topic cluster (providing broader context)

**Instructions:**
1. Analyze ALL provided questions together - they all relate to the same core topic cluster
2. Identify specific aspects, details, or dimensions of the main topic that users are interested in
3. Focus on concrete details mentioned or implied across the dialogues
4. Prioritize aspects that appear in multiple dialogues OR are deeply explored in the current dialogue
5. Output ONLY a JSON object with a single "aspects" field containing a list of strings
6. Each aspect should be concise (2-6 words) and represent a distinct detail or sub-topic
7. Include only aspects strongly supported by the questions

**Output format (STRICTLY follow this - no other text):**
{
  "aspects": ["album release dates", "auction bidding processes", "memorabilia authentication"]
}

**Critical Guidelines:**
- LEVERAGE CLUSTER CONTEXT: Use questions from related dialogues to discover aspects not explicitly mentioned in the current dialogue
- AVOID GENERIC ASPECTS: Be specific and concrete (e.g., "vinyl condition grading" not just "records")
- PRIORITIZE RECURRING THEMES: Aspects appearing across multiple dialogues are more significant
- MAINTAIN TOPIC FOCUS: All aspects must directly relate to the core topic cluster
"""

    current_formatted = "\n".join([f"- Question {i+1}: \"{q}\"" for i, q in enumerate(current_questions)])

    # Format cluster PQ
    cluster_context = []
    for i, dialogue_questions in enumerate(cluster_questions[:3]):  # Limit to 3 dialogues
        cluster_context.append(f"Related Dialogue {i+1} questions:\n" + "\n".join([f"  • \"{q}\"" for q in dialogue_questions]))
    cluster_formatted = "\n\n".join(cluster_context)

    user_prompt = f"""
**CORE TOPIC CLUSTER:** "{current_topic}"

**CURRENT DIALOGUE QUESTIONS (primary focus):**
{current_formatted}

**ADDITIONAL CONTEXT FROM RELATED DIALOGUES (same topic cluster):**
{cluster_formatted}

Analyze ALL questions above to extract the most significant specific aspects of "{current_topic}" that users care about:
"""

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
    ]

    # Greedy decoding: sampling parameters such as temperature are ignored in
    # this mode, so they are not passed.
    response = pipe(
        messages,
        do_sample=False,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # Extract generated text (for chat input, the last message is the reply)
    if isinstance(response, list) and len(response) > 0:
        if isinstance(response[0], dict) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text'][-1]['content']
        else:
            generated_text = str(response[0])
    else:
        generated_text = str(response)

    result = None
    try:
        result = json.loads(generated_text)
    except json.JSONDecodeError:
        # The model may wrap the JSON in prose or a code fence: fall back to the
        # first flat {...} span that mentions the expected field.
        json_match = re.search(r'\{[^{}]*"aspects"[^{}]*\}', generated_text)
        if json_match:
            try:
                result = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                result = None

    if isinstance(result, dict) and isinstance(result.get("aspects"), list):
        return [str(aspect).strip() for aspect in result["aspects"] if aspect]

    # Returning None here would only defer the failure to the code that joins
    # the aspects into the next prompt; raise with the offending text instead.
    raise ValueError(f"Could not extract aspects from model output: {generated_text!r}")

### Step 4: generate questions
In addition to the topic, aspects and PQ of the dialogue under consideration, aspects and PQ from other dialogues of the cluster are also used.

In [27]:
def generate_questions_cluster(pipe, current_topic: str, current_aspects: list[str], current_questions: list[str], cluster_context: list[dict], temperature: float = 0.7, max_new_tokens: int = 500) -> list[str]:
    """Generate context-aware questions for a dialogue using enriched cluster information.

    Args:
        pipe: Chat-capable text-generation pipeline. It is called with a list of
            chat messages and must expose ``pipe.tokenizer.eos_token_id``.
        current_topic: Topic of the dialogue under consideration.
        current_aspects: Aspects extracted for that dialogue.
        current_questions: Questions already asked in that dialogue.
        cluster_context: One dict per related dialogue with optional 'aspects'
            and 'questions' lists. Only the first three entries are used.
        temperature: Sampling temperature (sampling is enabled for this call).
        max_new_tokens: Generation budget forwarded to the pipeline.

    Returns:
        The generated questions as stripped, non-empty strings. An empty list is
        returned when no JSON object with a "questions" list can be recovered
        from the model output, so callers can always iterate over the result.
    """

    system_prompt = """
You are an expert question generator specializing in contextual dialogue expansion. Your task is to generate new, relevant questions for a user by leveraging:
1. The current dialogue's specific focus (primary priority)
2. Broader context from related dialogues in the same topic cluster

**Instructions:**
1. Analyze ALL provided information about the topic cluster
2. Generate 8 NEW questions that:
   - Are highly relevant to the CURRENT dialogue's specific aspects and questions
   - Incorporate valuable details discovered from OTHER dialogues in the cluster
   - Avoid repeating or rephrasing questions already asked in the current dialogue
   - Explore unexplored dimensions of the topic revealed by the cluster context
   - Sound natural and conversational (like a real user would ask)
3. Prioritize questions that:
   - Bridge the current dialogue's focus with complementary aspects from cluster context
   - Address obvious gaps where cluster context reveals important aspects missing in current dialogue
   - Maintain specificity to the current dialogue's unique angle within the broader topic
4. Output ONLY a JSON object with a single "questions" field containing a list of strings
5. Each question must be clear, concise, and self-contained

**Output format (STRICTLY follow this - no other text):**
{
  "questions": [
    "What authentication methods are used for rare Michael Jackson memorabilia?",
    "How do posthumous album releases affect the value of original memorabilia?",
    "What was the highest bid ever recorded for Jackson's signature glove?"
  ]
}

**Critical Guidelines:**
- CURRENT DIALOGUE IS PRIMARY: Never generate questions irrelevant to the current dialogue's specific focus
- CLUSTER CONTEXT IS ENRICHMENT: Use other dialogues to discover NEW angles, not to change the core topic
- AVOID REPETITION: Do not generate questions semantically equivalent to already asked questions
- BE SPECIFIC: Leverage concrete details from cluster aspects
- MAINTAIN CONVERSATIONAL FLOW: Questions should feel like natural follow-ups to the current discussion
"""

    # Format current dialogue information
    current_aspects_formatted = ", ".join([f'"{a}"' for a in current_aspects])
    current_questions_formatted = "\n".join([f"- \"{q}\"" for q in current_questions])

    # Format cluster context
    cluster_context_formatted = []
    for i, context in enumerate(cluster_context[:3]):
        aspects = context.get('aspects', [])
        questions = context.get('questions', [])

        if aspects or questions:
            dialogue_context = f"Related Dialogue {i+1}:\n"
            if aspects:
                # Quote the aspects outside the f-string: a backslash inside an
                # f-string expression is a SyntaxError before Python 3.12.
                quoted_aspects = ", ".join([f'"{a}"' for a in aspects])
                dialogue_context += f"Key aspects: {quoted_aspects}\n"
            if questions:
                dialogue_context += "Sample questions:\n" + "\n".join([f"  • \"{q}\"" for q in questions])
            cluster_context_formatted.append(dialogue_context)

    cluster_formatted = "\n\n".join(cluster_context_formatted)

    user_prompt = f"""
**CURRENT DIALOGUE FOCUS**
Topic: "{current_topic}"
Specific Aspects: {current_aspects_formatted}
Already Asked Questions:
{current_questions_formatted}

**ENRICHMENT FROM TOPIC CLUSTER CONTEXT**
{cluster_formatted}

Generate 8 NEW questions that would naturally follow in the CURRENT dialogue, leveraging insights from the cluster context while staying focused on the current dialogue's specific aspects:
"""

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
    ]

    response = pipe(
        messages,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # Extract generated text (for chat input, the last message is the reply)
    if isinstance(response, list) and len(response) > 0:
        if isinstance(response[0], dict) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text'][-1]['content']
        else:
            generated_text = str(response[0])
    else:
        generated_text = str(response)

    result = None
    try:
        result = json.loads(generated_text)
    except json.JSONDecodeError:
        # Try to find a flat JSON object mentioning "questions" in the response
        json_match = re.search(r'\{[^{}]*"questions"[^{}]*\}', generated_text)
        if json_match:
            try:
                result = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                result = None

    if not isinstance(result, dict) or not isinstance(result.get("questions"), list):
        return []

    # Drop empty entries; str() first so a non-string item cannot raise.
    stripped = (str(q).strip() for q in result["questions"] if q)
    return [text for text in stripped if text]

### Execute second approach and save results.

In [None]:
# Accumulator for approach 2; the processing loop stores one entry per cluster under 'cluster N'.
results_app2={}

In [None]:
for cl_idx, cluster_dialogues in enumerate(clusters.values()):
    print(f"Processing cluster {cl_idx}")

    # Prompt questions (first 60%) of every dialogue in this cluster.
    cluster_questions_dict = extract_cluster_questions(df, cluster_dialogues)

    # Pass 1: aspects of each dialogue, using the other dialogues' PQ as context.
    dialogue_aspects = {}
    for dialogue_idx, topic in cluster_dialogues:
        cluster_context_questions = [
            entry['first_60_percent']
            for other_idx, entry in cluster_questions_dict.items()
            if other_idx != dialogue_idx
        ]
        dialogue_aspects[dialogue_idx] = extract_aspects_cluster(
            pipe,
            current_topic=topic,
            current_questions=cluster_questions_dict[dialogue_idx]['first_60_percent'],
            cluster_questions=cluster_context_questions,
        )

    # Pass 2: generate questions per dialogue and judge them against the
    # dialogue's held-out questions.
    cluster_results = {"dialogues": {}}

    for dialogue_idx, topic in cluster_dialogues:
        current_questions = cluster_questions_dict[dialogue_idx]['first_60_percent']
        current_aspects = dialogue_aspects[dialogue_idx]

        # Aspects and PQ of every other dialogue in the cluster.
        cluster_context = [
            {
                'aspects': dialogue_aspects[other_idx],
                'questions': cluster_questions_dict[other_idx]['first_60_percent'],
            }
            for other_idx, other_topic in cluster_dialogues
            if other_idx != dialogue_idx
        ]

        gen_questions = generate_questions_cluster(
            pipe,
            current_topic=topic,
            current_aspects=current_aspects,
            current_questions=current_questions,
            cluster_context=cluster_context,
        )

        remaining_questions = extract_cluster_last_questions(df, [(dialogue_idx, topic)])
        dialogue_remaining = remaining_questions[dialogue_idx]['last_40_percent']

        # Confidence of each (generated, held-out) question pair.
        comparisons = {
            q_a: {q_b: compare_questions(pipe, q_a, q_b) for q_b in dialogue_remaining}
            for q_a in gen_questions
        }

        cluster_results["dialogues"][dialogue_idx] = {
            "topic": topic,
            "aspects": current_aspects,
            "gen_questions": gen_questions,
            "comparisons": comparisons,
        }

    results_app2[f'cluster {cl_idx}'] = cluster_results

In [None]:
# Built from a dict of dicts: one column per 'cluster N' key, with a single
# "dialogues" row holding that cluster's per-dialogue results.
results_app2_df = pd.DataFrame(results_app2)

In [None]:
# results_app2_df.T.to_csv('/kaggle/working/results_app2_df.csv')

# Approach 3

### Step 1: generate questions based on dialogue's context and PQ

In [None]:
def generate_questions_from_text_with_questions(pipe, text: str, questions: list[str], temperature: float = 0.7, max_new_tokens: int = 500) -> list[str]:
    """Generate new questions about a text passage, avoiding questions that were already asked.

    Args:
        pipe: Chat-capable text-generation pipeline. It is called with a list of
            chat messages and must expose ``pipe.tokenizer.eos_token_id``.
        text: Passage the questions should be based on.
        questions: Questions already asked; the model is told not to repeat them.
        temperature: Sampling temperature (sampling is enabled for this call).
        max_new_tokens: Generation budget forwarded to the pipeline.

    Returns:
        The generated questions as stripped, non-empty strings. An empty list is
        returned when no JSON object with a "questions" list can be recovered
        from the model output, so callers can always iterate over the result.
    """

    system_prompt = """
You are a question generation expert. Your task is to analyze a given text passage and generate relevant, insightful questions that a reader might ask about the content, while avoiding questions that have already been asked.

**Instructions:**
1. Carefully read and understand the provided text
2. Review the list of already asked questions to avoid duplication
3. Generate 8 diverse questions that:
   - Cover key facts, details, and concepts from the text that haven't been explored yet
   - Explore implications, context, and deeper meaning not covered in existing questions
   - Ask about specific examples, dates, names, or events mentioned in the text
   - Probe relationships between different elements in the text
   - Question assumptions or explore alternative perspectives
   - Are natural and conversational (not robotic)
4. Ensure questions are:
   - Clear, concise, and self-contained
   - Based directly on the text content (don't invent facts)
   - Varied in type (who, what, when, where, why, how, etc.)
   - Appropriate for the text's complexity and tone
   - COMPLETELY DIFFERENT from the already asked questions (no rephrasing or similar intent)
5. Output ONLY a JSON object with a single "questions" field containing a list of strings
6. Include exactly 8 questions - no more, no less

**Output format (STRICTLY follow this - no other text):**
{
  "questions": [
    "What galaxy do we live in?",
    "How many stars does it have?",
    "Does Shostak believe beings from space could make contact with Earth soon?"
  ]
}
"""

    questions_formatted = "\n".join([f"- Question {i+1}: \"{q}\"" for i, q in enumerate(questions)])

    user_prompt = f"""
**Text to analyze:**
{text}

**Questions Already Asked:**
{questions_formatted}

Now generate 8 thoughtful questions based on this text content:
"""

    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()}
    ]

    response = pipe(
        messages,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # Extract generated text (for chat input, the last message is the reply)
    if isinstance(response, list) and len(response) > 0:
        if isinstance(response[0], dict) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text'][-1]['content']
        else:
            generated_text = str(response[0])
    else:
        generated_text = str(response)

    result = None
    try:
        result = json.loads(generated_text)
    except json.JSONDecodeError:
        # Try to find a flat JSON object mentioning "questions" in the response
        json_match = re.search(r'\{[^{}]*"questions"[^{}]*\}', generated_text)
        if json_match:
            try:
                result = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                result = None

    if not isinstance(result, dict) or not isinstance(result.get("questions"), list):
        return []

    # Drop empty entries; str() first so a non-string item cannot raise.
    stripped = (str(q).strip() for q in result["questions"] if q)
    return [text_item for text_item in stripped if text_item]

### Execute third approach and save results

In [None]:
# Accumulator for approach 3, keyed by 'row N'. The name must match the one the
# processing loop and the DataFrame cell use (`results_app3`).
results_app3 = {}

In [None]:
for row in rows_100:
    print(row)

    # Split the dialogue once: the first 60% are the prompt questions (PQ),
    # the last 40% are held out for evaluation.
    all_questions = df['questions'].loc[row].tolist()
    split_idx = int(len(all_questions) * 0.6)
    prompt_questions = all_questions[:split_idx]
    remaining_questions = all_questions[split_idx:]

    # The generator must only see the PQ as "already asked". Passing the
    # held-out questions would tell the model to avoid exactly the questions it
    # is evaluated against.
    gen_questions = generate_questions_from_text_with_questions(pipe, df['story'][row], prompt_questions)

    # Judge every generated question against every held-out question.
    comparisons = {}
    for q_a in gen_questions:
        comparisons[q_a] = {}
        for q_b in remaining_questions:
            comparisons[q_a][q_b] = compare_questions(pipe, q_a, q_b)

    results_app3[f'row {row}'] = {
        "gen_questions": gen_questions,
        "comparisons": comparisons
    }

In [None]:
# Built from a dict of dicts: one column per 'row N' key, with rows
# "gen_questions" and "comparisons".
results_app3_df = pd.DataFrame(results_app3)

In [None]:
# results_app3_df.to_csv('/kaggle/working/results_app3.csv')