# Question Generation and Clustering

In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from info_salience import qgen

In [11]:
import json
from typing import List, Dict
from pathlib import Path
from collections import defaultdict

import pandas as pd
import outlines
from sentence_transformers import SentenceTransformer
from json_repair import repair_json
from dotenv import find_dotenv, load_dotenv
import hdbscan
import umap
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from InstructorEmbedding import INSTRUCTOR

In [12]:
load_dotenv(find_dotenv())

True

In [13]:
from info_salience.llm import LitellmGenerator

llm = LitellmGenerator(
    "gpt-4o-2024-08-06",
    caching=True,
    report_costs=True,
    disk_cache_dir=f".litellm_cache",
)

This is the interface

```python
def generate_questions(df_summaries) -> List[str]:
    pass


def cluster_questions(questions):
    # embed
    # reduce dim
    # cluster
    return embeddings, assignments


def calculate_centroids(questions, embeddings, assignments) -> Dict[int,str]:
    pass


def aggregate_data(questions, assignments, centroids):
    return df_agg


def generate_report(df_agg):
    pass
```

In [6]:
def calculate_centroids(questions, embeddings, assignments) -> Dict[int, str]:
    grouped = defaultdict(list)
    for cluster_id, question_id in zip(assignments, range(len(questions))):
        grouped[cluster_id].append(question_id)

    centroids = {}
    for cluster_id, question_ids in grouped.items():
        mean_embedding = np.mean(embeddings[question_ids], axis=0)
        scores = cosine_similarity([mean_embedding], embeddings[question_ids])
        best_match_id = question_ids[scores.argmax()]
        centroid = questions[best_match_id]
        centroids[cluster_id] = centroid
    return centroids


def aggregate_data(questions, assignments, centroids):
    df_questions = pd.DataFrame({"question": questions, "cluster_id": assignments})
    df_centroids = pd.DataFrame(
        {"cluster_id": centroids.keys(), "centroid": centroids.values()}
    )

    df_agg = pd.merge(
        df_questions.groupby("cluster_id")["question"].apply(list),
        df_centroids,
        on="cluster_id",
    )
    df_agg["cluster_size"] = df_agg["question"].apply(len)
    df_agg = df_agg.set_index("cluster_id")
    return df_agg


def generate_report(df_agg):
    n_questions = df_agg["cluster_size"].sum()
    n_clusters = len(df_agg)

    print("=" * 80)
    print(f"Total generated questions: {n_questions}")
    print(f"Number of clusters: {n_clusters}")
    if -1 in df_agg.index:
        n_clusters = n_clusters - 1
        n_noise = df_agg.loc[-1]["cluster_size"]
        print(f"Classified as noise: {n_noise} ({n_noise/n_questions*100:.2f}%)")
    print("=" * 80)

    print(
        df_agg.loc[0:]
        .sort_values("cluster_size", ascending=False)[["centroid", "cluster_size"]]
        .to_markdown()
    )

    print()

    for index, row in df_agg.sort_values('cluster_size', ascending=False).iterrows():
        print('='*80)
        if index == -1:
            print('NOISE CLUSTER')
        print(row['centroid'])
        print(f'N = {len(row["question"])}')
        print('='*80)
        for question in row['question'][:10]:
            print(question)
        print(f'... {len(row["question"][10:])} others ...')
        print()

## Sample outputs contrastive prompt

In [8]:
df_summaries = pd.read_json('../output/cs-cl/Meta-Llama-3.1-8B-Instruct/summaries/temperature0.3-0.json')
df_summaries = df_summaries.sample(frac=1.0, random_state=42)  # shuffle
questions = qgen.generate_questions(llm, df_summaries.head(10), topic='cs-cl', debug=True)

Batch cost: $0.0288
questions_10_words
Q: What is the main focus of the research discussed?
Q: Which methods are highlighted in the research?
Q: What aspect of NLP does the document address?

questions_20_words
Q: What challenge or problem does the research aim to address?
Q: Which techniques or models are mentioned as a focus in the research?
Q: How does the research propose addressing the identified challenges?

questions_50_words
Q: What specific problem with existing models is being targeted in the research?
Q: Which strategies or innovations does the research propose for overcoming the challenges?
Q: In what way does the research aim to improve model performance?

questions_100_words
Q: What are the limitations of current approaches discussed in the research?
Q: How does the proposed method differ from existing solutions?
Q: Which specific techniques are compared or contrasted in the document?
Q: What are the intended outcomes of using the proposed method?

questions_200_words
Q: 

# Generate and cluster questions with contrastive prompt

In [None]:
from pathlib import Path

def generate_questions(summaries_json, topic, debug=False, force=False):
    summaries_json = Path(summaries_json)
    out_json = summaries_json.parent.parent / 'discord-qa-contrastive' / 'questions.json'
    if not force and out_json.exists():
        print(f'Load existing questions: {out_json}')
        with open(out_json) as fin:
            questions = json.load(fin)['questions']

    else:
        df_summaries = pd.read_json(summaries_json)
        df_summaries = df_summaries.sample(frac=1.0, random_state=42)  # shuffle
        questions = qgen.generate_questions(llm, df_summaries, topic=topic, debug=debug)

        out_json.parent.mkdir(exist_ok=True, parents=True)
        with open(out_json, 'w') as fout:
            print(f'Save {len(questions)} questions to {out_json}')
            json.dump({'questions': questions}, fout)

    return questions

In [None]:
def cluster_questions(questions, min_cluster_size=5, min_samples=None):
    minilm = SentenceTransformer("all-mpnet-base-v2")
    embeddings = minilm.encode(questions, normalize_embeddings=True)

    umap_model = umap.UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        n_jobs=1,
        random_state=42
    )
    embeddings_ = umap_model.fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_method='leaf',
    )
    clusterer.fit(embeddings_)
    return embeddings, clusterer.labels_

## astro-ph

In [16]:
questions = []
questions += generate_questions('../output/astro-ph/Meta-Llama-3.1-8B-Instruct/summaries/temperature0.3-0.json', 'astro-ph')
questions += generate_questions('../output/astro-ph/Mistral-7B-Instruct-v0.3/summaries/temperature0.3-0.json', 'astro-ph')
questions += generate_questions('../output/astro-ph/gpt-4o-2024-08-06/summaries/temperature0.3-0.json', 'astro-ph')

Batch cost: $0.3033
Save 362 questions to ../output/astro-ph/Meta-Llama-3.1-8B-Instruct/discord-qa-contrastive/questions.json
Batch cost: $0.3055
Save 369 questions to ../output/astro-ph/Mistral-7B-Instruct-v0.3/discord-qa-contrastive/questions.json
Batch cost: $0.3314
Save 383 questions to ../output/astro-ph/gpt-4o-2024-08-06/discord-qa-contrastive/questions.json


In [21]:
embeddings, assignments = cluster_questions(questions, min_cluster_size=15)
centroids = calculate_centroids(questions, embeddings, assignments)
df_agg = aggregate_data(questions, assignments, centroids)

In [22]:
generate_report(df_agg)

Total generated questions: 1114
Number of clusters: 20
Classified as noise: 273 (24.51%)
|   cluster_id | centroid                                                                           |   cluster_size |
|-------------:|:-----------------------------------------------------------------------------------|---------------:|
|           10 | What specific phenomena or processes are being investigated in the study?          |            109 |
|            8 | What methodology or techniques are employed in the study?                          |             99 |
|            3 | What specific challenges or limitations does the study address or identify?        |             66 |
|            9 | Which future research directions does the study recommend or outline?              |             59 |
|            7 | What methodologies or techniques are discussed?                                    |             56 |
|           18 | How do the findings relate to existing models or theories?   

In [23]:
df_agg.reset_index()[['cluster_id', 'centroid','cluster_size']] \
    .sort_values('cluster_size', ascending=False) \
    .to_csv('../data/annotations/questions/astro-ph-clusters-raw.csv', index=False)

## qmsum generic

In [24]:
questions = []
questions += generate_questions('../output/qmsum-generic/Meta-Llama-3.1-8B-Instruct/summaries/temperature0.3-0.json', 'qmsum')
questions += generate_questions('../output/qmsum-generic/Mistral-7B-Instruct-v0.3/summaries/temperature0.3-0.json', 'qmsum')
questions += generate_questions('../output/qmsum-generic/gpt-4o-mini-2024-07-18/summaries/temperature0.3-0.json', 'qmsum')

Batch cost: $0.2373
Save 294 questions to ../output/qmsum-generic/Meta-Llama-3.1-8B-Instruct/discord-qa-contrastive/questions.json
Batch cost: $0.2178
Save 300 questions to ../output/qmsum-generic/Mistral-7B-Instruct-v0.3/discord-qa-contrastive/questions.json
Batch cost: $0.2364
Save 284 questions to ../output/qmsum-generic/gpt-4o-mini-2024-07-18/discord-qa-contrastive/questions.json


In [25]:
embeddings, assignments = cluster_questions(questions, min_cluster_size=15)
centroids = calculate_centroids(questions, embeddings, assignments)
df_agg = aggregate_data(questions, assignments, centroids)

In [26]:
generate_report(df_agg)

Total generated questions: 878
Number of clusters: 19
Classified as noise: 293 (33.37%)
|   cluster_id | centroid                                                                          |   cluster_size |
|-------------:|:----------------------------------------------------------------------------------|---------------:|
|           10 | Which aspects of the main topic were covered in the discussion?                   |             64 |
|            4 | What main topic was discussed in the meeting?                                     |             50 |
|            9 | Who are the participants and their roles discussed in the meeting?                |             44 |
|            6 | Which design features were important in the discussion?                           |             44 |
|           17 | What were the major outcomes or decisions made during the meeting?                |             42 |
|           16 | What potential future steps or actions were planned in the meeting?  

In [27]:
df_agg.reset_index()[['cluster_id', 'centroid','cluster_size']] \
    .sort_values('cluster_size', ascending=False) \
    .to_csv('../data/annotations/questions/qmsum-generic-clusters-raw.csv', index=False)

## pubmed-sample

In [28]:
questions = []
questions += generate_questions('../output/pubmed-sample/Meta-Llama-3.1-8B-Instruct/summaries/temperature0.3-0.json', 'pubmed')
questions += generate_questions('../output/pubmed-sample/Mistral-7B-Instruct-v0.3/summaries/temperature0.3-0.json', 'pubmed')
questions += generate_questions('../output/pubmed-sample/gpt-4o-2024-08-06/summaries/temperature0.3-0.json', 'pubmed')

Load existing questions: ../output/pubmed-sample/Meta-Llama-3.1-8B-Instruct/discord-qa-contrastive/questions.json
Load existing questions: ../output/pubmed-sample/Mistral-7B-Instruct-v0.3/discord-qa-contrastive/questions.json
Batch cost: $0.5912
Save 657 questions to ../output/pubmed-sample/gpt-4o-2024-08-06/discord-qa-contrastive/questions.json


In [38]:
embeddings, assignments = cluster_questions(questions, min_cluster_size=15)
centroids = calculate_centroids(questions, embeddings, assignments)
df_agg = aggregate_data(questions, assignments, centroids)

In [39]:
generate_report(df_agg)

Total generated questions: 2016
Number of clusters: 38
Classified as noise: 609 (30.21%)
|   cluster_id | centroid                                                                                 |   cluster_size |
|-------------:|:-----------------------------------------------------------------------------------------|---------------:|
|           34 | What specific treatments were compared in the study?                                     |             67 |
|           20 | What were the methods used in the study?                                                 |             63 |
|           11 | What was the study design or setting of the trial?                                       |             58 |
|           30 | What was the main finding regarding the intervention's efficacy?                         |             58 |
|           17 | Which population is the study focused on?                                                |             53 |
|            2 | What limitations or

In [40]:
df_agg.reset_index()[['cluster_id', 'centroid','cluster_size']] \
    .sort_values('cluster_size', ascending=False) \
    .to_csv('../data/annotations/questions/pubmed-sample-clusters-raw.csv', index=False)


## cs-cl

In [11]:
questions = []
questions += generate_questions('../output/cs-cl/Meta-Llama-3.1-8B-Instruct/summaries/temperature0.3-0.json', 'cs-cl')
questions += generate_questions('../output/cs-cl/Mistral-7B-Instruct-v0.3/summaries/temperature0.3-0.json', 'cs-cl')
questions += generate_questions('../output/cs-cl/gpt-4o-2024-08-06/summaries/temperature0.3-0.json', 'cs-cl')

Load existing questions: ../output/cs-cl/Meta-Llama-3.1-8B-Instruct/discord-qa-contrastive/questions.json
Load existing questions: ../output/cs-cl/Mistral-7B-Instruct-v0.3/discord-qa-contrastive/questions.json
Load existing questions: ../output/cs-cl/gpt-4o-2024-08-06/discord-qa-contrastive/questions.json


In [12]:
embeddings, assignments = cluster_questions(questions, min_cluster_size=15)
centroids = calculate_centroids(questions, embeddings, assignments)
df_agg = aggregate_data(questions, assignments, centroids)

In [13]:
generate_report(df_agg)

Total generated questions: 1807
Number of clusters: 36
Classified as noise: 541 (29.94%)
|   cluster_id | centroid                                                                    |   cluster_size |
|-------------:|:----------------------------------------------------------------------------|---------------:|
|           23 | What are the main approaches or techniques discussed in the document?       |             96 |
|           12 | What recent advancements or innovations are highlighted in the document?    |             92 |
|           30 | What improvements or contributions do the proposed methods make?            |             88 |
|           15 | What main topic is the document addressing?                                 |             71 |
|           11 | What are the new approaches or methods proposed to address the challenges?  |             55 |
|            3 | What is the main focus of the research?                                     |             49 |
|           33 

In [14]:
df_agg.to_json('../data/annotations/questions/cs-cl-clusters-raw.json', orient='records')
df_agg.reset_index()[['cluster_id', 'centroid','cluster_size']].sort_values('cluster_size', ascending=False).to_csv('../data/annotations/questions/cs-cl-clusters-raw.csv', index=False)

# Convert manual annotations into standard format

In [51]:
df = pd.read_excel('../data/annotations/questions/pubmed-sample-clusters-manual.xlsx', sheet_name='clustered')
df = df[df['selected'] == 'x']
df = df[['cluster_id', 'centroid', 'group_id', 'group_name']]
df.to_json('../output/pubmed-sample/discord_questions.json', orient='records', indent=4)
display(df)

Unnamed: 0,cluster_id,centroid,group_id,group_name
1,1,What is the main focus of the study?,1,Topic
3,10,Which patient population is the study concerne...,2,Population
4,9,What condition is being addressed in the study?,3,Population -- Condition
5,18,What is the participant demographic or charact...,4,Population -- demographics
6,36,What was the main intervention used in the study?,5,Intervention
10,31,What are the significant benefits of the inter...,6,Intervention -- benefits
11,6,What are the specific biological markers influ...,7,Intervention -- biomarkers
12,34,What specific treatments were compared in the ...,8,Comparators
14,23,What specific metrics or outcomes were measured?,9,Outcomes
16,28,What are secondary outcomes noted in the study?,10,Outcomes -- Secondary


In [52]:
df = pd.read_excel('../data/annotations/questions/cs-cl-clusters-manual.xlsx', sheet_name='clustered')
df = df[df['selected'] == 'x']
df = df[['cluster_id', 'centroid', 'group_id', 'group_name']]
df.to_json('../output/cs-cl/discord_questions.json', orient='records', indent=4)
display(df)

Unnamed: 0,cluster_id,centroid,group_id,group_name
1,15,What main topic is the document addressing?,1,Main topic
6,23,What are the main approaches or techniques dis...,2,Main techniques
9,12,What recent advancements or innovations are hi...,3,Recent advances
10,8,How does the study relate to previous research...,4,Relate to prior -- comparison
12,20,Which previous works or studies are referenced?,5,Relate to prior -- references
13,25,What is a prominent method mentioned for enhan...,6,Current SOTA
15,14,What challenge or gap is identified in the res...,7,Existing limitations
21,30,What improvements or contributions do the prop...,8,Proposal -- contributions
22,11,What are the new approaches or methods propose...,9,Proposal -- method
28,27,What are the main methods or techniques evalua...,10,Evaluation - methods


In [53]:
df = pd.read_excel('../data/annotations/questions/astro-ph-clusters-manual.xlsx', sheet_name='clustered')
df = df[df['selected'] == 'x']
df = df[['cluster_id', 'centroid', 'group_id', 'group_name']]
df.to_json('../output/astro-ph/discord_questions.json', orient='records', indent=4)
display(df)

Unnamed: 0,cluster_id,centroid,group_id,group_name
1,0,What is the main focus of the study?,1,Topic - main
3,10,What specific phenomena or processes are being...,2,Topic - phenomena processes
4,17,What broader context or field does the study c...,3,Topic -- context
5,3,What specific challenges or limitations does t...,4,Challenges and limitations
7,8,What methodology or techniques are employed in...,5,Method
9,6,What comparisons are made within the study?,6,Method -- comparisons
10,15,What are the main findings of the study?,7,Results -- overall
12,14,What detailed evidence or data is used to supp...,8,Results -- evidence
13,12,What specific variables or conditions are cruc...,9,Results -- parameters conditions
14,18,How do the findings relate to existing models ...,10,Discussion -- existing theories


In [54]:
df = pd.read_excel('../data/annotations/questions/qmsum-generic-clusters-manual.xlsx', sheet_name='clustered')
df = df[df['selected'] == 'x']
df = df[['cluster_id', 'centroid', 'group_id', 'group_name']]
df.to_json('../output/qmsum-generic/discord_questions.json', orient='records', indent=4)
display(df)

Unnamed: 0,cluster_id,centroid,group_id,group_name
1,9,Who are the participants and their roles discu...,1,Participants -- roles
3,4,What main topic was discussed in the meeting?,2,Topic -- main
6,2,What were the main objectives or goals discuss...,3,Topic -- goals
7,10,Which aspects of the main topic were covered i...,4,Topic -- aspects
9,15,What are the identified challenges or concerns...,5,Discussion -- challenges
11,14,What detailed strategies or solutions were pro...,6,Discussion -- solutions
12,7,What were the anticipated impacts or implicati...,7,Discussion -- solution impacts
13,17,What were the major outcomes or decisions made...,8,Outcomes -- decisions
14,3,What collaborative efforts or partnerships wer...,9,Outcomes -- collaborations
15,16,What potential future steps or actions were pl...,10,Outcomes -- actions
