# Document Clustering and Summarization Pipeline

## Overview

In this notebook, we continue from the **JSON file extracted in the first notebook**.  

We begin by **cleaning the text**, removing noise and irrelevant content. Once preprocessed, the text is **split into paragraphs** to facilitate more granular analysis.  

We then **embed the paragraphs**, apply **UMAP** for dimensionality reduction, and use **HDBSCAN** to **cluster similar claims** based on their semantic similarity.  

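To **evaluate the quality of the resulting clusters**, we define two metrics: HDBSCAN's relative validity index (a fast approximation of the DBCV score) and an LLM-assigned coherence/homogeneity score.  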

Finally, for each cluster of documents, we **prompt a large language model (LLM)** to generate a **representative title** that summarizes the content of the cluster.

## Configuration

At the beginning of the notebook, update the **variables** and **path definitions** to specify the input data, model configuration, and output directories used throughout the workflow.

In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
import re
import pandas as pd
from nltk import sent_tokenize
import json 

# Input and output folders

In [None]:
import json
from pathlib import Path


# OpenAI API key. It is read from the OPENAI_API_KEY environment variable so
# the secret does not have to be pasted into (and saved with) the notebook.
# Falls back to an empty string, which was the previous hard-coded value.
openai_key = os.environ.get("OPENAI_API_KEY", "")

# Base name (without ".json") of the country/event file processed below.
file_name = "Afghanistan_Afghanistan Floods-Week 21 2024"

# Input folders
folder_sources_metadata_path = Path("Results/Sources/SourcesCountryEvent-Metadata/Dev set/")
folder_sources_path = Path("Results/Sources/SourcesCountryEvent/Dev set/")


# Output folders
folder_metadata_paragraphs = Path("Results/paragraphs_metadata/")
folder_cluster_path = Path("Results/Cluster/Clusters")
# NOTE(review): this directory name ends with a space. It is left untouched so
# previously written results are still found - confirm whether it is intended.
folder_cluster_headline_path = Path("Results/Cluster/Clusters+Headline ")


# Full file paths (adding .json)
sources_path = folder_sources_path / f"{file_name}.json"
sources_metadata_path = folder_sources_metadata_path / f"sources-metadata-{file_name}.json"

# Read JSON files. The metadata file is decoded with "utf-8-sig" so that a
# leading byte-order mark, if present, is dropped before parsing.
sources_text = json.loads(sources_path.read_text(encoding='utf-8'))
sources_metadata = json.loads(sources_metadata_path.read_text(encoding='utf-8-sig'))



# Text processing 

In [None]:
def process_text(text):
    """
    Clean raw document text before it is split into sentences.

    Steps, in order:
      1. Replace literal escaped UTF-8 byte sequences (backslash-x text left
         over from stringified bytes), typographic quotes and escaped quotes
         with plain ASCII characters.
      2. Remove URLs (``http://``, ``https://`` and ``www.`` forms).
      3. Drop newlines and the private-use bullet glyph, turn tabs into spaces.
      4. Delete every character that is not an ASCII letter, digit, whitespace
         or one of ``. , ! ? ; : ' " -``.
      5. Drop non-breaking spaces and runs of four or more dots.

    URL removal used to run last, after step 4 had already deleted every
    ``/``; the ``https?://`` pattern could therefore never match and URL
    fragments such as ``https:example.compage`` were left in the text. It now
    runs before the whitelist, and before newlines are removed so that a URL
    ending a line does not swallow the first word of the next line.

    Args:
        text: Raw text of one document.

    Returns:
        The cleaned text as a single string.
    """
    # Applied sequentially, in the same order as before.
    replacements = (
        ('\\xe2\\x80\\x9c', '"'),
        ('\\xe2\\x80\\x9d', '"'),
        ('\\xe2\\x80\\x98', "'"),
        ('\\xe2\\x80\\x99', "'"),
        ('\u201c', '"'),
        ('\u201d', '"'),
        ('\u2019', "'"),
        ('\\xc2\\xa0', " "),
        ('\\"', '"'),
        ("\\'", "'"),
        ("\\xe2\\x80\\x94", "-"),
        ("\\xe2\\x80\\x93", "-"),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Must happen while "/" and line breaks are still present (see docstring).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    text = text.replace("\n", "")
    text = text.replace("\uf0b7", "")
    text = text.replace("\t", " ")

    # Character whitelist: anything outside this set is deleted.
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\!\?\;\:\'\"-]', '', text)

    # "\s" in the whitelist keeps non-breaking spaces, so drop them explicitly.
    text = text.replace("\xa0", "")

    # Dot leaders (e.g. from tables of contents).
    text = re.sub(r"\.{4,}", '', text)

    return text


In [None]:


def get_sentences(text):
    """
    Split a text into sentences with ``sent_tokenize``.

    No filtering happens here: every sentence returned by the tokenizer is
    kept, including very short ones.
    """
    return sent_tokenize(text)

def remove_bad_sentences(text):
    """
    Tokenize ``text`` into sentences and keep only the usable ones.

    A sentence is kept when all of the following hold:
      - it has at least 4 whitespace-separated words;
      - none of its words is longer than 35 characters;
      - it contains fewer than three '.' characters.

    Returns the surviving sentences as a list, in their original order.
    """
    def _is_usable(sentence):
        words = sentence.split()
        return (
            len(words) >= 4
            and all(len(word) <= 35 for word in words)
            and sentence.count('.') < 3
        )

    return [sentence for sentence in sent_tokenize(text) if _is_usable(sentence)]


# Split in paragraphs and add metadata to the paragraphs 

In [None]:


# Ensure the output directory exists
folder_metadata_paragraphs.mkdir(parents=True, exist_ok=True)

# For every source file that has a matching metadata file, split each document
# into 4-sentence paragraphs and save them together with the title and URL.
for source_file in folder_sources_path.iterdir():
    # Skip non-JSON files and metadata files
    if not source_file.name.endswith(".json") or source_file.name.startswith("sources-metadata"):
        continue

    base_name = source_file.stem  # file name without .json
    metadata_file = folder_sources_metadata_path / f"sources-metadata-{base_name}.json"

    # Skip if metadata file doesn't exist
    if not metadata_file.exists():
        print(f"Metadata file missing for {source_file.name}. Skipping...")
        continue

    # Load into loop-local names. This loop used to assign to `sources_text`
    # and `sources_metadata`, silently replacing the data loaded for
    # `file_name` in the configuration cell, so the clustering cells further
    # down ran on whichever file happened to be visited last. The source file
    # itself is no longer loaded: its content was never used in this loop.
    with metadata_file.open("r", encoding="utf-8-sig") as f:
        file_metadata = json.load(f)

    file_paragraphs_metadata = []

    for data in file_metadata.values():
        title = data['title']
        content = data['content']
        url = data['url']

        # NOTE(review): unlike the clustering cell, the text is not passed
        # through process_text() here, so these paragraphs may differ from the
        # clustered ones - confirm which form downstream steps expect.
        sentences = remove_bad_sentences(content)

        # Split sentences into paragraphs of 4 sentences each
        paragraphs = [" ".join(sentences[i:i+4]) for i in range(0, len(sentences), 4)]

        file_paragraphs_metadata.extend(
            {"title": title, "url": url, "paragraph": paragraph} for paragraph in paragraphs
        )

    # Save metadata to output folder
    output_file = folder_metadata_paragraphs / f"metadata-{base_name}.json"
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(file_paragraphs_metadata, f, indent=2)

    print(f"Saved metadata for {source_file.name} to {output_file}")


# Embedding

In [None]:

def embed(model, sentences):
    """
    Thin wrapper around ``model.encode``.

    Passes ``sentences`` to the model unchanged and returns whatever
    ``model.encode`` returns.
    """
    return model.encode(sentences)


In [None]:
import umap
import hdbscan
def generate_clusters(embeddings,
                      n_neighbors,
                      min_cluster_size,
                      min_samples, 
                      cluster_selection_epsilon, 
                      random_state = None):
    """
    Reduce the embeddings with UMAP, then cluster the result with HDBSCAN.

    UMAP projects the embeddings to 10 components using the cosine metric;
    HDBSCAN then runs on the reduced vectors with the euclidean metric and the
    'eom' selection method. ``gen_min_span_tree=True`` is set on the clusterer.

    Returns the fitted HDBSCAN object.
    """
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=10,
        metric='cosine',
        random_state=random_state,
    )
    reduced_embeddings = reducer.fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        metric='euclidean',
        cluster_selection_method='eom',
        gen_min_span_tree=True,
    )
    return clusterer.fit(reduced_embeddings)

In [None]:
def display_clusters(sentences, clusters):
    """
    Group ``sentences`` by the label found at the same position in
    ``clusters.labels_``.

    Returns a dict mapping each distinct label to the list of sentences that
    carry it, in input order. Every distinct label gets an entry, even if no
    sentence ends up paired with it.
    """
    labels = clusters.labels_
    grouped = {label: [] for label in set(labels)}
    for text, label in zip(sentences, labels):
        grouped[label].append(text)
    return grouped

In [None]:
def get_space_parameters(nsentences):
    """
    Return the hyperparameter search space used by the random search.

    ``nsentences`` is accepted but not used: the space is fixed.
    The result maps each hyperparameter name to the values it may take, plus a
    fixed ``random_state`` entry.
    """
    return {
        "n_neighbors": range(3, 30, 3),
        "min_cluster_size": range(3, 30, 3),
        "min_samples": range(2, 20, 2),
        "cluster_selection_epsilon": [0.05, 0.1, 0.15, 0.2],
        "random_state": 101,
    }


# Evaluation 

In [None]:

def dbcv_score(clusters, prob_threshold = 0.05):
    """
    Return ``(label_count, validity)`` for a fitted cluster object.

    ``label_count`` is the number of distinct values in ``clusters.labels_``
    and ``validity`` is ``clusters.relative_validity_`` passed through as-is.
    ``prob_threshold`` is accepted but not used.
    """
    distinct_labels = np.unique(clusters.labels_)
    return len(distinct_labels), clusters.relative_validity_


In [None]:
import openai 
def evaluate_cluster_LLM(cluster, openaikey, temperature=0.1):
    """
    Ask an OpenAI chat model to rate one cluster for coherence and homogeneity.

    The cluster is interpolated into the prompt as-is, and the model
    ("gpt-4o-mini") is asked to reply with only the mean of the two scores.

    Args:
        cluster: The cluster content to evaluate; inserted into the prompt
            through f-string formatting.
        openaikey: API key handed to the OpenAI client.
        temperature: Sampling temperature for the completion.

    Returns:
        The content of the first choice's message, exactly as returned by the
        API. It is NOT converted to a number here; callers must parse it.
    """
    prompt =f"""
    Evaluate the coherence and homogeneity of the following cluster of humanitarian documents. 
    These documents need to be clustered in a way that supports the effective generation of questions, ensuring that the cluster is logically consistent and semantically relevant.

    **Evaluation Criteria**:
    - **Coherence**: How logically connected are the items in the cluster? Do the items form a consistent and sensible narrative when taken together?
    - **Homogeneity**: How similar or uniform are the items in the cluster in terms of topic, content, and style? Are there any outliers that introduce dissimilarity?

    **Evaluation Process**:
    1. **Evaluate Coherence**:
       Assess how logically connected and internally consistent the items are. Do the items follow a clear and consistent line of thought or theme? Is there any inconsistency or contradiction within the cluster?

    2. **Evaluate Homogeneity**:
       Determine the degree of similarity or uniformity within the cluster. Are the items highly similar to each other in terms of subject matter, style, or content? If there are significant differences, how do they affect the overall homogeneity?

    3. **Coherence and Homogeneity Scores**:
       Based on your evaluation of both coherence and homogeneity, assign a score for each aspect on a scale from 0 to 1:
       - **0** means poor coherence/homogeneity (e.g., the items are disconnected or highly dissimilar).
       - **1** means excellent coherence/homogeneity (e.g., the items are logically connected and highly similar).
       - A score in between reflects varying degrees of coherence/homogeneity.

    Cluster:
    {cluster}

    Please compute the following:
    
    Coherence score (0 to 1) 
    Homogeneity score (0 to 1)
    and return just the mean of the scores, not other words. 
    
    """
    # A new client is created on every call.
    client = openai.OpenAI(api_key= openaikey)

    # max_tokens=10 keeps the reply short: only a single number is expected.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=10,
        temperature=temperature
    )
    
    # Raw reply text of the first choice (expected to hold the mean score)
    score_text = response.choices[0].message.content
    
    return score_text

In [None]:
def evaluate_multiple_clusters_LLM(sentences, clusters):
    """
    Score every cluster with the LLM and return the mean score.

    Sentences are grouped per label with ``display_clusters`` and each group
    (including the one labelled -1, if present) is sent to
    ``evaluate_cluster_LLM``. The textual reply is parsed into a float.

    Scores are collected in a list rather than written into an array indexed
    by the cluster label: indexing by label only worked by coincidence (the
    label -1 wrapped around to the last slot) and would break for any label
    set that is not exactly -1..k or 0..k.

    Args:
        sentences: Texts that were clustered, aligned with ``clusters.labels_``.
        clusters: Fitted cluster object exposing ``labels_``.

    Returns:
        The mean of the per-cluster scores (``numpy.mean`` of the list).

    Raises:
        ValueError: If a reply is not a number and does not contain exactly
            one number. The offending reply is included in the message.
    """
    grouped = display_clusters(sentences, clusters)
    number_pattern = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)")

    scores = []
    for label, members in grouped.items():
        reply = evaluate_cluster_LLM(members, openaikey=openai_key)
        reply_text = "" if reply is None else str(reply).strip()
        try:
            score = float(reply_text)
        except ValueError:
            # The model sometimes decorates the number ("**0.85**",
            # "Mean: 0.85"). Accept that only when it is unambiguous.
            numbers = number_pattern.findall(reply_text)
            if len(numbers) != 1:
                raise ValueError(
                    f"Could not parse a single numeric score for cluster {label} "
                    f"from LLM reply: {reply!r}"
                ) from None
            score = float(numbers[0])
        scores.append(score)

    mean_score = np.mean(scores)
    print(f"mean value of the scores for each cluster is: {mean_score}")

    return mean_score


In [None]:
import random
from tqdm import trange


def random_search_V2(sentences, embeddings, space, num_evals):
    """
    Random search over the clustering hyperparameters.

    For each of ``num_evals`` runs, one value per hyperparameter is drawn from
    ``space`` with ``random.choice``, the embeddings are clustered, and the
    run is scored as the average of the value returned by ``dbcv_score`` and
    the LLM score. Higher is better: runs are sorted by ``final_cost`` in
    descending order and the first row is reported as the best.

    The UMAP random state is fixed to 101 here; ``space['random_state']`` is
    not read.

    Returns:
        (result_df, best_params_dict): the sorted DataFrame with one row per
        run, and a dict with the hyperparameters, label count, dbcv and final
        cost of the best run.
    """
    sampled_names = ('n_neighbors', 'min_cluster_size', 'min_samples',
                     'cluster_selection_epsilon')
    column_names = ['run_id', *sampled_names,
                    'label_count', 'dbcv', 'llm_score', 'final_cost']

    rows = []
    for run_id in trange(num_evals):
        # Draw order matches the order of `sampled_names`.
        sampled = {name: random.choice(space[name]) for name in sampled_names}

        clusters = generate_clusters(embeddings, random_state=101, **sampled)

        label_count, dbcv = dbcv_score(clusters, prob_threshold=0.05)
        llm_score = evaluate_multiple_clusters_LLM(sentences, clusters)

        final_cost = (dbcv + llm_score) /2. 
        print(f"Final score of the cluster {final_cost}")

        rows.append([run_id, *(sampled[name] for name in sampled_names),
                     label_count, dbcv, llm_score, final_cost])

    result_df = pd.DataFrame(rows, columns=column_names)
    result_df = result_df.sort_values(by='final_cost', ascending=False)

    # Best run = highest final_cost.
    best_row = result_df.iloc[0]
    reported = (*sampled_names, 'label_count', 'dbcv', 'final_cost')
    best_params_dict = {name: best_row[name] for name in reported}

    return result_df, best_params_dict

# Text cleaning and paragraph formation 

In [None]:
from transformers import pipeline 

# Accumulators over all documents of the selected file.
full_paragraphs = []
full_sentences = []
full_paragraphs_metadata = []

for data in sources_metadata.values():
    title = data['title']
    content = data['content']
    url = data['url']

    # Clean first, then tokenize and filter. The previous extra call to
    # get_sentences() was dropped: its result was immediately overwritten, so
    # every document was tokenized twice for nothing.
    processed_text = process_text(content)
    sentences = remove_bad_sentences(processed_text)

    # Paragraph = up to 4 consecutive kept sentences joined with spaces.
    paragraphs = [" ".join(sentences[i:i+4]) for i in range(0, len(sentences), 4)]

    full_paragraphs_metadata.extend(
        {"title": title, "url": url, "paragraph": paragraph} for paragraph in paragraphs
    )

    # Add the sentences and paragraphs to the global lists
    full_sentences.extend(sentences)
    full_paragraphs.extend(paragraphs)
    
    



# Cluster Pipeline

In [None]:
from sentence_transformers import SentenceTransformer

# NOTE(review): this embedding model may expect a task prefix on each input
# text - check the model card before relying on the embeddings.
model_embedding = SentenceTransformer("nomic-ai/modernbert-embed-base")

#model_embedding = SentenceTransformer('BAAI/bge-large-zh-v1.5')
embedding = embed(model_embedding, full_paragraphs)

# A configuration is accepted once it yields at least this many labels.
min_label_count = 6
# Upper bound on search attempts. The loop used to be unbounded, so a dataset
# on which no sampled configuration reaches `min_label_count` would spin
# forever while paying for LLM calls on every pass.
max_search_attempts = 200

# The search space does not depend on the attempt, so build it once.
spcace_parameters = get_space_parameters(nsentences = len(full_paragraphs))

for attempt in range(1, max_search_attempts + 1):
    costs_rdnsearch, best_params = random_search_V2(full_paragraphs, embedding, spcace_parameters, 1)
    print(f"number of clusters {costs_rdnsearch.iloc[0]['label_count']}")
    if costs_rdnsearch.iloc[0]['label_count'] >= min_label_count:
        break
else:
    raise RuntimeError(
        f"No configuration produced at least {min_label_count} labels "
        f"after {max_search_attempts} attempts; adjust the search space or the threshold."
    )

# Re-fit with the accepted parameters. Values read back from the DataFrame row
# are floats, hence the explicit int()/float() casts.
clusters = generate_clusters(embedding, 
                                    n_neighbors = int(best_params['n_neighbors']), 
                                    min_cluster_size = int(best_params['min_cluster_size']), 
                                    min_samples = int(best_params['min_samples']),
                                    cluster_selection_epsilon = float(best_params['cluster_selection_epsilon']),                              
                                    random_state = 101)

In [None]:
# Show the result table of the last random-search run.
costs_rdnsearch

In [None]:
# Show the paragraphs grouped by cluster label.
display_clusters(full_paragraphs, clusters)

In [None]:
# Store the first row of the search results (the selected run) as CSV.
params_dir = folder_cluster_headline_path / "params-clusters-txt"
params_dir.mkdir(parents=True, exist_ok=True)
selected_run = costs_rdnsearch.iloc[0]
selected_run.to_csv(f"{params_dir}/{file_name}.csv")

In [None]:
# Mapping: cluster label -> list of paragraphs assigned to that label.
clusters_dict = display_clusters(full_paragraphs, clusters)

In [None]:
# Show the label -> paragraphs mapping.
clusters_dict

In [None]:
import json
#clusters_dict = display_clusters(full_paragraphs, clusters)


# Cast the labels to built-in ints before serializing (the labels presumably
# come from HDBSCAN as numpy integers, which json does not accept as keys).
clusters_dict = {int(label): members for label, members in clusters_dict.items()}


#cluster_path = f"./Results/Clusters/clusters-{week}-{sector}.json"
folder_cluster_path.mkdir(parents=True, exist_ok=True)
cluster_path = f"{folder_cluster_path}/cluster-{file_name}.json"

# Write the label -> paragraphs mapping for the selected file.
with open(cluster_path, "w") as f:
    f.write(json.dumps(clusters_dict))

    


# Import cluster json 

In [None]:

# Reload the clusters from disk; after the JSON round trip the cluster labels
# are string keys.
with open(cluster_path, "r") as f:
    clusters_data = json.loads(f.read())

# Headline generation for clusters

In [None]:
import openai
def generate_headline(cluster, openaikey, temperature = 0.1):
    """
    Ask an OpenAI chat model ("gpt-4o-mini") for a title describing a cluster.

    Args:
        cluster: Cluster content, interpolated into the prompt as-is.
        openaikey: API key handed to the OpenAI client.
        temperature: Sampling temperature for the completion.

    Returns:
        The reply text of the first choice with surrounding whitespace
        stripped. The reply is limited to 35 tokens.
    """
    prompt = f"""
You will receive a cluster of sentences, produce me a title that best describe the cluster. 
The text are taken from huminatarian sources. 
Only return the title, no additional text. Here is the cluster: 
{cluster}

"""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    completion = openai.OpenAI(api_key=openaikey).chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=35,
        temperature=temperature,
    )

    # Title text returned by the model
    headline = completion.choices[0].message.content.strip()
    return headline

In [None]:
#cluster_data = display_clusters(full_paragraphs, clusters)
# Build, per cluster, the list of its texts plus an LLM-generated headline.
output_dict = {}
for cluster_num, sentences in clusters_data.items():
    headline = generate_headline(sentences, openai_key)
    print("Headline generated correctly")

    # NOTE(review): the id is derived from the cluster number only, so every
    # text of a cluster carries the same id - confirm this is intended.
    article_id = f"article_id_{cluster_num}.txt"
    articles = [{"id": article_id, "text": sentence} for sentence in sentences]

    output_dict[int(cluster_num)] = {
        "cluster_articles": articles,
        "cluster_headline": headline,
    }

In [None]:
# Show the clusters together with their generated headlines.
output_dict

# Headline + cluster saving 

In [None]:
import json 
#cluster_headline_path = f"./Results/Clusters+Headline /clusters-headlines-{week}-{sector}.json"
# Make sure the output folder exists. Previously it was only created as a side
# effect of the cell that saves the search parameters, so skipping that cell
# made this write fail with FileNotFoundError.
folder_cluster_headline_path.mkdir(parents=True, exist_ok=True)
cluster_headline_path = f"{folder_cluster_headline_path}/clusters-{file_name}.json"

# Save the clusters with their headlines (integer keys become JSON strings).
with open(cluster_headline_path, 'w') as handle:
    json.dump(output_dict, handle)
    



In [None]:

# Read the saved clusters + headlines back from disk.
with open(cluster_headline_path, "r") as f:
    headline_data = json.loads(f.read())