In [19]:
import json
import sentence_transformers
import torch
import collections
import igraph
import random
import spacy

Create text

In [20]:
# Path of the JSON export with the book records (relative to the notebook).
file_path = "example_data.json"

# Read the file as UTF-8 explicitly: the records contain non-ASCII characters
# and the platform default encoding is not UTF-8 everywhere.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

In [21]:
# Corpus of (text, (row_index, identifier, title)) pairs, one per English record.
text_l = []

# `i` is the position of the row in `data`, not its position in `text_l`.
for i, row in enumerate(data):
    d = row['metadata']  # Metadata dictionary of the record

    # Keep only records whose 'language' field is present and mentions 'eng'
    if d.get('language') and 'eng' in d['language']:

        # 'subject' is optional: .get() treats a missing key like an empty value
        # instead of raising KeyError (same guard style as 'language' above).
        if d.get('subject'):
            # Title, description and the subject terms appended as keywords
            text = d['title'] + '[SEP]' + d['description'] + ' Keywords: ' + ', '.join(d['subject']) + '.'
        else:
            # No subject terms: title and description only
            text = d['title'] + '[SEP]' + d['description']

        # Reference information kept next to the text
        text_info = (i, d['identifier'], d['title'])

        text_l.append((text, text_info))

text_l[:2]

[('Forum Bildverarbeitung 2024 = Image Processing Forum 2024[SEP]Image processing combines the disciplines of cameras – image-based sensors – with the processing of the sensor data – the images. From this follows the particular attraction of this field. The conference proceedings at hand of the “Image Processing Forum”, which took place on 21.-22.11.2024 in Karlsruhe as a common event of the Karlsruhe Institute of Technology and the Fraunhofer Institute of Optronics, System Technologies and Image Exploitation, contain the articles of the contributions. Keywords: Bildverarbeitung, automated visual inspection, automatische Sichtprüfung, bildgestützte Messtechnik, image processing, image-based measurement technology, machine learning, machine vision, maschinelles Lernen.',
  (0,
   'https://library.oapen.org/handle/20.500.12657/95809',
   'Forum Bildverarbeitung 2024 = Image Processing Forum 2024')),
 ('Marana Dyargali[SEP]Marana Dyargali (marana - first; dyargali - mark/etch/scratch) is 

Create embeddings

In [22]:
# Pre-trained SPECTER sentence-transformer used to embed every document text
model = sentence_transformers.SentenceTransformer('allenai-specter')

# Only the text part of each (text, info) pair is embedded
documents = [text for text, _info in text_l]

# Encode, wrap in a tensor kept on the CPU, then normalise every embedding to
# unit length for the similarity search that follows
embeddings = sentence_transformers.util.normalize_embeddings(
    torch.tensor(model.encode(documents)).to("cpu")
)

In [23]:
def remove_self_matches(doc_matches, max_matches=20):
    """
    Drop each query's self-match and cap the number of remaining matches.

    Sublist ``i`` of ``doc_matches`` holds the hits of query ``i``; the hit whose
    'corpus_id' equals ``i`` is the query matching itself and is removed. The
    remaining hits keep their order and are truncated to ``max_matches``.

    :param doc_matches: A list of lists, where each sublist contains dictionaries with keys 'corpus_id' and 'score'.
    :param max_matches: Maximum number of hits kept per query after the self-match is removed.
        Defaults to 20 (the original fixed cap). ``None`` keeps every remaining hit.
    :return: A dictionary mapping the query index to its filtered list of hit dictionaries.
    :raises ValueError: If ``max_matches`` is negative.
    """
    if max_matches is not None and max_matches < 0:
        raise ValueError("max_matches must be non-negative or None")

    # Slicing never fails on short lists, so no separate length check is needed.
    return {
        index: [entry for entry in sublist if entry['corpus_id'] != index][:max_matches]
        for index, sublist in enumerate(doc_matches)
    }

In [24]:
# Ask for one hit more than is kept: remove_self_matches() filters out the
# entry where a document matches itself and caps the rest.
top_k = 21
raw_hits = sentence_transformers.util.semantic_search(embeddings, embeddings, top_k=top_k)
hits = remove_self_matches(raw_hits)

Create network

In [25]:
# Undirected edge weights: key = sorted (node, node) pair, value = summed score.
# A pair that shows up in both nodes' hit lists contributes its score twice.
sum_d = collections.Counter()
for source_idx, neighbours in hits.items():
    for neighbour in neighbours:
        pair = tuple(sorted((source_idx, neighbour['corpus_id'])))
        sum_d[pair] += neighbour['score']

In [26]:
def create_Network(nodeweight_l, iedgeid_l, iedgeweight_l):
    """Build an undirected igraph graph with weighted nodes and edges

    Parameters
    ----------
    nodeweight_l : list
        One weight per node; the position in the list is the node id.

    iedgeid_l : sequence of pairs
        Edges given as pairs of node ids.

    iedgeweight_l : sequence
        One weight per edge, in the same order as iedgeid_l.

    Returns
    -------
    graph : igraph.Graph object
        Undirected graph whose vertices carry 'weight' and 'name' (the node id)
        and whose edges carry 'weight'.
    """
    n_nodes = len(nodeweight_l)
    graph = igraph.Graph(directed=False)
    graph.add_vertices(n_nodes)
    graph.vs['weight'] = nodeweight_l
    graph.vs['name'] = list(range(n_nodes))  # The vertex name is its own index
    graph.add_edges(iedgeid_l)
    graph.es['weight'] = iedgeweight_l
    return graph

In [27]:
# Split the {edge: weight} mapping into two parallel tuples; every node gets weight 1.
iedge_l, edgeweight_l = zip(*sum_d.items())
ig_network = create_Network([1] * len(hits), iedge_l, edgeweight_l)

In [32]:
# Quick inspection of the network: print every edge with its attributes.
# igraph.plot(ig_network)
# dir(ig_network)
for edge in ig_network.es:
    print(edge)

igraph.Edge(<igraph.Graph object at 0x337c14f50>, 0, {'weight': 1.4690345525741577})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 1, {'weight': 0.7251708507537842})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 2, {'weight': 0.7205752730369568})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 3, {'weight': 0.7125038504600525})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 4, {'weight': 0.7057288885116577})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 5, {'weight': 0.6953625082969666})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 6, {'weight': 0.6934532523155212})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 7, {'weight': 0.6771689653396606})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 8, {'weight': 0.6768426895141602})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 9, {'weight': 0.6659663319587708})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 10, {'weight': 0.6604200601577759})
igraph.Edge(<igraph.Graph object at 0x337c14f50>, 11, {'weight':

Create clustering

In [33]:
def get_Partition_Class(ig_network, resolution, random_seed=0, node_weights=None, weights=None, n_iterations=2):
    """Runs a seeded Leiden clustering on the network

    Parameters
    ----------
    ig_network : igraph.Graph object
        Igraph representation of the network to cluster.

    resolution: float
        Resolution to be used in the Leiden algorithm clustering.

    random_seed: int, optional
        Seed given to the random module before clustering.

    node_weights, weights, n_iterations: optional
        Forwarded unchanged to igraph.Graph.community_leiden.

    Returns
    -------
    partition : igraph.clustering.VertexClustering object
        Result of community_leiden for the given settings.

    Notes
    -------
    Requires the igraph module.
    Requires the random module.
    igraph is told to draw its random numbers from the random module, which is
    then seeded, so that repeated calls with the same seed are replicable.
    """
    igraph.set_random_number_generator(random)
    random.seed(random_seed)
    return ig_network.community_leiden(
        resolution=resolution,
        node_weights=node_weights,
        weights=weights,
        n_iterations=n_iterations,
    )

import copy

def join_Clusters(clu_d, con_d, n_desired, resolution): # No more need for the resolution argument, discontinue in the future
    """Merges the smallest clusters into their best neighbour until n_desired clusters are left

    Parameters
    ----------
    clu_d : dict
        Cluster name -> collection of node names (sets when built by c_Cluster_D).

    con_d: dict of dict
        con_d[a][b] is the connection value between clusters a and b
        (as built by c_Connections_D). Pairs without connection may be absent.

    n_desired: int
        The loop stops as soon as no more than n_desired clusters remain.

    resolution: float
        Unused by the current implementation; kept so existing calls keep working.

    Returns
    -------
    merge_dict: dict
        'jclu_d': clusters that remain after merging (name -> node names).
        'jrem_d': clusters that were set aside because they could not be merged:
                  they were not in the connections dict, or all their
                  connections were 0.
        'jcon_d': connections between the remaining clusters. Every pair of
                  remaining clusters is present; missing connections are 0.

    Notes
    -------
    Requires the copy module. The inputs are deep-copied and not modified.
    Each iteration:
        1- Pick the smallest cluster (c_m) with get_Smallest_Cluster, which only
           re-sorts the sizes when the cached ranking is stale.
        2- Score every neighbour of c_m that has a non-zero connection with
           get_Merging_Resolution. Neighbours with connection 0 are skipped so
           that an unconnected cluster can never be chosen.
        3- If there is at least one score, merge c_m into the best neighbour
           (c_b); otherwise move c_m to jrem_d. Both cases reduce the number of
           clusters in jclu_d by one.
        4- Drop c_m from the head of the cached ranking.
    When merging, the connections of c_m are added to those of c_b instead of
    being recomputed from the network (see merge_Con_D).
    c_m: Name of the cluster to merge
    c_b: Name of the cluster with the best score
    c_m_c: Name of a given cluster that is also conected to c_m
    jclu_s: Cluster name -> size, kept in sync so sizes are not recomputed.
    """
    jclu_d = copy.deepcopy(clu_d)
    jcon_d = copy.deepcopy(con_d)
    jrem_d = {}
    jclu_s = {name: len(members) for name, members in jclu_d.items()}  # Sizes of the live clusters
    ref_jclu_s = dict_As_Sorted_Tuples(jclu_s)  # Cached ranking, smallest cluster first
    while len(jclu_d) > n_desired:
        ref_jclu_s, c_m = get_Smallest_Cluster(ref_jclu_s, jclu_s)
        # Scores of the connected neighbours; empty when c_m has no entry in
        # jcon_d or only zero-valued connections.
        score_d = {
            c_m_c: get_Merging_Resolution(n_con, jclu_s[c_m], jclu_s[c_m_c])
            for c_m_c, n_con in jcon_d.get(c_m, {}).items()
            if n_con != 0
        }
        if score_d:
            c_b = max_Key_By_Value(score_d)
            jclu_d, jclu_s, jcon_d = upd_Merge(c_m, c_b, jclu_d, jclu_s, jcon_d)
        else:
            jclu_d, jclu_s, jcon_d, jrem_d = upd_Remove(c_m, jclu_d, jclu_s, jcon_d, jrem_d)
        del ref_jclu_s[0]  # c_m is gone either way, so it leaves the ranking
    return {'jclu_d': jclu_d, 'jrem_d': jrem_d, 'jcon_d': clean_Con_D(jcon_d, jclu_d)}

def upd_Remove(c_m, jclu_d, jclu_s, jcon_d, jrem_d):
    """Moves a cluster that can not be merged out of the working dictionaries

    Parameters
    ----------
    c_m : int
        Name of the cluster to set aside.

    jclu_d: dict
        Cluster name -> node names of the clusters still in play.

    jclu_s: dict of int
        Cluster name -> cluster size.

    jcon_d: dict of dict
        jcon_d[a][b] is the connection value between clusters a and b.

    jrem_d: dict
        Cluster name -> node names of the clusters already set aside.

    Returns
    -------
    jclu_d, jclu_s, jcon_d, jrem_d
        The same four dictionaries, modified in place: c_m is copied to jrem_d
        and deleted from jclu_d and jclu_s. If c_m has an entry in jcon_d, that
        entry and every neighbour's connection back to c_m are deleted too.

    Notes
    -------
    c_m_c: Name of a given cluster that is also conected to c_m
    """
    jrem_d[c_m] = jclu_d[c_m]
    jclu_s.pop(c_m)
    jclu_d.pop(c_m)
    if c_m not in jcon_d:
        return jclu_d, jclu_s, jcon_d, jrem_d
    for c_m_c in jcon_d[c_m]:
        jcon_d[c_m_c].pop(c_m)  # Forget c_m on the neighbour's side
    jcon_d.pop(c_m)  # Forget c_m's own connections
    return jclu_d, jclu_s, jcon_d, jrem_d

def upd_Merge(c_m, c_b, jclu_d, jclu_s, jcon_d):
    """Merges cluster c_m into cluster c_b in the three working dictionaries

    Parameters
    ----------
    c_m : int
        Name of the cluster to merge (it disappears).

    c_b : int
        Name of the cluster with the best conectivity score (it absorbs c_m).

    jclu_d: dict
        Cluster name -> node names.

    jclu_s: dict of int
        Cluster name -> cluster size.

    jcon_d: dict of dict
        jcon_d[a][b] is the connection value between clusters a and b.

    Returns
    -------
    jclu_d, jclu_s, jcon_d
        The results of merge_Clu_D, merge_Clu_S and merge_Con_D, applied in
        that order.
    """
    return (
        merge_Clu_D(c_m, c_b, jclu_d),
        merge_Clu_S(c_m, c_b, jclu_s),
        merge_Con_D(c_m, c_b, jcon_d),
    )

def merge_Clu_D(c_m, c_b, jclu_d):
    """Moves the nodes of c_m into c_b and drops c_m from jclu_d

    Parameters
    ----------
    c_m : int
        Name of the cluster to merge

    c_b : int
        Name of the cluster with the best conectivity score

    jclu_d: dict
        Cluster name -> node names. The values must support .update()
        (c_Cluster_D builds them as sets).

    Returns
    -------
    jclu_d: dict
        The same dictionary, modified in place.
    """
    jclu_d[c_b].update(jclu_d.pop(c_m))
    return jclu_d

def merge_Clu_S(c_m, c_b, jclu_s):
    """Adds the size of c_m to the size of c_b and drops c_m from jclu_s

    Parameters
    ----------
    c_m : int
        Name of the cluster to merge

    c_b : int
        Name of the cluster with the best conectivity score

    jclu_s: dict of int
        Cluster name -> cluster size.

    Returns
    -------
    jclu_s: dict of int
        The same dictionary, modified in place.
    """
    merged_size = jclu_s[c_b] + jclu_s[c_m]
    jclu_s[c_b] = merged_size
    del jclu_s[c_m]
    return jclu_s

def merge_Con_D(c_m, c_b, jcon_d):
    """Update jcon_d so that the connections of c_m are transferred to c_b

    Parameters
    ----------
    c_m : int
        Name of the cluster to merge

    c_b : int
        Name of the cluster with the best conectivity score

    jcon_d: dict of dict
        jcon_d[a][b] is the connection value between clusters a and b.

    Returns
    -------
    jcon_d: dict of dict
        The same dictionary, modified in place: c_m no longer appears as a key
        or as a neighbour, and its connection values were added to c_b's.

    Notes
    -------
    c_m_c:  Name of a given cluster that is also conected to c_m
    The connection between c_m and c_b itself is dropped, not transferred,
    so c_b never gets a connection to itself.
    """
    links = jcon_d[c_m]  # Alias only; values are re-read from it on every use
    for c_m_c in links:
        if c_m_c != c_b:
            if c_m_c not in jcon_d[c_b]:
                # First connection between c_b and c_m_c: start both directions at 0
                jcon_d[c_b][c_m_c] = jcon_d[c_m_c][c_b] = 0
            jcon_d[c_b][c_m_c] += links[c_m_c]
            jcon_d[c_m_c][c_b] += links[c_m_c]
        del jcon_d[c_m_c][c_m]  # The neighbour forgets c_m
    del jcon_d[c_m]  # c_m's own connections are gone
    return jcon_d

def get_Smallest_Cluster(ref_jclu_s, jclu_s):
    """Get the smallest cluster, re-sorting the sizes only when needed

    Parameters
    ----------
    ref_jclu_s : list of tuples
        Cached list of (cluster name, cluster size), smallest first.

    jclu_s : dict of int
        Current cluster name -> cluster size.

    Returns
    -------
    out_ref_jclu_s:  list of tuples
        ref_jclu_s itself when its head is still up to date, otherwise a fresh
        ranking built from jclu_s with dict_As_Sorted_Tuples.

    out_c_m: int
        Name at the head of the returned ranking, i.e. the cluster to merge.

    Notes
    -------
    The head of the cached ranking is trusted when its cached size equals its
    current size in jclu_s. Only in the other case the sizes are sorted again.
    """
    head_name, head_size = ref_jclu_s[0]
    if jclu_s[head_name] == head_size:
        ranking = ref_jclu_s
    else:
        ranking = dict_As_Sorted_Tuples(jclu_s)
    return ranking, ranking[0][0]

def get_Merging_Resolution(n_con, c_1_size, c_2_size):
    """Get the connection value per possible pair of nodes between two clusters

    Parameters
    ----------
    n_con : int
        Connection value between the clusters

    c_1_size : int
        Size of one cluster

    c_2_size : int
        Size of the other cluster

    Returns
    -------
    resolution:  float
        n_con divided by c_1_size*c_2_size.

    Notes
    -------
    The value is used as the merging score: among the candidate clusters, the
    one with the highest value is chosen. It is the resolution at which the
    change in the clustering score after merging would be 0.
    """
    return n_con / (c_1_size * c_2_size)

def get_Score(n_con, c_1_size, c_2_size, resolution): # Discontinued use in the current pipeline
    """Get the change in the clustering score after merging two clusters

    Parameters
    ----------
    n_con : int
        Number of conections between the clusters

    c_1_size : int
        Size of one cluster

    c_2_size : int
        Size of the other cluster

    resolution: float
        Resolution value of the Leiden algorithm

    Returns
    -------
    score:  float
        Change in the clustering score after merging the clusters

    Notes
    -------
    Score as used by the Leiden Algorihtm (https://onlinelibrary.wiley.com/doi/full/10.1002/asi.22748, equation 4).
    Only the c_1_size*c_2_size pairs of nodes that become part of the same
    cluster are considered: each connected pair adds (1-resolution) and each
    non connected pair adds (-resolution).
    """
    new_pairs = c_1_size * c_2_size
    connected_gain = n_con * (1 - resolution)
    unconnected_penalty = (new_pairs - n_con) * resolution
    return connected_gain - unconnected_penalty

def clean_Con_D(jcon_d, jclu_d):
    """Builds a complete connection dictionary restricted to the clusters in jclu_d

    Parameters
    ----------
    jcon_d: dict of dict
        jcon_d[a][b] is the connection value between clusters a and b.
        Clusters and pairs may be missing.

    jclu_d: dict
        Dictionary whose keys are the names of the clusters to keep.

    Returns
    -------
    cjcon_d: dict of dict
        For every ordered pair of different clusters of jclu_d, the value found
        in jcon_d, or 0 when jcon_d has no value for that pair. Clusters that
        are not keys of jclu_d are left out.

    Notes
    -------
    This step is necesary so jcon_d can be used in follow up clusterings.
    """
    names = list(jclu_d)
    return {
        c_1: {
            c_2: jcon_d.get(c_1, {}).get(c_2, 0)
            for c_2 in names
            if c_2 != c_1  # No connection from a cluster to itself
        }
        for c_1 in names
    }

def max_Key_By_Value(dictionary):
    """Find the key that holds the maximum value of a dict

    Parameters
    ----------
    dictionary: dict
        A given dictionary

    Returns
    -------
    max_key: any
        The key whose value is the maximum. When several keys share the
        maximum value, the largest of those keys is returned.
    """
    best_value = max(dictionary.values())
    return max(key for key, value in dictionary.items() if value == best_value)

def dict_As_Sorted_Tuples(jclu_s):
    """Turns jclu_s into a list of tuples sorted by size

    Parameters
    ----------
    jclu_s : dict of int
        Dictionary of the size of the clusters

    Returns
    -------
    sorted_tuples: list of tuples
         List of (cluster name, cluster size), smallest size first. Clusters of
         equal size keep the order they have in jclu_s.

    """
    pairs = list(jclu_s.items())
    pairs.sort(key=lambda pair: pair[1])
    return pairs

def c_Cluster_D(partition):
    """Creates clusters dictionary

    Parameters
    ----------
    partition : igraph.clustering.VertexClustering object.
        Partition of the nodes into clusters.

    Returns
    -------
    cluster_d : dict of set
        Cluster index -> set with the 'name' attribute of every node in the cluster.

    Notes
    -------
    Iterating over the partition gives, for each cluster, the indices of its
    nodes. The position of the cluster in that iteration becomes the de-facto
    cluster name, and every node index is translated to the node name through
    partition.graph.vs.
    """
    return {
        cluster_i: {partition.graph.vs[node_i]['name'] for node_i in cluster_nodes_i}
        for cluster_i, cluster_nodes_i in enumerate(partition)
    }

def c_Connections_D(partition):
    """Creates connections dictionary

    Parameters
    ----------
    partition : igraph.clustering.VertexClustering object
        Partition of the nodes into clusters.

    Returns
    -------
    con_d: dict of dict
        con_d[a][b] is the 'weight' of the edge between clusters a and b in the
        cluster graph. Both orders are stored (con_d[a][b] == con_d[b][a]).
        Pairs of clusters without an edge are absent, and edges from a cluster
        to itself are skipped.

    Notes
    -------
    partition.cluster_graph(combine_edges=sum) is asked to sum the attributes of
    the edges it combines, so 'weight' is the summed weight of the original
    edges between two clusters. It equals the number of edges only if every
    original edge has weight 1.
    The index of the clusters is the same as in the c_Cluster_D function output.
    c_1_i = Cluster 1 index
    c_2_i = Cluster 2 index
    """
    con_d = {}
    for edge in partition.cluster_graph(combine_edges=sum).es:
        c_1_i, c_2_i = edge.source, edge.target
        n_conn = edge['weight']
        if c_1_i == c_2_i:
            continue  # Edge inside one cluster
        con_d.setdefault(c_1_i, {})
        con_d.setdefault(c_2_i, {})
        con_d[c_1_i][c_2_i] = n_conn
        con_d[c_2_i][c_1_i] = n_conn
    return con_d

In [None]:
nclusters = 5 # Number of clusters to be used in the clustering.
resolution = 0.5 # Resolution to be used in the Leiden algorithm clustering.
# NOTE(review): no `weights` argument is passed, so the edge 'weight' attribute
# is not handed to the clustering here - confirm this is intended.
partition = get_Partition_Class(ig_network, resolution, n_iterations=3)

# Sizes of the `nclusters` largest clusters, and the share of all nodes they hold.
largest_sizes = sorted(partition.sizes(), reverse=True)[:nclusters]
print(largest_sizes)
print(sum(largest_sizes)/len(ig_network.vs()))

[25, 17, 14, 10, 10]
0.9156626506024096


In [35]:
# Clusters and their pairwise connections as found by the Leiden partition
clu_d = c_Cluster_D(partition)
con_d = c_Connections_D(partition)
# Merge the smallest clusters until `nclusters` are left (clu_d itself is not modified)
merging_data = join_Clusters(clu_d=clu_d, con_d=con_d, n_desired=nclusters, resolution=resolution)

Label clusters

In [37]:
nlp = spacy.load("en_core_web_sm")

def extract_noun_phrases(text, nlp=nlp):
    """
    Extract all possible noun phrases from a given text using spaCy.
    A noun phrase is defined as any sequence of nouns and adjectives that ends in a noun,
    including all sub-phrases. The text is lower-cased before it is processed.

    Args:
        text (str): Input text to process
        nlp: spaCy pipeline used to tag the text (defaults to the module-level one)

    Returns:
        list: Unique noun phrases in order of first appearance. For every noun,
        the multi-word phrases ending in it come first (shortest first),
        followed by the noun itself.

    Example:
        >>> text = "The big brown dog"
        >>> extract_noun_phrases(text)
        ['brown dog', 'big brown dog', 'dog']
    """
    doc = nlp(text.lower())

    noun_phrases = []

    for i, token in enumerate(doc):
        # Only nouns can close a phrase
        if token.pos_ not in ('NOUN', 'PROPN'):
            continue
        # Walk backwards over the adjectives/nouns directly before the noun;
        # every extension is a phrase of 2+ words ending in this noun
        phrase_words = [token.text]
        j = i - 1
        while j >= 0 and doc[j].pos_ in ('ADJ', 'NOUN', 'PROPN'):
            phrase_words.insert(0, doc[j].text)
            noun_phrases.append(' '.join(phrase_words))
            j -= 1
        # Add the single noun itself
        noun_phrases.append(token.text)

    # Remove duplicates while preserving order. A list (not a set) is returned
    # on purpose: the iteration order of a set of strings changes between
    # interpreter runs, which would make downstream tie-breaking (e.g. the
    # choice between equally scored labels) non-reproducible.
    return list(dict.fromkeys(noun_phrases))

In [38]:
# Noun phrases of every document, keyed by the document's position in text_l.
# The '[SEP]' marker between title and description is replaced by a sentence break.
np_d = {
    node: extract_noun_phrases(text.replace("[SEP]", ". "))
    for node, (text, _info) in enumerate(text_l)
}

In [39]:
# Corpus-wide count of every noun phrase over the per-document collections in np_d
all_np = collections.Counter(np for np_s in np_d.values() for np in np_s)

In [40]:
# Same count as all_np, but restricted to the documents of each merged cluster
cluster_np_d = {
    cluster: collections.Counter(np for node in node_s for np in np_d[node])
    for cluster, node_s in merging_data['jclu_d'].items()
}

In [None]:
m = 2 # Smoothing parameter
len_all = len(text_l)
# cluster -> noun phrase -> (local frequency,
#                            local frequency / global frequency,
#                            local frequency / smoothed global frequency)
cluster_np_score_d = {}
for cluster, np_count in cluster_np_d.items():
    clu_len = len(merging_data['jclu_d'][cluster])
    scores = {}
    for np, count in np_count.items():
        local_frecuency = count/clu_len
        corpus_count = all_np[np]
        scores[np] = (
            local_frecuency,
            local_frecuency/(corpus_count/len_all),
            # The smoothed count gets m added but is capped at the corpus size
            local_frecuency/(min(corpus_count+m, len_all)/len_all),
        )
    cluster_np_score_d[cluster] = scores

In [42]:
# Header row, followed by one tab-separated row per merged cluster
headers = 'id\tlabel\tdescription\tx\ty\tcluster\tweight<Size>\n'
for i, local_cluster_np_d in cluster_np_score_d.items():
    index = i
    # Rank the noun phrases once by the smoothed score (third tuple element), best first
    ranked_np = sorted(local_cluster_np_d.items(), key=lambda x: -x[1][2])
    label = ranked_np[0][0]
    # HTML table with the five best noun phrases of the cluster
    description = '<table><tr><td>Over-rep.</td><td>Cover</td><td>Label</td></tr>'
    for row in ranked_np[:5]:
         description += f"<tr><td>{round(row[1][1])}</td><td>{format(row[1][0], '.2f')}</td><td>{row[0]}</td></tr>"
    description += '</table>'
    x = i
    y = i
    cluster = 1
    # Size of the cluster AFTER merging. clu_d holds the clusters before the
    # merge, so it under-reports every cluster that absorbed another one.
    size = len(merging_data['jclu_d'][i])
    headers += f"{index}\t{label}\t{description}\t{x}\t{y}\t{cluster}\t{size}\n"
# Labels come from free text and may contain non-ASCII characters
with open('map.txt', 'w', encoding='utf-8') as f:
    f.write(headers)

In [43]:
# One "cluster<TAB>cluster<TAB>connection value" line per ordered pair of merged clusters
net = ''.join(
    f"{x}\t{y}\t{count}\n"
    for x, y_d in merging_data['jcon_d'].items()
    for y, count in y_d.items()
)
with open('net.txt', 'w') as f:
    f.write(net)

In [44]:
# Cluster label -> list of the books (title and identifier) in the merged cluster
cluster_book_d = {}
for cluster, local_cluster_np_d in cluster_np_score_d.items():
    # The label is the noun phrase with the best smoothed score (third tuple element)
    ranked_np = sorted(local_cluster_np_d.items(), key=lambda item: -item[1][2])
    label = ranked_np[0][0]
    books = []
    for node in merging_data['jclu_d'][cluster]:
        _text, (_row, identifier, title) = text_l[node]
        books.append({'title': title, 'id': identifier})
    # NOTE(review): two clusters with the same label would overwrite each other here
    cluster_book_d[label] = books
with open("cluster_book_d.json", "w") as json_file:
    json.dump(cluster_book_d, json_file, indent=4)