# Cosine Davies-Bouldin Index (cDBI)

In [1]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
from sklearn.metrics import pairwise_distances
import numpy as np

# Sample data
documents = [
    ["cat", "say", "meow"], 
    ["dog", "say", "woof"], 
    ["cat", "dog", "say", "hello"],
    ["hello", "world"],
    ["python", "programming"],
    ["hello", "cat"]
]

# Create a dictionary and corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Function to compute the Cosine Davies-Bouldin Index (cDBI)
def compute_cDBI(model, corpus, num_topics, dictionary):
    # Get the topic vectors in a dense format
    topic_vectors = []
    for i in range(num_topics):
        topic_vector = np.zeros(len(dictionary))
        for term_id, weight in model.get_topic_terms(i, topn=len(dictionary)):
            topic_vector[term_id] = weight
        topic_vectors.append(topic_vector)
    
    # Convert topic vectors to a numpy array
    topic_matrix = np.array(topic_vectors)
    
    # Compute the pairwise cosine distances between topic vectors
    dist_matrix = pairwise_distances(topic_matrix, metric='cosine')
    
    # Compute intra-cluster distances
    s_i = np.zeros(num_topics)
    for i in range(num_topics):
        topic_i = topic_matrix[i]
        sims = [1 - dist_matrix[i, j] for j in range(num_topics) if i != j]  # Cosine similarity is 1 - cosine distance
        s_i[i] = np.mean(sims)
    
    # Compute inter-cluster distances
    r_ij = np.zeros((num_topics, num_topics))
    for i in range(num_topics):
        for j in range(num_topics):
            if i != j:
                r_ij[i, j] = (s_i[i] + s_i[j]) / dist_matrix[i, j]
    
    # Compute the Davies-Bouldin index
    dbi = np.mean(np.max(r_ij, axis=1))
    return dbi

# Function to find the optimal number of topics
def find_optimal_topics(dictionary, corpus, start, limit, step):
    dbi_scores = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=10)
        model_list.append(model)
        dbi = compute_cDBI(model, corpus, num_topics, dictionary)
        dbi_scores.append(dbi)
        print(f"Num Topics: {num_topics}, DBI: {dbi}")
    
    # Find the model with the lowest DBI score
    optimal_index = np.argmin(dbi_scores)
    optimal_model = model_list[optimal_index]
    optimal_num_topics = start + optimal_index * step
    
    return optimal_model, optimal_num_topics, dbi_scores

# Parameters
start = 4
limit = 20
step = 1

# Find the optimal number of topics
optimal_model, optimal_num_topics, dbi_scores = find_optimal_topics(dictionary, corpus, start, limit, step)

print(f"The optimal number of topics is {optimal_num_topics}")


Num Topics: 4, DBI: 7.149159352107814
Num Topics: 5, DBI: 4.978887808373736
Num Topics: 6, DBI: 4.118470996113441
Num Topics: 7, DBI: 98752095981.88623
Num Topics: 8, DBI: 548417996503.3175
Num Topics: 9, DBI: 2074578784360.8801
Num Topics: 10, DBI: 47505028626530.19
Num Topics: 11, DBI: 132406534664716.81
Num Topics: 12, DBI: 526860185639126.56


  r_ij[i, j] = (s_i[i] + s_i[j]) / dist_matrix[i, j]
  r_ij[i, j] = (s_i[i] + s_i[j]) / dist_matrix[i, j]


Num Topics: 13, DBI: inf
Num Topics: 14, DBI: inf
Num Topics: 15, DBI: 1.044636158759646e+16
Num Topics: 16, DBI: 1.0801524752239252e+16
Num Topics: 17, DBI: 1.2617582541116538e+16
Num Topics: 18, DBI: 1.0208188537109864e+16
Num Topics: 19, DBI: 1.2421416029592444e+16
The optimal number of topics is 6


: 

In [10]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
import numpy as np
from sklearn.metrics import pairwise_distances
from gensim.models.keyedvectors import KeyedVectors

# Load GloVe vectors
glove_vectors = KeyedVectors.load_word2vec_format(r'C:\dev\NLP2RE_Sandbox\data\glove.6B\glove.6B.100d.txt', binary=False, no_header=True)

# Sample data
documents = [
    ["cat", "say", "meow"], 
    ["dog", "say", "woof"], 
    ["cat", "dog", "say", "hello"],
    ["hello", "world"],
    ["python", "programming"],
    ["hello", "cat"]
]

# Create a dictionary and corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

def get_glove_vector(word, glove_vectors):
    try:
        return glove_vectors[word]
    except KeyError:
        return np.zeros(glove_vectors.vector_size)

def compute_cDBI(model, corpus, num_topics, dictionary, glove_vectors):
    topic_vectors = []
    for i in range(num_topics):
        words = model.show_topic(i, topn=len(dictionary))
        topic_vector = np.zeros(glove_vectors.vector_size)
        for word, weight in words:
            topic_vector += weight * get_glove_vector(word, glove_vectors)
        topic_vectors.append(topic_vector)
    
    topic_matrix = np.array(topic_vectors)
    dist_matrix = pairwise_distances(topic_matrix, metric='cosine')

    s_i = np.zeros(num_topics)
    for i in range(num_topics):
        topic_i = topic_matrix[i]
        sims = [1 - dist_matrix[i, j] for j in range(num_topics) if i != j]
        s_i[i] = np.mean(sims)

    r_ij = np.zeros((num_topics, num_topics))
    for i in range(num_topics):
        for j in range(num_topics):
            if i != j:
                r_ij[i, j] = (s_i[i] + s_i[j]) / dist_matrix[i, j]

    dbi = np.mean(np.max(r_ij, axis=1))
    return dbi

def find_optimal_topics(dictionary, corpus, start, limit, step, glove_vectors):
    dbi_scores = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=10)
        model_list.append(model)
        dbi = compute_cDBI(model, corpus, num_topics, dictionary, glove_vectors)
        dbi_scores.append(dbi)
        print(f"Num Topics: {num_topics}, DBI: {dbi}")
    
    optimal_index = np.argmin(dbi_scores)
    optimal_model = model_list[optimal_index]
    optimal_num_topics = start + optimal_index * step
    
    return optimal_model, optimal_num_topics, dbi_scores

start = 4
limit = 20
step = 1

optimal_model, optimal_num_topics, dbi_scores = find_optimal_topics(dictionary, corpus, start, limit, step, glove_vectors)

print(f"The optimal number of topics is {optimal_num_topics}")


Num Topics: 4, DBI: 34.90387089404365
Num Topics: 5, DBI: 3393550061.581806
Num Topics: 6, DBI: 61226950769.09612
Num Topics: 7, DBI: 703670111013.2266
Num Topics: 8, DBI: 2650302116961.358
Num Topics: 9, DBI: 8252209874335.478
Num Topics: 10, DBI: 82372548298727.42
Num Topics: 11, DBI: 821912823956746.5
Num Topics: 12, DBI: 2473974643906416.5
Num Topics: 13, DBI: inf
Num Topics: 14, DBI: inf
Num Topics: 15, DBI: 6284905830643607.0
Num Topics: 16, DBI: 1.2957511201452448e+16


  r_ij[i, j] = (s_i[i] + s_i[j]) / dist_matrix[i, j]
  r_ij[i, j] = (s_i[i] + s_i[j]) / dist_matrix[i, j]


Num Topics: 17, DBI: 1.0110709905874016e+16
Num Topics: 18, DBI: 8756611477599767.0
Num Topics: 19, DBI: 1.131077714864021e+16
The optimal number of topics is 4
