In [1]:
# One-time setup (uncomment to run): %pip install embedded-topic-model

In [2]:
import random
import pickle
from sklearn.feature_extraction.text import CountVectorizer

from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim import corpora
from embedded_topic_model.models.etm import ETM
from gensim.models.fasttext import load_facebook_vectors
import numpy as np
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO
import IPython.display as ipd

from collections import Counter
import os
import gc
import time
import warnings
warnings.filterwarnings(action='ignore')

## Data Prep

In [3]:
# Load the pre-tokenized corpus.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced locally or come from a trusted source.
# Presumably `docs` is a list of token lists (it is passed to gensim's
# Dictionary further down) -- confirm against the script that wrote the pickle.
with open('./pickled/banfake_tokenized.pickle', 'rb') as f:
    docs = pickle.load(f)

num_topics = 12

# Seed the RNG before shuffling so that "Restart Kernel -> Run All" produces
# the same document order (and therefore the same dictionary ids) every time.
SEED = 42
random.seed(SEED)
random.shuffle(docs)

In [4]:
#########################################################################################################################################

In [5]:
def evaluate_topic_words(topic_words, texts, topk=10):
    """Score a set of topics with coherence and diversity metrics.

    Parameters
    ----------
    topic_words : list of list of str
        The top words of each topic, one inner list per topic.
    texts : list of list of str
        Tokenized reference documents used for the coherence scores. When
        this is empty/None, or yields an empty dictionary, the two coherence
        entries are left out of the result.
    topk : int, default 10
        Number of top words per topic handed to the OCTIS ``TopicDiversity``
        and ``InvertedRBO`` metrics. It does not affect the gensim coherence
        scores. NOTE(review): OCTIS presumably requires every topic to
        contain at least ``topk`` words -- confirm before lowering the
        number of words returned by the model.

    Returns
    -------
    dict
        Always contains ``'topic_diversity'`` and ``'IRBO'``; additionally
        contains ``'coherence_c_v'`` and ``'coherence_npmi'`` when reference
        texts are available.
    """
    results = {}

    # Coherence needs reference texts; skip building the dictionary entirely
    # when there is nothing to score against.
    if texts:
        dictionary = Dictionary(texts)
        if dictionary:
            for key, measure in (('coherence_c_v', 'c_v'),
                                 ('coherence_npmi', 'c_npmi')):
                results[key] = CoherenceModel(
                    topics=topic_words,
                    texts=texts,
                    dictionary=dictionary,
                    coherence=measure,
                ).get_coherence()

    # The OCTIS metrics read the topics from a dict under the 'topics' key.
    model_output = {'topics': topic_words}

    results['topic_diversity'] = TopicDiversity(topk=topk).score(model_output)
    results['IRBO'] = InvertedRBO(topk=topk).score(model_output)

    return results

## ETM

In [6]:
# Build the vocabulary from the tokenized documents and prune it with
# gensim's filter_extremes (no_below=3, no_above=0.8).
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=3, no_above=0.8)

# Token strings ordered by their integer id, so vocab[i] is the word for id i.
vocab = [dictionary[token_id] for token_id in range(len(dictionary))]

# Bag-of-words per document; `corpus` stays aligned one-to-one with `docs`.
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Documents whose tokens were all pruned end up with an empty bag-of-words.
# They carry no training signal and presumably lead to a division by zero if
# the model normalizes each bag-of-words by its total count -- TODO confirm
# against the ETM implementation. Keep them out of the training set.
train_corpus = [bow for bow in corpus if bow]

tokens_list = [[token_id for token_id, _ in bow] for bow in train_corpus]
counts_list = [[count for _, count in bow] for bow in train_corpus]


def _as_object_array(rows):
    """Pack variable-length integer rows into a 1-D object array of int64 arrays.

    Filling a pre-allocated object array element by element keeps the result
    one-dimensional even when every row happens to have the same length;
    ``np.array(list_of_arrays, dtype=object)`` would produce a 2-D array then.
    """
    packed = np.empty(len(rows), dtype=object)
    for i, row in enumerate(rows):
        packed[i] = np.array(row, dtype=np.int64)
    return packed


tokens_np = _as_object_array(tokens_list)
counts_np = _as_object_array(counts_list)

# Training input passed to ETM.fit: parallel arrays of token ids and counts.
train_data = {
    "tokens": tokens_np,
    "counts": counts_np,
}

In [None]:
# Pretrained fastText vectors in Facebook's .bin format (presumably the
# Bengali "cc.bn.300" model, judging by the file name -- confirm).
# The location can be overridden with the FASTTEXT_BN_PATH environment
# variable; the default is the original machine-specific path.
FASTTEXT_PATH = os.environ.get(
    "FASTTEXT_BN_PATH",
    "/home/farhana/bnlp/models/fasttext_cc.bn.300.bin",
)
ft = load_facebook_vectors(FASTTEXT_PATH)

In [None]:
# Time the training with perf_counter: it is monotonic and high-resolution,
# so the measurement is not affected by system clock adjustments.
start_time = time.perf_counter()

etm = ETM(vocab, ft, num_topics=num_topics)
etm.fit(train_data)

end_time = time.perf_counter()
runtime = end_time - start_time  # elapsed training time in seconds

# Top words per learned topic; displayed as the cell output.
topic_words = etm.get_topics()
topic_words

In [None]:
# Score the learned topics against the tokenized documents and attach the
# measured training time as the last entry of the result dict.
results = {**evaluate_topic_words(topic_words, docs), 'runtime': runtime}
results

In [None]:
# Audible notification: synthesize a short sine tone and auto-play it when
# this cell runs.
duration = 2            # seconds
sampling_rate = 44100   # samples per second
frequency = 440.0       # Hz

# Sample instants in [0, duration), excluding the end point.
n_samples = int(sampling_rate * duration)
t = np.linspace(0, duration, num=n_samples, endpoint=False)

# Half-amplitude sine wave at the chosen frequency.
wave = 0.5 * np.sin(2 * np.pi * frequency * t)

audio = ipd.Audio(data=wave, rate=sampling_rate, autoplay=True)
display(audio)