In [1]:
import random
import pickle
from sklearn.feature_extraction.text import CountVectorizer

from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim import corpora

from gensim.models.fasttext import load_facebook_vectors
import numpy as np
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO
import IPython.display as ipd
from octis.dataset.dataset import Dataset
from octis.models.ProdLDA import ProdLDA
from octis.models.NeuralLDA import NeuralLDA

import csv
from collections import Counter
import os
import gc
import time
import warnings
warnings.filterwarnings(action='ignore')

## Data Prep

In [2]:
with open('./pickled/banfake_tokenized.pickle', 'rb') as f:
    docs = pickle.load(f)

random.shuffle(docs)

In [3]:
out_folder = './data/banfake'
sanitized_docs = []
for doc in docs:
    sanitized_doc = [token.replace('\t', ' ') for token in doc]
    sanitized_docs.append(sanitized_doc)

with open(f"{out_folder}/corpus.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    for doc in sanitized_docs:
        doc_text = " ".join(doc)
        if doc_text.strip():  
            writer.writerow([doc_text, "train"])

vocab = sorted({tok for doc in sanitized_docs for tok in doc})
with open(f"{out_folder}/vocabulary.txt", "w", encoding="utf-8") as f:
    for word in vocab:
        f.write(f"{word}\n")

print("Vocabulary size:", len(vocab))

Vocabulary size: 130227


In [4]:
#########################################################################################################################################

In [3]:
def evaluate_topic_words(topic_words, texts):

    dictionary = Dictionary(texts)

    results = {}
    
    if texts and dictionary:
        
        coherence_c_v = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence()
        coherence_npmi = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_npmi').get_coherence()
        
        results['coherence_c_v'] = coherence_c_v
        results['coherence_npmi'] = coherence_npmi
    
    octis = {
        'topics': topic_words  
    }
    
    td = TopicDiversity(topk=10)
    irbo = InvertedRBO(topk=10)
    
    results['topic_diversity'] = td.score(octis)
    results['IRBO'] = irbo.score(octis)
    
    return results

In [4]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("./data/banfake")
num_topics = 12

In [7]:
prodlda = ProdLDA(
    num_topics=num_topics,
    use_partitions=False,
    batch_size=8
)

start_time = time.time()
prodlda_output = prodlda.train_model(dataset)
end_time = time.time()

runtime = end_time - start_time
topic_words = prodlda_output['topics']

for idx, topic in enumerate(topic_words):
    print(f"Topic {idx}: {topic}")

MemoryError: Unable to allocate 47.2 GiB for an array with shape (48675, 130227) and data type int64

In [None]:
results = evaluate_topic_words(topic_words, docs)
results['runtime'] = runtime
results

In [None]:
neural_lda = NeuralLDA(
    num_topics=num_topics,
    use_partitions=False,
    batch_size=8
)
start_time = time.time()
nlda_output = neural_lda.train_model(dataset)
end_time = time.time()

runtime = end_time - start_time
topic_words = nlda_output['topics']

for idx, topic in enumerate(topic_words):
    print(f"Topic {idx}: {topic}")

In [None]:
results = evaluate_topic_words(topic_words, docs)
results['runtime'] = runtime
results

In [None]:
duration = 2  
sampling_rate = 44100
frequency = 440.0 
t = np.linspace(0, duration, int(sampling_rate * duration), False)
wave = 0.5 * np.sin(2 * np.pi * frequency * t)

audio = ipd.Audio(wave, rate=sampling_rate, autoplay=True)
display(audio)