In [1]:
import openml

In [2]:
# Fetch the OpenML dataset listing. The next cells call `.keys()` on the result, so it is
# used as a mapping here.
# NOTE(review): presumably keyed by dataset id; confirm against the installed openml version.
datasets = openml.datasets.list_datasets()

In [3]:
# Inspect the listing entry stored under key 531.
# Reuse the listing already held in `datasets` instead of requesting the complete listing
# a second time. Run this before `datasets` is rebound to the fetched dataset objects.
datasets[531]

In [4]:
# `datasets` is a mapping keyed by dataset id; keep only those ids, as a plain list.
ids = list(datasets)
len(ids)

In [5]:
# Drop ids 4537, 4546 and 4562 because they are not datasets.
# Filtering (rather than list.remove) keeps this cell idempotent: it can be re-run, or run
# against a listing that no longer contains one of these ids, without raising ValueError.
non_dataset_ids = {4537, 4546, 4562}
ids = [dataset_id for dataset_id in ids if dataset_id not in non_dataset_ids]

In [6]:
# Resolve every remaining id into a dataset object. Both download flags are switched off,
# so neither the data files nor the qualities are requested here.
# Note: this rebinds `datasets` from the id-keyed listing to the fetched objects.
datasets = openml.datasets.get_datasets(
    ids,
    download_data=False,
    download_qualities=False,
)
len(datasets)

In [7]:
# Keep only the datasets whose description is truthy (neither None nor an empty string).
datasets = [ds for ds in datasets if ds.description]
len(datasets)

In [8]:
# Discard datasets whose description is shorter than the minimum length (in characters).
MIN_DESCRIPTION_LENGTH = 100
datasets = [ds for ds in datasets if len(ds.description) >= MIN_DESCRIPTION_LENGTH]
len(datasets)

In [9]:
# Display the first dataset left after filtering, as a quick sanity check.
datasets[0]

In [10]:
# Collect the description texts and drop exact duplicates.
# dict.fromkeys keeps the first occurrence of each description in its original position,
# so `data` has the same order on every run. list(set(...)) would order the strings
# arbitrarily per kernel, which changes `data[i]`, the embeddings and the fitted topics.
data = list(dict.fromkeys(dataset.description for dataset in datasets))

len(data)

In [2]:
import matplotlib
import matplotlib.pyplot as plt

# Disabled: switch to the LaTeX "pgf" backend for publication output.
# (Within this disabled block only 'font.family' was left active.)
# matplotlib.use("pgf")
# matplotlib.rcParams.update({
#     # "pgf.texsystem": "pdflatex",
#     'font.family': 'serif',
#     # 'text.usetex': True,
#     # 'pgf.rcfonts': False,
# })

# Histogram of description lengths (in characters) over the deduplicated descriptions.
description_lengths = [len(description) for description in data]

# Small figure (4.65 x 3 inches) with explicit margins so the axis labels are not clipped.
fig, ax = plt.subplots(figsize=(4.65, 3))
ax.hist(description_lengths, bins=100, color='C0')
ax.set_xlabel('Length of description')
ax.set_ylabel('Number of datasets')
fig.subplots_adjust(left=0.15, bottom=0.15, right=0.95, top=0.95)
# Alternative: adjust only the bottom margin.
# fig.subplots_adjust(bottom=0.15)

# Disabled export options:
# plt.show()
# plt.savefig('description_length_histogram.pdf')
#
# import tikzplotlib
# tikzplotlib.save("description_length_histogram.tex")

In [None]:
import torch

# Print torch's thread count, pin it to a single thread, then print it again to confirm.
threads_before = torch.get_num_threads()
print(threads_before)
torch.set_num_threads(1)
threads_after = torch.get_num_threads()
print(threads_after)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the "all-mpnet-base-v2" sentence-transformer and encode every description in `data`.
# Both `sentence_model` and `embeddings` are reused by the BERTopic cell below, which passes
# the model as `embedding_model` and the embeddings to `fit_transform`.
sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = sentence_model.encode(data, show_progress_bar=True)

In [None]:
# CountVectorizer and MaximalMarginalRelevance are referenced only by the disabled
# alternatives kept in the comments below.
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Disabled alternative: a stop-word-removing vectorizer passed to BERTopic.
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
# vectorizer_model = CountVectorizer(stop_words="english")
# model = BERTopic(vectorizer_model=vectorizer_model,
#                  language="english",
#                  calculate_probabilities=True,
#                  verbose=True)

# Active topic representation: KeyBERTInspired (the MMR variant below is disabled).
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# Build the topic model with nr_topics=50 and probability calculation enabled, sharing the
# sentence-transformer that produced `embeddings`.
# NOTE(review): no random seed is set in this cell; if the fit is stochastic, the topic ids
# hard-coded in later cells (e.g. 2, 12) may differ between runs. Confirm.
topic_model = BERTopic(
    verbose=True,
    nr_topics=50,
    calculate_probabilities=True,
    embedding_model=sentence_model,
    # vectorizer_model=vectorizer_model,
    representation_model=representation_model
)
# Fit on the descriptions, supplying the precomputed embeddings as the second argument.
topics, probs = topic_model.fit_transform(data, embeddings)

In [None]:
# from bertopic import BERTopic
# from ctransformers import AutoModelForCausalLM
# from transformers import AutoTokenizer, pipeline
# 
# # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
# mistral_model = AutoModelForCausalLM.from_pretrained(
#     "TheBloke/zephyr-7B-alpha-GGUF",
#     model_file="zephyr-7b-alpha.Q4_K_M.gguf",
#     model_type="mistral",
#     gpu_layers=50,
#     hf=True
# )
# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
# 
# # Pipeline
# generator = pipeline(
#     model=mistral_model, tokenizer=tokenizer,
#     task='text-generation',
#     max_new_tokens=50,
#     repetition_penalty=1.1
# )
# 
# prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
# <|user|>
# I have a topic that contains the following documents:
# [DOCUMENTS]
# 
# The topic is described by the following keywords: '[KEYWORDS]'.
# 
# Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
# <|assistant|>"""
# 
# from bertopic.representation import TextGeneration
# 
# # Text generation with Zephyr
# zephyr = TextGeneration(generator, prompt=prompt)
# representation_model = {"Zephyr": zephyr}
# 
# # Topic Modeling
# model = BERTopic(representation_model=representation_model, verbose=True)
# topics, probs = model.fit_transform(data, embeddings)

In [None]:
# from transformers import pipeline
# from bertopic.representation import TextGeneration
# 
# prompt = "I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?"
# 
# # Create your representation model
# generator = pipeline('text2text-generation', model='google/flan-t5-base')
# representation_model = TextGeneration(generator)
# model = BERTopic(representation_model=representation_model, verbose=True)
# topics, probs = model.fit_transform(data, embeddings)


In [None]:
# Preview the first five documents as "<assigned topic>: <description length>".
# Slicing plus zip stops early instead of raising IndexError when fewer than five
# documents are available.
for topic_id, description in zip(topics[:5], data[:5]):
    print(f"{topic_id}: {len(description)}")

In [None]:
# Display the per-document table BERTopic returns for the descriptions in `data`
# (it includes a "Topic" column, which the last cell filters on).
topic_model.get_document_info(data)

In [None]:
# Display the labels produced by generate_topic_labels() with its default arguments.
topic_model.generate_topic_labels()

In [None]:
# Display the topic overview returned by get_topic_info().
topic_model.get_topic_info()

In [None]:
# Display the fitted model's `topic_sizes_` attribute.
topic_model.topic_sizes_

In [None]:
# Render BERTopic's document visualisation for the descriptions in `data`.
topic_model.visualize_documents(data)

In [None]:
# Render BERTopic's topic heatmap with default arguments.
topic_model.visualize_heatmap()

In [None]:
# Render BERTopic's topic overview plot with default arguments.
topic_model.visualize_topics()

In [None]:
# Render BERTopic's topic hierarchy plot with default arguments.
topic_model.visualize_hierarchy()

In [None]:
# Bar charts for the top 16 topics, showing 10 words per topic.
topic_model.visualize_barchart(
    top_n_topics=16,
    n_words=10,
)

In [1]:
# Topics shown in the exported bar chart, in display order. Each one is relabelled
# "Topic <position>" (1-based), so the list and the labels cannot drift apart.
selected_topics = [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 0, 1]
topic_model.set_topic_labels(
    {
        topic_id: f"Topic {position}"
        for position, topic_id in enumerate(selected_topics, start=1)
    }
)

# Draw only the selected topics with the custom labels, show the figure, then export it.
plotly = topic_model.visualize_barchart(
    topics=selected_topics,
    n_words=10,
    title="",
    custom_labels=True,
    height=270,
)
plotly.show()
plotly.write_image("topics_barchart.pdf")

In [None]:
# Display what get_topic returns for topic 2
# (presumably its top words with scores; confirm against the BERTopic docs).
topic_model.get_topic(2)

In [None]:
# Print the first representative document BERTopic reports for topic 12.
print(topic_model.get_representative_docs(12)[0])

In [None]:
# Show only the documents assigned to topic 12.
# The per-document DataFrame is built once and filtered through a callable mask, instead
# of calling get_document_info(data) twice (once for the frame, once for the mask).
topic_model.get_document_info(data).loc[lambda info: info["Topic"] == 12]