In [None]:
import os
import warnings
import pandas as pd

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from stop_words import get_stop_words

# Move up to the parent directory so that the `src` package imported below and
# the relative ./data path used later resolve from the working directory.
# Guarded on the presence of `src` so that re-running this cell does not climb
# one more directory each time.
if not os.path.isdir("src"):
    os.chdir("../")
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

from src.modeling import BERTopic_
from src.config import (
    umap_data,
    hdbscan_data,
    sent_transformers_data,
    tfidf_data,
    tokenizer_data,
    mmr_data,
    bertopic_data,
)
from src.utils import (
    getClusteringModel,
    getDimReductionModel,
    getMaximalMarginalRelevance,
    getTfidfTransformers,
    getTokenizer,
    getEmbeddings,
    getFrequencyDictForText,
    create_wordcloud,
    global_wordcloud
)

In [None]:
# getEmbeddings(sent_transformers_data(), "test", "je suis un scientifique des données")

# Custom Model Test

In [None]:
# Run configuration for the chatbot sample.
# Embedding model name and corpus label; both are passed to fit_or_load below.
transformer_ = "dangvantuan/sentence-camembert-base"
docs_name = "chatbot-sample"
# Corpus language, passed to global_wordcloud.
language = "french"
# Extra corpus-specific entries handed to the word cloud and the tokenizer
# (presumably stop-word patterns, going by the name — confirm in src.utils).
# Raw string: keeps the backslash literally without triggering Python's
# invalid-escape-sequence warning; the resulting value is unchanged.
list_context_sw = [r"\?", "question"]

# Pipe-separated sample file; the path is relative to the current working directory.
sample_file_path = "./data/chatbot_data_file_sample.csv"
# pandas opens and closes the file itself, so no manual binary handle is needed.
# Only the "question" column is kept, as a plain Python list.
docs = pd.read_csv(sample_file_path, sep="|")["question"].tolist()

In [None]:
# Alternative English setup (20 Newsgroups): uncomment this cell to override the values above.
# transformer_ = "all-MiniLM-L6-v2"
# docs_name = "fetch-sample"
# language = "english"

# docs = fetch_20newsgroups(
#    subset="all",
#    remove=("headers", "footers", "quotes")
#    )["data"]

In [None]:
# Word cloud over the whole corpus: every document is merged into a single
# space-separated string before being handed to the helper.
corpus_text = " ".join(docs)
global_wordcloud(corpus_text, language, list_context_sw)

In [None]:
# Build each pipeline component from its configuration object in src.config.
umap_model = getDimReductionModel(umap_data())
hdbscan_model = getClusteringModel(hdbscan_data())
# Use the notebook-level `language` instead of a hard-coded "french" so the
# tokenizer follows the corpus selected in the configuration cell.
vectorizer_model = getTokenizer(tokenizer_data(language=language), list_context_sw)
ctfidf_model = getTfidfTransformers(tfidf_data())
mmr_model = getMaximalMarginalRelevance(mmr_data())

In [None]:
# Bundle the five component models into the configuration object that is
# passed to BERTopic_ in the next cell.
bertopic_config = bertopic_data(
    umap_model,
    hdbscan_model,
    vectorizer_model,
    ctfidf_model,
    mmr_model,
)

In [None]:
# Instantiate the project wrapper (src.modeling.BERTopic_) with the assembled configuration.
bert_topic_inst = BERTopic_(bertopic_config)

In [None]:
# Presumably fits the model on `docs`, or reloads a previously saved one for this
# transformer / corpus name, going by the method name — TODO confirm in src.modeling.
bert_topic_inst.fit_or_load(transformer_, docs_name, docs)

In [None]:
# Display the wrapper's visual output for the model prepared above
# (exact figures produced are defined in src.modeling — not visible here).
bert_topic_inst.visual_inference()

In [None]:
# Word cloud built from the underlying model for one selection;
# 0 is presumably a topic id — TODO confirm against create_wordcloud in src.utils.
topic_id = 0
create_wordcloud(bert_topic_inst.model, topic_id)

In [None]:
# Run the tabular inference on the corpus and display only the first element
# of what it returns.
tabular_output = bert_topic_inst.tabular_inference(docs)
tabular_output[0]