In [None]:
import os
import warnings
import pandas as pd

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from stop_words import get_stop_words
import spacy
from nltk import FreqDist

# Step the working directory up one level (presumably to the project root, so
# that the `src` imports below and the relative `./data/...` paths resolve).
# The guard makes this cell idempotent: without it, every re-run in the same
# kernel would climb one more directory and break those imports and paths.
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()

# Silence all warning categories for the rest of the kernel session.
warnings.filterwarnings("ignore")

from src.modeling import BERTopic_
from src.config import (
    umap_data,
    hdbscan_data,
    sent_transformers_data,
    tfidf_data,
    tokenizer_data,
    mmr_data,
    bertopic_data,
)
from src.data_preprocess import Preprocessing
from src.utils import (
    getClusteringModel,
    getDimReductionModel,
    getMaximalMarginalRelevance,
    getTfidfTransformers,
    getTokenizer,
    getEmbeddings,
    getFrequencyDictForText,
    create_wordcloud,
    global_wordcloud,
    context_stopword
)

# Metadata

In [None]:
# Load the pipe-delimited, UTF-8 encoded chatbot sample into a DataFrame.
# The path is relative to the current working directory.
sample_file_path = "./data/chatbot_data_file_sample.csv"
df_docs = pd.read_csv(sample_file_path, sep="|", encoding="utf-8")

In [None]:
# Preview the first five rows of the loaded sample.
df_docs.head(n=5)

# Custom Model Test

In [None]:
# Run settings: corpus language, spaCy pipeline name, sentence-transformer
# checkpoint, and the label under which this corpus is passed to the model.
language = "french"
spacy_model = "fr_core_news_md"
transformer = "dangvantuan/sentence-camembert-large"
docs_name = "chatbot-sample"

# Extra, corpus-specific stop words (chat shorthand, greetings, brand terms,
# generic verbs, ...). The same list is handed to the preprocessor, the
# tokenizer builder and the word-frequency / word-cloud helpers further down.
list_context_sw = [
    "ca", "ok", "dj", "quil", "tjrs", "tjr", "aussitt",
    "bonjour", "bnjr", "bjr", "bsr", "bonsoir",
    "ner", "jer", "nest", "déjà", "jen", "salam", "bcp",
    "cordiale", "cordialement", "quelqu",
    "club", "total", "energie", "énergie", "totalenergie",
    "question", "jai", "aije", "narrive", "nai", "savoir",
    "estce", "sontils", ",", "essqu", "cava", "cest",
    "mexpliquer", "expliquer", "devoir", "pouvoir", "valider",
    "vouloir", "arriver", "offrir", "perdre", "souhaiter",
    "fonctionner", "faire", "utiliser", "souscrire",
    "voir", "venir", "reformuler", "recevoir",
]

# Preprocessor built from the spaCy model name, the language and the extra
# stop words; its `pipeline` method is applied to each question below.
preprocessor = Preprocessing(spacy_model, language, list_context_sw)

In [None]:
# Run every raw question through the preprocessing pipeline, then keep the
# results as a plain Python list (the form passed to the helpers and the
# topic model in later cells).
cleaned_questions = df_docs["question"].apply(preprocessor.pipeline)
docs = cleaned_questions.tolist()

In [None]:
# Display the preprocessed questions against their original row index.
# The texts come from `docs` (built in the previous cell from this same
# column) instead of re-running the preprocessing pipeline over the whole
# column a second time just to display it.
# NOTE(review): assumes `preprocessor.pipeline` is deterministic, so the
# values match what a fresh `.apply` would return — confirm.
pd.Series(docs, index=df_docs.index, name="question")

In [None]:
# Join the whole preprocessed corpus into one space-separated string and
# pass it to the frequency helper together with the language and the extra
# stop words.
corpus_text = " ".join(docs)
getFrequencyDictForText(corpus_text, language, list_context_sw)

In [None]:
# Corpus-wide word cloud: the helper receives all preprocessed documents
# joined into one space-separated string, plus the language and the extra
# stop words.
corpus_text = " ".join(docs)
global_wordcloud(corpus_text, language, list_context_sw)

In [None]:
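# Alternative setup (disabled): English 20 Newsgroups corpus instead of the
# French chatbot sample. Uncomment to override `docs_name`, `language` and `docs`.
# NOTE: the fit cell below passes `transformer`, not `transformer_`; rename
# the first assignment to `transformer` if the model should change as well.
# transformer_ = "all-MiniLM-L6-v2"
# docs_name = "fetch-sample"
# language = "english"

# docs = fetch_20newsgroups(
#    subset="all",
#    remove=("headers", "footers", "quotes")
#    )["data"]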

In [None]:
# Build each component of the topic-model pipeline from its config object
# (config factories come from src.config, builders from src.utils).

# Dimensionality reduction.
umap_cfg = umap_data()
umap_model = getDimReductionModel(umap_cfg)

# Clustering.
hdbscan_cfg = hdbscan_data()
hdbscan_model = getClusteringModel(hdbscan_cfg)

# Tokenizer / vectorizer; also receives the extra stop words.
tokenizer_cfg = tokenizer_data(language=language)
vectorizer_model = getTokenizer(tokenizer_cfg, list_context_sw)

# TF-IDF transformer.
tfidf_cfg = tfidf_data()
ctfidf_model = getTfidfTransformers(tfidf_cfg)

# Maximal marginal relevance.
mmr_cfg = mmr_data()
mmr_model = getMaximalMarginalRelevance(mmr_cfg)

In [None]:
# Bundle the components built above into a single BERTopic configuration.
# `nr_topics=15` is presumably the target number of topics — confirm in
# src.config.bertopic_data.
bertopic_config = bertopic_data(
    umap_model,
    hdbscan_model,
    vectorizer_model,
    ctfidf_model,
    mmr_model,
    nr_topics=15,
)

In [None]:
# Instantiate the project's BERTopic wrapper (src.modeling.BERTopic_) from
# the configuration assembled in the previous cell.
bert_topic_inst = BERTopic_(bertopic_config)

In [None]:
# Fit the topic model on `docs`, or load an existing one, as the method name
# suggests; receives the sentence-transformer name and the corpus label.
# NOTE(review): the criteria for fitting vs. loading live in src.modeling and
# are not visible here — confirm before relying on cached results.
bert_topic_inst.fit_or_load(transformer, docs_name, docs)

In [None]:
# Show the wrapper's visual output for the model prepared above (what is
# rendered is defined by BERTopic_.visual_inference in src.modeling).
bert_topic_inst.visual_inference()

In [None]:
# Word cloud drawn from the underlying model object.
# NOTE(review): the second argument is presumably the topic id (topic 0
# here) — confirm against src.utils.create_wordcloud.
topic_id = 0
create_wordcloud(bert_topic_inst.model, topic_id)

In [None]:
# Run the tabular inference over the corpus and display the first element of
# what it returns.
inference_tables = bert_topic_inst.tabular_inference(docs)
inference_tables[0]