In [None]:
import os
import warnings
import pandas as pd
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from stop_words import get_stop_words
import spacy
from nltk import FreqDist

# The project-local imports below (`src.*`) and the `./data/...` paths used later
# are resolved from the project root, one level above this notebook.
# Only move up when `src` is not already visible from the working directory, so
# re-running this cell (or starting the kernel at the project root) does not
# climb one directory too far.
# NOTE(review): assumes the notebook's own folder has no `src` entry — confirm.
if not os.path.isdir("src"):
    os.chdir("../")
# Silence all warnings for the rest of the session to keep cell outputs readable.
warnings.filterwarnings("ignore")
# Let pandas display up to 200 characters per cell before truncating.
pd.options.display.max_colwidth = 200

from src.modeling import BERTopic_
from src.config import (
    umap_data,
    hdbscan_data,
    sent_transformers_data,
    tfidf_data,
    tokenizer_data,
    mmr_data,
    bertopic_data,
)
from src.data_preprocess import Preprocessing
from src.utils import (
    getClusteringModel,
    getDimReductionModel,
    getMaximalMarginalRelevance,
    getTfidfTransformers,
    getTokenizer,
    getEmbeddings,
    getFrequencyDictForText,
    create_wordcloud,
    global_wordcloud,
    context_stopword
)

In [None]:
# !python3 -m spacy download fr_core_news_md
# !python3 -m spacy download en_core_web_sm

In [None]:
# import nltk
# nltk.download('punkt')

# Configuration Parameters

In [None]:
# Load the context-specific stop words, one entry per line (surrounding
# whitespace stripped). The encoding is pinned to UTF-8 — the same encoding used
# to read the CSV sample in this notebook — so non-ASCII entries do not depend
# on the platform's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("./data/context-sw.txt", encoding="utf-8") as f:
    # The `with` block closes the file on exit; no explicit close() is needed.
    list_context_sw = [line.strip() for line in f]

# Language and model identifiers used by the preprocessing and modeling cells.
language = "french"
spacy_model = 'fr_core_news_md'
transformer = "dangvantuan/sentence-camembert-large"
preprocessor = Preprocessing(spacy_model, language, list_context_sw)

# Label handed to `fit_or_load` further down, together with the documents.
docs_name = "chatbot-sample"

# Metadata Stats

In [None]:
# Read the pipe-separated, UTF-8 encoded sample file straight from its path.
sample_file_path = "./data/chatbot_data_file_sample.csv"
df_docs = pd.read_csv(sample_file_path, sep="|", encoding="utf-8")

# Preview the first rows.
df_docs.head()

In [None]:
# Apply the preprocessor's language detection to every question and store the
# result in a new "language" column.
question_languages = df_docs["question"].apply(preprocessor.getLanguage)
df_docs["language"] = question_languages

# df_docs["max_length"] = df_docs["question"].apply(lambda x: max([len(token) for token in x.split(" ")]))

In [None]:
# Print each distinct value of the "language" column once.
detected_languages = df_docs["language"].unique().tolist()
print(detected_languages)

In [None]:
# Share of rows per detected language, scaled from a fraction to a percentage.
df_docs["language"].value_counts(normalize=True).mul(100)

In [None]:
# Preview the rows whose detected language is "fr"; df_docs is not modified here.
is_french = df_docs["language"] == "fr"
df_docs.loc[is_french]

In [None]:
# Keep only the rows detected as French and renumber the index from 0.
french_mask = df_docs["language"].eq('fr')
df_docs = df_docs.loc[french_mask].reset_index(drop=True)

# Custom Model Test

In [None]:
# Run the preprocessing pipeline on every question and collect the results
# into a plain Python list.
cleaned_questions = df_docs["question"].apply(preprocessor.pipeline)
docs = cleaned_questions.tolist()

In [None]:
# Display the preprocessed questions against their row index.
# `docs` already holds the pipeline output for this same column (built in the
# previous cell), so it is reused here instead of running the preprocessing
# pipeline over every question a second time.
pd.Series(docs, index=df_docs.index, name="question")

In [None]:
# Join all preprocessed documents into one space-separated string and pass it,
# with the language and context stop words, to the frequency helper.
corpus_text = " ".join(docs)
getFrequencyDictForText(corpus_text, language, list_context_sw)

In [None]:
# Join all preprocessed documents into one space-separated string and pass it,
# with the language and context stop words, to the corpus-level word cloud helper.
full_corpus = " ".join(docs)
global_wordcloud(full_corpus, language, list_context_sw)

In [None]:
# Build each pipeline component from its configuration object; the settings come
# from src.config and the factories from src.utils.

# Dimensionality reduction
umap_params = umap_data()
umap_model = getDimReductionModel(umap_params)

# Clustering
hdbscan_params = hdbscan_data()
hdbscan_model = getClusteringModel(hdbscan_params)

# Tokenizer / vectorizer, given the corpus language and the context stop words
tokenizer_params = tokenizer_data(language=language)
vectorizer_model = getTokenizer(tokenizer_params, list_context_sw)

# TF-IDF transformer
ctfidf_params = tfidf_data()
ctfidf_model = getTfidfTransformers(ctfidf_params)

# Maximal marginal relevance
mmr_params = mmr_data()
mmr_model = getMaximalMarginalRelevance(mmr_params)

In [None]:
# Bundle the five components in the positional order expected by bertopic_data,
# then build the configuration with nr_topics="auto".
topic_components = (
    umap_model,
    hdbscan_model,
    vectorizer_model,
    ctfidf_model,
    mmr_model,
)
bertopic_config = bertopic_data(*topic_components, nr_topics="auto")

# Wrap the configuration in the project's BERTopic_ helper.
bert_topic_inst = BERTopic_(bertopic_config)

In [None]:
# Hand the embedding model name, the dataset label and the preprocessed documents
# to the wrapper.
# NOTE(review): presumably this fits a new model or reloads one saved under
# `docs_name` — confirm in src.modeling.
bert_topic_inst.fit_or_load(transformer, docs_name, docs)

In [None]:
# Display whatever the wrapper's visual inference returns for the model above.
# NOTE(review): presumably a topic visualisation — confirm in src.modeling.
bert_topic_inst.visual_inference()

In [None]:
# Word cloud for a single entry of the underlying model.
# NOTE(review): the second argument is presumably a topic id — confirm in src.utils.
topic_id = 0
create_wordcloud(bert_topic_inst.model, topic_id)

In [None]:
# tabular_inference returns an indexable result; take its second element and
# show only the document text, topic and topic-name columns.
inference_tables = bert_topic_inst.tabular_inference(docs)
document_table = inference_tables[1]
document_table[["Document", "Topic", "Name"]]