In [None]:
import os
import pandas as pd
from bertopic import BERTopic
from datetime import datetime, date
from nltk import FreqDist
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.datasets import fetch_20newsgroups
from stop_words import get_stop_words
import spacy
import warnings

# Work from the project root so the `src.*` imports and the `./data/...` paths
# used in this notebook resolve.
# The guard keeps this cell idempotent: an unconditional chdir would climb one
# directory higher every time the cell is re-run in the same kernel.
if not os.path.isdir("src"):
    os.chdir("../")
warnings.filterwarnings("ignore")  # silence all warnings in cell outputs
pd.options.display.max_colwidth = 200  # show up to 200 characters per column when displaying frames

from src.modeling import BERTopic_
from src.config import (
    umap_data,
    hdbscan_data,
    sent_transformers_data,
    tfidf_data,
    tokenizer_data,
    mmr_data,
    bertopic_data,
)
from src.data_preprocess import Preprocessing
from src.utils import (
    getClusteringModel,
    getDimReductionModel,
    getMaximalMarginalRelevance,
    getTfidfTransformers,
    getTokenizer,
    getEmbeddings,
    getFrequencyDictForText,
    plot_wordcloud,
    global_wordcloud,
    context_stopwords
)

# Configuration Parameters

In [None]:
# Context-specific stop words, one entry per line of the file.
# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is pinned so accented tokens are decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("./data/context-sw.txt", encoding="utf-8") as f:
    list_context_sw = [line.strip() for line in f]

language = "french"
spacy_model = 'fr_core_news_md'
# Sentence-embedding model name; passed to `fit_or_load` further down.
transformer = "dangvantuan/sentence-camembert-large"
preprocessor = Preprocessing(spacy_model, language, list_context_sw)

# Corpus identifier; passed to `fit_or_load` together with the documents.
docs_name = "chatbot"

In [None]:
#for token in sorted(list_context_sw):
#     print(token)

# Metadata Stats

In [None]:
sample_file_path = "./data/chatbot_data_file.csv"

# Given a path, pandas opens and closes the file itself; no manual handle needed.
df_docs = pd.read_csv(sample_file_path, sep="|", encoding="utf-8")

# Weekday name for each date, which must be formatted as YYYY-MM-DD.
df_docs["date_day"] = pd.to_datetime(df_docs["date"], format="%Y-%m-%d").dt.strftime("%A")

# Detected language per question; anything other than "fr" / "en" is
# collapsed into a single "other_lang" bucket.
detected_language = df_docs["question"].apply(preprocessor.getLanguage)
df_docs["language"] = detected_language.where(
    detected_language.isin(["fr", "en"]), "other_lang"
)

# Question length in words. Splitting on any run of whitespace (rather than on
# a single " ") avoids counting empty tokens produced by repeated, leading or
# trailing spaces.
df_docs["length"] = df_docs["question"].str.split().str.len()

In [None]:
# Preview the first rows, including the derived date_day / language / length columns.
df_docs.head()

In [None]:
# Count of questions for each detected language, as a two-column frame
# (language, question).
df_lang = (
    df_docs
    .groupby("language")["question"]
    .count()
    .reset_index()
)
df_lang

In [None]:
# Share of each language in the corpus, drawn from the per-language counts.
fig = px.pie(
    data_frame=df_lang,
    names='language',
    values='question',
    title='Represented languages in docs',
    width=500,
)
fig.show()

In [None]:
# Distribution of question lengths, normalised to a probability density.
fig = px.histogram(
    data_frame=df_docs,
    x="length",
    labels={"length": "Question Length"},
    histnorm='probability density',
    width=800,
    height=500,
)
fig.show()

# Custom Model Test

In [None]:
# Keep French questions only. Filtering happens *before* cleaning so the
# preprocessing pipeline is not run on rows that would be discarded anyway.
# `.copy()` makes the subset an independent frame before columns are added.
df_docs = df_docs.query("language == 'fr'").copy()
df_docs["clean_question"] = df_docs["question"].apply(preprocessor.pipeline)

# Flag questions that the cleaning step reduced to nothing, then drop them.
df_docs["empty_clean_question"] = df_docs["clean_question"].apply(len).eq(0)
df_docs = df_docs.query("not empty_clean_question").reset_index(drop=True)
df_docs.head()

In [None]:
# Original (uncleaned) question texts, in the same row order as the cleaned ones.
raw_docs = df_docs["question"].to_list()

In [None]:
# Cleaned question texts; these are the documents passed to the topic model.
docs = df_docs["clean_question"].to_list()

In [None]:
# The helper takes the whole corpus as one space-joined string.
corpus_text = " ".join(docs)
getFrequencyDictForText(corpus_text, language, list_context_sw)

In [None]:
# Word cloud over the whole cleaned corpus, passed as one space-joined string.
wordcloud_text = " ".join(docs)
global_wordcloud(wordcloud_text, language, list_context_sw)

In [None]:
# Each sub-model is built from its own config object defined in src.config.

# Dimensionality reduction
umap_config = umap_data()
umap_model = getDimReductionModel(umap_config)

# Clustering
hdbscan_config = hdbscan_data()
hdbscan_model = getClusteringModel(hdbscan_config)

# Tokenizer / vectorizer; also receives the context stop-word list
tokenizer_config = tokenizer_data(language=language)
vectorizer_model = getTokenizer(tokenizer_config, list_context_sw)

# TF-IDF transformer
tfidf_config = tfidf_data()
ctfidf_model = getTfidfTransformers(tfidf_config)

# Maximal marginal relevance
mmr_config = mmr_data()
mmr_model = getMaximalMarginalRelevance(mmr_config)

In [None]:
# Sub-models are handed to bertopic_data positionally, so the order of this
# tuple must stay as is.
topic_components = (
    umap_model,
    hdbscan_model,
    vectorizer_model,
    ctfidf_model,
    mmr_model,
)
bertopic_config = bertopic_data(*topic_components, nr_topics="auto")

# Project wrapper (src.modeling) constructed from the assembled configuration.
bert_topic_inst = BERTopic_(bertopic_config)

In [None]:
# Called with the embedding model name, the corpus identifier and the cleaned docs.
# NOTE(review): judging by the name, this fits a model or loads a previously
# saved one — confirm the caching behaviour in src.modeling.
bert_topic_inst.fit_or_load(transformer, docs_name, docs)

In [None]:
# NOTE(review): presumably renders an inter-topic visualisation — confirm in src.modeling.
bert_topic_inst.intertopic_()

In [None]:
# Second argument of reduce_topics_, named to avoid a bare magic number.
# NOTE(review): presumably the number of topics to reduce to — confirm in src.modeling.
target_topic_count = 40
bert_topic_inst.reduce_topics_(docs, target_topic_count)

In [None]:
# NOTE(review): presumably renders a topic heatmap — confirm in src.modeling.
bert_topic_inst.heatmap_()

In [None]:
# NOTE(review): presumably renders a per-topic bar chart — confirm in src.modeling.
bert_topic_inst.barchart_()

In [None]:
# Built from the cleaned docs and their raw counterparts; the result is queried
# below on its `topic_id` and `representative_doc` fields.
df_doc_representative = bert_topic_inst.representative_docs(docs, raw_docs)

In [None]:
# Topic to inspect in the following cells; change this value to explore another topic.
topic_id_ = 16

In [None]:
# Rows of the selected topic that are flagged as representative documents.
representative_filter = f"topic_id == {topic_id_} and representative_doc == True"
df_doc_representative.query(representative_filter)

In [None]:
# Called with the cleaned docs, the raw docs and the selected topic id.
# NOTE(review): "topic_infeence" looks like a typo of "topic_inference". The
# method is defined in src.modeling, so any rename must happen there and here together.
bert_topic_inst.topic_infeence(docs, raw_docs, topic_id_)