In [1]:
import os
import warnings
import pandas as pd

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from stop_words import get_stop_words
import spacy
from nltk import FreqDist

os.chdir("../")
warnings.filterwarnings("ignore")

from src.modeling import BERTopic_
from src.config import (
    umap_data,
    hdbscan_data,
    sent_transformers_data,
    tfidf_data,
    tokenizer_data,
    mmr_data,
    bertopic_data,
)
from src.data_preprocess import Preprocessing
from src.utils import (
    getClusteringModel,
    getDimReductionModel,
    getMaximalMarginalRelevance,
    getTfidfTransformers,
    getTokenizer,
    getEmbeddings,
    getFrequencyDictForText,
    create_wordcloud,
    global_wordcloud,
    context_stopword
)

# Metadata

In [2]:
sample_file_path =  "./data/chatbot_data_file_sample.csv"
with open(sample_file_path, 'rb') as f:
    df_docs = pd.read_csv(f, sep="|", encoding="utf-8")

In [3]:
df_docs.head()

Unnamed: 0,date,question
0,2022-08-12,mon numero de carte club n'est pas le bon
1,2022-12-26,aucune de ces questions.
2,2023-03-27,jeu24 heure du mans
3,2023-01-29,ma carte ne cumul pas mes passage
4,2022-11-12,mot de passe réinitialisé mais toujours pas d'...


In [4]:
text = " ".join(df_docs["question"].tolist())
words = text.split()
fdist1 = FreqDist(words)
print(fdist1.most_common())

[('de', 874), ('je', 740), ('carte', 545), ('pas', 483), ('mon', 385), ('ma', 362), ('la', 309), ('le', 286), ('ne', 260), ('à', 237), ('que', 235), ('total', 228), ('?', 218), ('comment', 218), ('et', 198), ('est', 196), ("j'ai", 181), ('pour', 176), ('une', 172), ('sur', 168), ('a', 156), ('en', 152), ('club', 149), ('un', 149), ('les', 135), ('compte', 134), ('me', 123), ('mais', 107), ('passe', 101), ('pourquoi', 98), ('mes', 95), ('points', 94), ('il', 93), ('mot', 90), ('faire', 88), ('vous', 88), ('des', 81), ('plus', 80), ('au', 78), ('avec', 75), ('ai', 74), ('suis', 73), ('dans', 73), ('du', 71), ('j’ai', 68), ('mail', 67), ('ce', 64), ("n'ai", 61), ('carburant', 60), ('j', 59), ('avoir', 57), ('euros', 56), ('fait', 56), ('peux', 53), ('on', 52), ('reçu', 52), ('aucune', 50), ('cagnotte', 50), ('savoir', 50), ('si', 49), ('autre', 49), ('fonctionne', 49), ('merci', 48), ('station', 48), ('40', 47), ('ces', 45), ('toujours', 45), ('plein', 45), ('rien', 45), ('ou', 44), ('que

In [5]:
import spacy
from spacy_cld import LanguageDetector

nlp = spacy.load('en')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

tweets          = df['tweets']
languages_spacy = []

for e in tweets:
    doc = nlp(e)
    # cheking if the doc._.languages is not empty
    # then appending the first detected language in a list
    if(doc._.languages):
        languages_spacy.append(doc._.languages[0])
    # if it is empty, we append the list by unknown
    else:
        languages_spacy.append('unknown')

ModuleNotFoundError: No module named 'spacy_cld'

# Custom Model Test

In [None]:
list_context_sw = [
    "ca",
    "ok",
    "dj",
    "quil",
    "tjrs",
    "tjr",
    "aussitt",
    "bnjr",
    "ner",
    "jer",
    "nest",
    "déjà",
    "jen",
    "salam",
    "bcp",
    "cordiale",
    "cordialement",
    "quelqu",
    "club",
    "total",
    "energie",
    "énergie",
    "totalenergie",
    "question",
    "jai",
    "aije",
    "narrive",
    "nai",
    "savoir",
    "estce",
    "sontils",
    ",",
    "essqu",
    "cava",
    "mexpliquer",
    "expliquer",
    "devoir",
    "pouvoir",
    "valider",
    "vouloir",
    "offrir",
    "perdre",
    "souhaiter",
    "fonctionner",
    "faire",
    "utiliser",
    "souscrire",
    "voir",
    "venir",
    "reformuler",
    "recevoir"
]

language = "french"
spacy_model = 'fr_core_news_md'
transformer = "dangvantuan/sentence-camembert-large"
preprocessor = Preprocessing(spacy_model, language, list_context_sw)

docs_name = "chatbot-sample"

In [None]:
docs = df_docs["question"].apply(preprocessor.pipeline).tolist()

In [None]:
df_docs["question"].apply(preprocessor.pipeline)

In [None]:
getFrequencyDictForText(" ".join(docs), language, list_context_sw)

In [None]:
global_wordcloud(" ".join(docs), language, list_context_sw)

In [None]:
# transformer_ = "all-MiniLM-L6-v2"
# docs_name = "fetch-sample"
# language = "english"

# docs = fetch_20newsgroups(
#    subset="all",
#    remove=("headers", "footers", "quotes")
#    )["data"]

In [None]:
umap_model = getDimReductionModel(umap_data())
hdbscan_model = getClusteringModel(hdbscan_data())
vectorizer_model = getTokenizer(tokenizer_data(language=language), list_context_sw)
ctfidf_model = getTfidfTransformers(tfidf_data())
mmr_model = getMaximalMarginalRelevance(mmr_data())

In [None]:
bertopic_config = bertopic_data(
    umap_model, hdbscan_model, vectorizer_model, ctfidf_model, mmr_model, nr_topics="auto"
)

In [None]:
bert_topic_inst = BERTopic_(bertopic_config)

In [None]:
bert_topic_inst.fit_or_load(transformer, docs_name, docs)

In [None]:
bert_topic_inst.visual_inference()

In [None]:
create_wordcloud(bert_topic_inst.model, 0)

In [None]:
bert_topic_inst.tabular_inference(docs)[0]