In [1]:
import os
import pandas as pd
from bertopic import BERTopic
from datetime import datetime, date
from nltk import FreqDist
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.datasets import fetch_20newsgroups
from stop_words import get_stop_words
import spacy
import torch
import warnings

# Move up to the parent directory (presumably the repository root) so that the
# `src` package imported below and the `./data/...` paths resolve.
# The move is guarded: a bare `os.chdir("../")` climbs one directory further
# every time this cell is re-run, which silently breaks later imports/paths.
if not os.path.isdir("src"):
    os.chdir("../")
# NOTE(review): this silences every warning category, including deprecations.
warnings.filterwarnings("ignore")
# Widen column display so long question texts are not truncated in previews.
pd.options.display.max_colwidth = 200

from src.modeling import _BERTopic
from src.config import (
    umap_data,
    hdbscan_data,
    sent_transformers_data,
    tfidf_data,
    tokenizer_data,
    mmr_data,
    bertopic_data,
)
from src.data_preprocess import Preprocessing
from src.utils import (
    getClusteringModel,
    getDimReductionModel,
    getMaximalMarginalRelevance,
    getTfidfTransformers,
    getTokenizer,
    getEmbeddings,
    getFrequencyDictForText,
    plot_wordcloud,
    global_wordcloud,
    context_stopwords
)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


# Configuration Parameters

In [2]:
# Context-specific stop words, one entry per line of the text file.
# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is pinned to UTF-8 (matching the CSV load further down) because
# the default text encoding is platform-dependent.
with open("./data/test-context-stopwords.txt", encoding="utf-8") as stopwords_file:
    list_context_sw = [line.strip() for line in stopwords_file]

# Language / model settings used throughout the notebook.
language = "french"
spacy_model = "fr_core_news_md"
transformer = "dangvantuan/sentence-camembert-large"
use_preprocessing = False
# Preprocessing receives the spaCy model name, the language, the context
# stop-word list and the `use_preprocessing` flag.
preprocessor = Preprocessing(spacy_model, language, list_context_sw, use_preprocessing)

docs_name = "tests"

In [3]:
# Check whether PyTorch can use CUDA in this environment.
torch.cuda.is_available()

False

In [5]:
# Number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

0

# Metadata Stats

In [None]:
sample_file_path = "./data/sample.csv"


def _weekday_name(day_str):
    """Return the weekday name for a date string formatted as %Y-%m-%d."""
    parsed = datetime.strptime(day_str, "%Y-%m-%d")
    return parsed.strftime("%A")


def _bucket_language(lang_code):
    """Keep 'fr' and 'en' as they are; map any other value to 'other_lang'."""
    if lang_code in ["fr", "en"]:
        return lang_code
    return 'other_lang'


def _space_token_count(text):
    """Count the pieces obtained by splitting the text on single spaces."""
    return len(text.split(" "))


# Load the pipe-separated sample file, decoded as UTF-8.
with open(sample_file_path, 'rb') as csv_handle:
    df_docs = pd.read_csv(csv_handle, sep="|", encoding="utf-8")

# Derived metadata columns used by the statistics cells below.
df_docs["date_day"] = df_docs["date"].apply(_weekday_name)
detected_language = df_docs["question"].apply(preprocessor.getLanguage)
df_docs["language"] = detected_language.apply(_bucket_language)
df_docs["length"] = df_docs["question"].apply(_space_token_count)

In [None]:
# Preview the first rows of the loaded frame.
df_docs.head()

In [None]:
# Count the questions in each language bucket, with `language` as a regular column.
questions_by_language = df_docs[["language", "question"]].groupby("language")
language_counts = questions_by_language.count()
df_lang = language_counts.reset_index()
df_lang

In [None]:
# Pie chart of the per-language question counts held in df_lang.
pie_kwargs = {
    "values": 'question',
    "names": 'language',
    "title": 'Represented languages in docs',
    "width": 500,
}
fig = px.pie(df_lang, **pie_kwargs)
fig.show()

In [None]:
# Histogram of the `length` column, normalised to a probability density.
hist_kwargs = {
    "x": "length",
    "width": 800,
    "height": 500,
    "labels": {"length": "Question Length"},
    "histnorm": 'probability density',
}
fig = px.histogram(df_docs, **hist_kwargs)
fig.show()

# Custom Model Test

In [None]:
# Run the preprocessing pipeline on every question, then keep only the rows
# tagged 'fr' whose cleaned text is non-empty.
cleaned_questions = df_docs["question"].apply(preprocessor.pipeline)
df_docs["clean_question"] = cleaned_questions
df_docs["empty_clean_question"] = cleaned_questions.apply(lambda text: len(text) == 0)

is_french = df_docs["language"] == 'fr'
has_clean_text = df_docs["empty_clean_question"] == False  # noqa: E712 (mirrors the original comparison)
df_docs = df_docs[is_french & has_clean_text].reset_index(drop=True)
df_docs.head()

In [None]:
# Original question texts as a plain list.
raw_docs = df_docs["question"].tolist()

# Cleaned question texts as a plain list. The assignment was previously pasted
# twice on the same line, which is a SyntaxError; a single statement is kept.
docs = df_docs["clean_question"].tolist()

In [None]:
# Join all cleaned documents into one space-separated string for the word cloud.
corpus_text = " ".join(docs)
global_wordcloud(corpus_text, language, list_context_sw)

# Inference 

In [None]:
topic_id_ = 0