In [1]:
!pip install contextualized_topic_models swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contextualized_topic_models
  Downloading contextualized_topic_models-2.5.0-py2.py3-none-any.whl (36 kB)
Collecting swifter
  Downloading swifter-1.3.4.tar.gz (830 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m830.9/830.9 KB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipython==8.10.0
  Downloading ipython-8.10.0-py3-none-any.whl (784 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m784.3/784.3 KB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
Collecting ipywidgets==7.5.1
  Downloading ipywidgets-7.5.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=2.1.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from nltk.tokenize import word_tokenize
import swifter

# Fetch the NLTK resources this notebook relies on: the stop-word corpus,
# the punkt tokenizer models and the WordNet database.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# English stop words with duplicates removed (order is not meaningful).
stop_words = list(set(stopwords.words('english')))

# Everything that is not a lowercase ASCII letter (digits, punctuation,
# whitespace, non-ASCII symbols) is treated as a token separator. The input is
# lowercased first, so a single pass is enough and no punctuation has to be
# interpolated into a character class.
_NON_LETTERS = re.compile(r'[^a-z]+')

# Snapshot the stop words into a frozenset when this cell runs: membership
# tests become O(1), and the function no longer depends on the global
# `stop_words`, which a later cell rebinds to the NLTK corpus reader.
_STOP_WORDS = frozenset(stopwords.words('english'))

# One shared lemmatizer instead of constructing a new one for every token.
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps: lowercase, replace every run of non-letter characters with a
    space, tokenize with NLTK's ``word_tokenize``, drop English stop words,
    then lemmatize each remaining token with WordNet.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    cleaned = _NON_LETTERS.sub(' ', text.lower())
    # Stop words are filtered before lemmatization, so only surface forms
    # present in the stop-word list are removed.
    kept_tokens = (tok for tok in word_tokenize(cleaned) if tok not in _STOP_WORDS)
    return ' '.join(_LEMMATIZER.lemmatize(tok) for tok in kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Load the full 20 Newsgroups corpus with headers, footers and quoted replies removed.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({"content": newsgroups["data"]})

# Work on a 1000-document subsample. The seed is fixed so that the sample,
# and therefore the topics learned from it, are the same on every run.
df = df.sample(1000, random_state=42)
df["content"] = df["content"].swifter.apply(preprocess_text)
df['content_length'] = df['content'].str.len()

# Keep documents whose cleaned text is strictly between 100 and 2000 characters.
df = df[(df['content_length'] > 100) & (df['content_length'] < 2000)]

# Replace the sampled index with a dense 0..n-1 `id` column and keep only the text.
df = df[["content"]].reset_index(drop=True).reset_index().rename(columns={"index": "id"})
documents = df.content.to_list()

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

In [4]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
from nltk.corpus import stopwords as stop_words

# Sentence-transformer checkpoint used to embed the documents.
tp = TopicModelDataPreparation("all-mpnet-base-v2")
num_topics = 5
num_words = 10   # number of top words to list per topic
num_epochs = 10  # training epochs; kept separate from num_words, which was previously passed as the epoch count

# NOTE: in this cell `stop_words` is the NLTK corpus reader imported above,
# which shadows the `stop_words` list created in the preprocessing cell.
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=list(stop_words.words("english")))
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

# NOTE(review): `documents` was already lowercased, stop-word-filtered and
# lemmatized by preprocess_text, so `unpreprocessed_corpus` is not raw text.
# Consider feeding the original, uncleaned documents to the contextual
# encoder instead - TODO confirm against the library's guidance.
# contextual_size=768 presumably matches the embedding width of
# "all-mpnet-base-v2"; TODO confirm if the checkpoint is changed.
ctm = CombinedTM(bow_size=len(vocab), contextual_size=768, n_components=num_topics, num_epochs=num_epochs)
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)
ctm.fit(training_dataset)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [7680/7720]	Train Loss: 377.96131642659503	Time: 0:00:00.321733: : 10it [00:03,  3.03it/s]
100%|██████████| 13/13 [00:00<00:00, 40.69it/s]


In [5]:
# Top `num_words` words for each learned topic.
ctm.get_topic_lists(num_words)

  and should_run_async(code)


[['one',
  'child',
  'law',
  'truth',
  'man',
  'religious',
  'god',
  'may',
  'christ',
  'think'],
 ['word',
  'people',
  'sabbath',
  'man',
  'christ',
  'ed',
  'catholic',
  'jewish',
  'regard',
  'jew'],
 ['sale',
  'microsoft',
  'thanks',
  'hitting',
  'used',
  'speed',
  'several',
  'nec',
  'manual',
  'disk'],
 ['drive',
  'first',
  'mhz',
  'chip',
  'recently',
  'religious',
  'without',
  'kill',
  'connection',
  'certain'],
 ['mail',
  'contact',
  'dana',
  'program',
  'upgrade',
  'email',
  'missing',
  'went',
  'add',
  'claim']]