In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# English stop words from the NLTK corpus, de-duplicated through a set and
# stored as a list; preprocess_text filters tokens against this collection.
stop_words = list(set(stopwords.words('english')))

def preprocess_text(text):
    """Normalize a raw document into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, replace punctuation and digits with
    spaces, replace any remaining character outside ``[A-Za-z0-9 ]`` with a
    space, tokenize with ``word_tokenize``, drop tokens found in the
    module-level ``stop_words``, lemmatize each surviving token with
    ``WordNetLemmatizer`` and join the result with single spaces.

    Args:
        text: The document to clean, as a ``str``.

    Returns:
        The cleaned document as a single ``str``; an empty string when no
        token survives the filtering.
    """
    text = text.lower()
    # re.escape keeps characters such as "\", "]", "^" and "-" literal inside
    # the character class instead of relying on how they happen to be parsed.
    text = re.sub(r'[{}0-9]'.format(re.escape(string.punctuation)), ' ', text)
    # Anything still outside ASCII letters, digits and spaces (newlines, tabs,
    # non-ASCII characters) is collapsed into a single space per run.
    text = re.sub(r'[^A-Za-z0-9 ]+', ' ', text)
    tokens = word_tokenize(text)

    # Build the lookup set and the lemmatizer once per call rather than
    # scanning a list / constructing a new lemmatizer for every token.
    stop_lookup = set(stop_words)
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_lookup
    )

In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import swifter

# Load every split of 20 Newsgroups with headers, footers and quotes removed.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({"content": newsgroups["data"]})

# Clean each post with preprocess_text.
df["content"] = df["content"].swifter.apply(preprocess_text)

# Keep only posts whose cleaned text is strictly between 100 and 2000 characters.
content_length = df["content"].str.len()
is_medium_length = (content_length > 100) & (content_length < 2000)

# Re-number the surviving rows from 0 and expose that numbering as an "id" column.
df = (
    df.loc[is_medium_length, ["content"]]
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index": "id"})
)
documents = df["content"].to_list()


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

In [3]:
# Number of topics requested from the ETM model built later in the notebook.
num_topics=50

In [4]:
from embedded_topic_model.utils import preprocessing,embedding

# Build the vocabulary and the training split for ETM from the cleaned documents.
# The third value returned by create_etm_datasets is not used here and is discarded.
vocabulary, train_dataset, _ = preprocessing.create_etm_datasets(
    documents,
    train_size=0.85,
    min_df=0.01,
    max_df=0.75,
)

# Word2vec vectors created from the same documents; handed to ETM as `embeddings`.
wv = embedding.create_word2vec_embedding_from_dataset(documents)

In [5]:
from embedded_topic_model.models.etm import ETM

# Configure the ETM model on top of the vocabulary and embeddings built above.
etm_instance = ETM(
    vocabulary,
    num_topics=num_topics,
    epochs=300,
    # Either a path to a word2vec file or a KeyedVectors instance may be given here.
    embeddings=wv,
    # Optional; False by default. When True, ETM learns word embeddings jointly
    # with topic embeddings, so it must not be True while `embeddings` is passed.
    train_embeddings=False,
    debug_mode=True,
)

# Saving is currently disabled: etm_instance._save_model("./results/etm/200K")
etm_instance.fit(train_dataset)

Topics before training: [['since', 'good', 'force', 'ii', 'run', 'standard', 'everything', 'product', 'want', 'drop'], ['manual', 'much', 'could', 'memory', 'couple', 'keep', 'btw', 'designed', 'force', 'love']]
Epoch 1 - Learning Rate: 0.005 - KL theta: 0.0 - Rec loss: 268.74 - NELBO: 268.74
Epoch 2 - Learning Rate: 0.005 - KL theta: 0.06 - Rec loss: 267.55 - NELBO: 267.61
Epoch 3 - Learning Rate: 0.005 - KL theta: 0.01 - Rec loss: 266.56 - NELBO: 266.57
Epoch 4 - Learning Rate: 0.005 - KL theta: 0.0 - Rec loss: 265.55 - NELBO: 265.55
Epoch 5 - Learning Rate: 0.005 - KL theta: 0.0 - Rec loss: 264.64 - NELBO: 264.64
Epoch 6 - Learning Rate: 0.005 - KL theta: 0.0 - Rec loss: 263.76 - NELBO: 263.76
Epoch 7 - Learning Rate: 0.005 - KL theta: 0.01 - Rec loss: 263.01 - NELBO: 263.02
Epoch 8 - Learning Rate: 0.005 - KL theta: 0.01 - Rec loss: 262.23 - NELBO: 262.24
Epoch 9 - Learning Rate: 0.005 - KL theta: 0.02 - Rec loss: 261.55 - NELBO: 261.57
Epoch 10 - Learning Rate: 0.005 - KL theta: 0

<embedded_topic_model.models.etm.ETM at 0x13fa4beb0>

In [6]:
# Display the topics of the fitted model; each topic is shown as a list of words.
etm_instance.get_topics()

[['one',
  'would',
  'people',
  'right',
  'like',
  'time',
  'think',
  'know',
  'year',
  'even'],
 ['work',
  'get',
  'problem',
  'use',
  'system',
  'would',
  'file',
  'like',
  'one',
  'please']]