In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# English stop words with duplicates removed; the set round-trip means the
# resulting list has no guaranteed order.
english_stop_words = stopwords.words('english')
stop_words = list(set(english_stop_words))

def preprocess_text(text):
    """Normalise a raw document into a space-joined string of lemmatised tokens.

    Steps: lowercase, replace punctuation/digits and any remaining
    non-alphanumeric characters with spaces, tokenise with NLTK, drop the
    words listed in the module-level ``stop_words``, lemmatise each remaining
    token with WordNet, and re-join the tokens with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document as a single ``str``; an empty string when no
        token survives the filtering.
    """
    text = text.lower()
    # Escape the punctuation so characters that are special inside a character
    # class (']', '^', '-', backslash) are matched literally rather than being
    # interpreted as regex syntax.
    text = re.sub(r'[{}0-9]'.format(re.escape(string.punctuation)), ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', ' ', text)
    tokens = word_tokenize(text)
    # Build the lookup set and the lemmatizer once per call instead of once
    # per token; the set is rebuilt here so later edits to `stop_words` are
    # still honoured.
    stop_word_set = set(stop_words)
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_word_set
    ]
    return ' '.join(lemmas)

In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from nltk.tokenize import word_tokenize
import swifter

# Tunable settings for the document sample.
SAMPLE_SIZE = 1000          # number of documents drawn from the full corpus
RANDOM_STATE = 42           # fixed seed so the same subset is drawn on every run
MIN_CONTENT_LENGTH = 100    # keep cleaned documents strictly longer than this
MAX_CONTENT_LENGTH = 2000   # ...and strictly shorter than this

newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({"content": newsgroups["data"]})

# Seeded sampling keeps the selected documents stable across kernel restarts.
df = df.sample(SAMPLE_SIZE, random_state=RANDOM_STATE)
df["content"] = df["content"].swifter.apply(preprocess_text)
df['content_length'] = df['content'].str.len()

# Drop documents that are too short or too long after cleaning (both bounds exclusive).
length_in_range = (
    (df['content_length'] > MIN_CONTENT_LENGTH)
    & (df['content_length'] < MAX_CONTENT_LENGTH)
)
df = df[length_in_range]

# Re-number the surviving rows 0..n-1 and expose that number as an "id" column.
df = df[["content"]].reset_index(drop=True).reset_index().rename(columns={"index": "id"})
documents = df.content.to_list()

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

In [3]:
from top2vec import Top2Vec
# Fit the topic model on the cleaned documents prepared in the previous cell.
model = Top2Vec(
    documents=documents,
    speed="deep-learn",
    workers=12,
)

2023-03-31 11:13:09,941 - top2vec - INFO - Pre-processing documents for training
2023-03-31 11:13:10,008 - top2vec - INFO - Creating joint document/word embedding
2023-03-31 11:13:17,385 - top2vec - INFO - Creating lower dimension embedding of documents
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2023-03-31 11:13:20,430 - top2vec - INFO - Finding dense areas of documents
2023-03-31 11:13:20,440 - top2vec - INFO - Finding topics


In [4]:
import numpy as np

TOP_WORDS_PER_TOPIC = 10  # how many leading words to keep for each topic

# First element of get_topics() is indexed per topic; keep only the leading words of each.
topics = [topic_words[:TOP_WORDS_PER_TOPIC] for topic_words in model.get_topics()[0]]
print(len(topics))

# One-hot encode each document's assigned topic in a single vectorised step.
# Assumes model.doc_top holds one integer topic index per document, as the
# original per-document indexing implied.
doc_topic_ids = np.asarray(model.doc_top, dtype=int)
documents_topic_distribution_array = np.zeros((doc_topic_ids.size, len(topics)))
documents_topic_distribution_array[np.arange(doc_topic_ids.size), doc_topic_ids] = 1

# Per-document rows, kept under the original name in case a later cell reads it.
documents_distribution = list(documents_topic_distribution_array)

11


In [5]:
# Display the truncated word list (first 10 words) of every topic found above.
topics

[array(['believe', 'mean', 'see', 'government', 'law', 'every', 'state',
        'christian', 'never', 'even'], dtype='<U11'),
 array(['better', 'team', 'game', 'well', 'really', 'least', 'group',
        'course', 'tell', 'still'], dtype='<U11'),
 array(['book', 'software', 'do', 'place', 'price', 'mail', 'edu', 'great',
        'looking', 'information'], dtype='<U11'),
 array(['something', 'take', 'number', 'rather', 'try', 'thing', 'someone',
        'wrong', 'stuff', 'must'], dtype='<U11'),
 array(['software', 'disk', 'using', 'program', 'mail', 'do', 'window',
        'file', 'work', 'looking'], dtype='<U11'),
 array(['god', 'word', 'come', 'found', 'seems', 'far', 'might', 'little',
        'even', 'mean'], dtype='<U11'),
 array(['bike', 'go', 'look', 'etc', 'seems', 'wrong', 'thing', 'give',
        'must', 'something'], dtype='<U11'),
 array(['driver', 'got', 'use', 'card', 'help', 'window', 'file', 'sound',
        'anyone', 'please'], dtype='<U11'),
 array(['year', 'card', 'c