In [1]:
import os
import re
import string

import numpy as np
import pandas as pd
import swifter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from top2vec import Top2Vec

In [2]:

stop_words = list(set(stopwords.words('english')))
# Frozen snapshot of `stop_words` for constant-time membership tests.
# It is taken once here: changes made to `stop_words` afterwards are not
# picked up until this cell is re-run.
_STOP_WORD_SET = frozenset(stop_words)

# string.punctuation contains characters that are special inside a regex
# character class (']', '\\', '^', '-'); re.escape keeps all of them literal.
_PUNCT_OR_DIGIT_RE = re.compile('[{}0-9]'.format(re.escape(string.punctuation)))
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9 ]+')

def preprocess_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. replace every ASCII punctuation character and digit with a space;
      3. replace any remaining run of characters outside ``[A-Za-z0-9 ]``
         (newlines, tabs, non-ASCII symbols, ...) with a single space;
      4. tokenize with NLTK's ``word_tokenize``;
      5. drop tokens found in the English stop-word list;
      6. lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The document to clean; must be a ``str``.

    Returns:
        The surviving lemmas joined by single spaces. The result is an empty
        string when no token survives.
    """
    text = text.lower()
    text = _PUNCT_OR_DIGIT_RE.sub(' ', text)
    text = _NON_ALNUM_RE.sub(' ', text)
    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    tokens = [word for word in word_tokenize(text) if word not in _STOP_WORD_SET]
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

In [2]:
# Download every 20 Newsgroups post with headers, footers and quoted replies
# stripped, then clean each post with preprocess_text.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({"content": newsgroups["data"]})
df["content"] = df["content"].swifter.apply(preprocess_text)

# Keep only posts whose cleaned text is strictly longer than 100 and strictly
# shorter than 2000 characters.
lengths = df["content"].str.len()
keep = (lengths > 100) & (lengths < 2000)

# Renumber the surviving rows from 0 and expose that position as an "id" column.
df = (
    df.loc[keep, ["content"]]
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index":"id"})
)
documents = df["content"].tolist()

Pandas Apply:   0%|          | 0/18846 [00:00<?, ?it/s]

In [6]:
# Train Top2Vec on the cleaned 20 Newsgroups documents.
model = Top2Vec(documents=documents,speed="learn", workers=12)

# Persist the model under results/, the same location it is reloaded from
# further down with Top2Vec.load("results/top2vec_20newgroup"). The folder is
# created first so the save cannot fail after the training has finished.
os.makedirs("results", exist_ok=True)
model.save("results/top2vec_20newgroup")

2023-04-12 15:49:15,887 - top2vec - INFO - Pre-processing documents for training
2023-04-12 15:49:17,520 - top2vec - INFO - Creating joint document/word embedding
2023-04-12 15:49:56,684 - top2vec - INFO - Creating lower dimension embedding of documents
2023-04-12 15:49:59,511 - top2vec - INFO - Finding dense areas of documents
2023-04-12 15:49:59,705 - top2vec - INFO - Finding topics


In [6]:
def remove_punct(text):
    """Return `text` without ASCII punctuation characters and ASCII digits.

    Characters listed in ``string.punctuation`` are dropped first, then every
    run of the digits 0-9 is deleted. Nothing is inserted in their place, so
    surrounding whitespace is left exactly as it was.
    """
    kept_chars = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    without_punct = "".join(kept_chars)
    return re.sub('[0-9]+', '', without_punct)

def count_remover(text,threshold=4):
    """Return `text` unchanged, or ``pd.NaT`` when it has too few words.

    Words are the whitespace-separated pieces produced by ``text.split()``.
    A text with fewer than `threshold` words is replaced by ``pd.NaT`` so that
    a later ``dropna()`` can discard the row.
    """
    too_short = len(text.split()) < threshold
    return pd.NaT if too_short else text

# Load the tweets (one JSON object per line) and keep only the text and the
# creation timestamp, renamed to the column names used in this notebook.
data=pd.read_json('tweets.json' ,lines=True)
df=data[["Text","CreatedAt"]].rename(columns={"Text":"content","CreatedAt":"time"})

# Strip @mentions and URLs. regex=True is passed explicitly: pandas >= 2.0
# treats the pattern as a literal string by default, which would leave every
# mention in place.
df['content'] = df['content'].str.replace(r'@\w+', '', regex=True)
df['content'] = df['content'].str.replace(r"http\S+", "", regex=True)

# Remove punctuation and digits, then mark tweets with fewer than 4 words
# (count_remover's default threshold) as missing and drop them.
df['content'] = df['content'].apply(remove_punct)
df['content'] = df['content'].apply(count_remover)
df=df.dropna()

df["content"]=df["content"].swifter.apply(preprocess_text)
# NOTE(review): preprocess_text returns a str, so this dropna only removes rows
# with other missing values; tweets that became an empty string are kept.
# Confirm whether empty documents should be filtered out before training.
df=df.dropna()
documents=df.content.tolist()

# Train on the cleaned tweets; reuse `documents` instead of rebuilding the list.
model = Top2Vec(documents=documents,speed="deep-learn", workers=12)
#model.save("top2vec_EM_tweets")

Pandas Apply:   0%|          | 0/14268 [00:00<?, ?it/s]

2023-04-26 10:55:17,622 - top2vec - INFO - Pre-processing documents for training
2023-04-26 10:55:17,840 - top2vec - INFO - Creating joint document/word embedding
2023-04-26 10:57:29,451 - top2vec - INFO - Creating lower dimension embedding of documents
2023-04-26 10:57:32,118 - top2vec - INFO - Finding dense areas of documents
2023-04-26 10:57:32,413 - top2vec - INFO - Finding topics


In [7]:
# Keep the first 10 words of every topic returned by the model.
topics = [topic_words[:10] for topic_words in model.get_topics()[0]]
print(len(topics))

# Build one row per document: zeros everywhere except a 1 at the position
# given by that document's entry in model.doc_top.
n_topics = len(topics)
documents_distribution = []
for assigned_topic in model.doc_top:
    one_hot = np.zeros(n_topics)
    one_hot[assigned_topic] = 1
    documents_distribution.append(one_hot)

# Stack the rows into a single (documents x topics) array.
documents_topic_distribution_array = np.vstack(documents_distribution)

164


In [9]:
import pickle

# Serialize the current `topics` list to disk with pickle.
with open("results/top2vec_EM_tweets_topics", "wb") as topics_file:
    pickle.dump(topics, topics_file)

In [4]:
# Reload a previously saved Top2Vec model from the results/ folder. This
# rebinds `model`, so every cell below works on the loaded model rather than
# on whichever model was trained earlier in the session.
model = Top2Vec.load("results/top2vec_20newgroup")

In [5]:
# Keep the first 10 words of every topic returned by the model.
topics = [topic_words[:10] for topic_words in model.get_topics()[0]]
print(len(topics))

# Build one row per document: zeros everywhere except a 1 at the position
# given by that document's entry in model.doc_top.
n_topics = len(topics)
documents_distribution = []
for assigned_topic in model.doc_top:
    one_hot = np.zeros(n_topics)
    one_hot[assigned_topic] = 1
    documents_distribution.append(one_hot)

# Stack the rows into a single (documents x topics) array, then display the
# topic word lists as the cell output.
documents_topic_distribution_array = np.vstack(documents_distribution)
topics

85


[array(['bike', 'car', 'tire', 'rear', 'engine', 'riding', 'brake', 'ride',
        'honda', 'mile'], dtype='<U14'),
 array(['christ', 'god', 'jesus', 'bible', 'christian', 'church', 'lord',
        'heaven', 'scripture', 'faith'], dtype='<U14'),
 array(['god', 'atheist', 'belief', 'religion', 'christian',
        'christianity', 'bible', 'truth', 'moral', 'faith'], dtype='<U14'),
 array(['patient', 'infection', 'doctor', 'treatment', 'disease', 'diet',
        'medical', 'medicine', 'yeast', 'symptom'], dtype='<U14'),
 array(['pitching', 'hitter', 'pitcher', 'inning', 'hit', 'batting',
        'team', 'season', 'dodger', 'game'], dtype='<U14'),
 array(['encryption', 'clipper', 'secure', 'nsa', 'encrypted', 'wiretap',
        'escrow', 'privacy', 'cryptography', 'crypto'], dtype='<U14'),
 array(['fbi', 'compound', 'koresh', 'bd', 'batf', 'waco', 'atf',
        'davidians', 'grenade', 'assault'], dtype='<U14'),
 array(['orbit', 'space', 'launch', 'shuttle', 'moon', 'nasa', 'mission',
  

In [9]:
import pickle

# At this point in the notebook `topics` and
# `documents_topic_distribution_array` were computed from the model loaded
# from "results/top2vec_20newgroup". They are therefore stored under that
# dataset's name, so a top-to-bottom run does not overwrite the tweet topics
# file with 20 Newsgroups results.
# NOTE(review): confirm that nothing downstream expects these results under
# the "top2vec_EM_tweets_*" file names.
with open("results/top2vec_20newgroup_topics", "wb") as fp:
    pickle.dump(topics, fp)
with open("results/top2vec_20newgroup_top_doc_dist", "wb") as fp:
    pickle.dump(documents_topic_distribution_array, fp)