In [3]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import swifter
import numpy as np
import gensim



# English stop words from NLTK, de-duplicated through a set and kept as a list.
stop_words = [*set(stopwords.words('english'))]

def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, replace punctuation and digits with spaces,
    replace any remaining character outside ``[A-Za-z0-9 ]`` with a space,
    tokenize with ``word_tokenize``, drop tokens found in the module-level
    ``stop_words``, lemmatize each token with ``WordNetLemmatizer``, and join
    the result with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document as a ``str`` (may be empty if no token survives).
    """
    # Built once per call instead of once per token: set membership is O(1),
    # whereas testing against the stop_words list scans it for every token.
    stop_set = set(stop_words)
    # One lemmatizer for the whole document rather than a new instance per word.
    lemmatizer = WordNetLemmatizer()

    text = text.lower()
    # re.escape keeps characters such as ']', '\\', '^' and '-' literal inside
    # the character class instead of relying on how they happen to parse.
    text = re.sub(r'[{}0-9]'.format(re.escape(string.punctuation)), ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_set]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [4]:
# Read and clean the dataset of Elon Musk tweets

def remove_punct(text):
    """Return ``text`` with ASCII punctuation characters and digit runs deleted.

    Characters listed in ``string.punctuation`` are removed first; every run of
    the digits 0-9 is then removed as well. Nothing is inserted in place of the
    removed characters, so surrounding whitespace is left as it was.
    """
    punctuation_deleter = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(punctuation_deleter)
    return re.sub('[0-9]+', '', without_punctuation)

def count_remover(text,threshold=4):
    """Return ``text`` unchanged, or ``pd.NaT`` when it is too short.

    The length is measured in whitespace-separated words; texts with fewer than
    ``threshold`` words yield ``pd.NaT`` so a later ``dropna`` can discard them.
    """
    word_count = len(text.split())
    return text if word_count >= threshold else pd.NaT

# Load the tweets file (read with lines=True) and keep only the text and timestamp columns.
data=pd.read_json('NTM/tweets.json' ,lines=True)
df=data[["Text","CreatedAt"]].rename(columns={"Text":"content","CreatedAt":"time"})

# Strip @mentions. regex=True must be explicit: since pandas 2.0 str.replace
# treats the pattern as a literal string by default, which would silently
# leave every mention in the text.
df['content'] = df['content'].str.replace(r'@\w+', '', regex=True)
# Strip URLs, then punctuation and digits.
df['content'] = df['content'].apply(lambda x: re.sub(r"http\S+", "", x))
df['content'] = df['content'].apply(remove_punct)
# Tweets that are too short become pd.NaT and are dropped here.
df['content'] = df['content'].apply(count_remover)
df=df.dropna()

# Full text normalization (stop words, lemmatization) on the surviving tweets.
df["content"]=df["content"].swifter.apply(preprocess_text)
df=df.dropna()
documents_EM_tweets=df.content.tolist()

# Train a Word2Vec model on the cleaned tweets and keep its word vectors.
# Training uses 12 worker threads and sets no seed.
tokenized_docs = [word_tokenize(document.lower()) for document in documents_EM_tweets]
model = gensim.models.Word2Vec(tokenized_docs, vector_size=300, window=5, min_count=1, workers=12)
wv_EM_tweets=model.wv

Pandas Apply:   0%|          | 0/14268 [00:00<?, ?it/s]

In [7]:
# Read the 20 Newsgroups dataset


# Fetch every 20 Newsgroups post with headers, footers and quoted replies removed.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({"content": newsgroups["data"]})

# Normalize each post, then record the character length of the cleaned text.
df["content"] = df["content"].swifter.apply(preprocess_text)
df['content_length'] = df['content'].str.len()

# Keep posts whose cleaned length lies strictly between 100 and 2000 characters.
long_enough = df['content_length'] > 100
short_enough = df['content_length'] < 2000
df = df[long_enough & short_enough]

# Keep only the text and expose a fresh 0..n-1 row number as the "id" column.
df = df[["content"]].reset_index(drop=True)
df = df.reset_index().rename(columns={"index": "id"})
documents_20newsgroup = df.content.to_list()

# Train a Word2Vec model on the cleaned posts and keep its word vectors.
tokenized_docs = [word_tokenize(document.lower()) for document in documents_20newsgroup]
model = gensim.models.Word2Vec(tokenized_docs, vector_size=300, window=5, min_count=1, workers=12)
wv_20newsgroup = model.wv

Pandas Apply:   0%|          | 0/18846 [00:00<?, ?it/s]

In [5]:
from sklearn.metrics import pairwise_distances
def word_embedding_coherence(topics,wv,topk=10):
    """Average pairwise cosine similarity of the embeddings of each topic's top words.

    For every topic, the first ``topk`` words that are present in ``wv`` are
    looked up, the cosine similarity between every ordered pair of distinct
    words is summed, and the sum is divided by ``topk * (topk - 1)``. The
    per-topic scores are then averaged over all topics.

    Args:
        topics: Sequence of topics, each a sequence of words ordered by relevance.
        wv: Word-vector lookup supporting ``word in wv`` and ``wv[word]``.
            Vectors may be array-likes or objects exposing ``.numpy()``.
        topk: Number of leading words of each topic to consider.

    Returns:
        The mean topic coherence as a float. A topic with none of its top words
        in ``wv`` contributes ``-1``.

    Raises:
        ZeroDivisionError: If ``topics`` is empty.

    Notes:
        The denominator is always ``topk * (topk - 1)``, even when fewer than
        ``topk`` words of a topic are found in ``wv``; missing words therefore
        lower the score. ``topk`` should be at least 2.
    """
    result = 0.0
    for topic in topics:
        vectors = []
        for word in topic[:topk]:
            if word in wv:
                word_embedding = wv[word]
                try:
                    # Tensor-like embeddings are converted to numpy arrays.
                    word_embedding = word_embedding.numpy()
                except AttributeError:
                    # Already a plain array-like without a .numpy() method.
                    pass
                vectors.append(np.asarray(word_embedding, dtype=np.float64))

        if len(vectors) > 0:
            E = np.array(vectors)
            # Scale rows to unit L2 length. Dividing by the component sum (as
            # was done before) negates any vector whose sum is negative, which
            # flips the sign of its cosine similarities, and explodes when the
            # sum is close to zero.
            norms = np.linalg.norm(E, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # all-zero vectors get similarity 0
            unit = E / norms
            # Cosine similarity between all rows; clip away rounding overshoot.
            similarities = np.clip(unit @ unit.T, -1.0, 1.0)
            # Sum over ordered pairs of distinct words (drop the diagonal).
            pair_sum = similarities.sum() - np.trace(similarities)
            topic_coherence = pair_sum/(topk*(topk-1))
        else:
            topic_coherence = -1
        # Update result with the computed coherence of the topic
        result += topic_coherence
    result = result/len(topics)
    return result

In [8]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import os
import pickle

ntm_results_path = "./NTM/results/"
coherence_result = "coherence_result4.txt"

# For each dataset, score every pickled topic list found in ntm_results_path
# whose underscore-separated file name contains both 'topics' and the dataset
# tag, and append one line per coherence measure to the result file.
with open(coherence_result, "w") as f:
    # Each entry: (preprocessed documents, tag looked for in file names, word vectors).
    for documents, dataset_name, word_vectors in [
        (documents_EM_tweets, "tweets", wv_EM_tweets),
        (documents_20newsgroup, "20newsgroup", wv_20newsgroup),
    ]:
        documents_tokens = [doc.split() for doc in documents]
        dictionary = Dictionary(documents_tokens)
        corpus_bow = [dictionary.doc2bow(doc_tokens) for doc_tokens in documents_tokens]
        # Sorted so the order of lines in the result file is reproducible;
        # os.listdir returns entries in arbitrary order.
        for filename in sorted(os.listdir(ntm_results_path)):
            name = filename.split("_")
            if 'topics' not in name or dataset_name not in name:
                continue
            # Read from the configured directory rather than a second,
            # hard-coded copy of the path.
            # NOTE: pickle.load can execute arbitrary code; only load result
            # files produced by trusted runs.
            with open(os.path.join(ntm_results_path, filename), "rb") as fp:
                topics = pickle.load(fp)
            for measure in ["u_mass","c_v","c_uci","c_npmi"]:
                cm = CoherenceModel(topics=topics, corpus=corpus_bow, dictionary=dictionary, texts=documents_tokens, coherence=measure)
                coherence = cm.get_coherence()
                f.write(f"{filename}, {measure}, {coherence}\n")
            embedding_score = word_embedding_coherence(topics, word_vectors, topk=10)
            f.write(f"{filename}, word_embedding_coherence, {embedding_score}\n")
    f.write("The experiment is successfully done!")