In [26]:
from glob import glob
import json
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer
)
from sklearn.decomposition import (
    NMF,
    LatentDirichletAllocation
)

In [89]:
no_features = 100
no_topics = 5
no_top_words = 10
docsdir = "../arxiv_papers_infos/"

In [90]:
filenames = glob(f"{docsdir}/*.json")

In [91]:
documents = []
for filename in filenames:
    with open(filename, "r") as f:
        data = json.load(f)
    data = f"{data['result']['title']} -- {data['result']['abstract']}"
    documents.append(data)

In [92]:
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words="english"
)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()



In [63]:
import numpy as np
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

In [99]:
X = tfidf.toarray()

In [77]:
def get_tsne():
    tsne = TSNE(
        n_components=2,
        learning_rate='auto',
        init='random',
        perplexity=3
    )
    return tsne

def plot(X):
    plt.scatter(X[:, 0], X[:, 1])

In [100]:
tsne = get_tsne()

In [101]:
X_tsne = tsne.fit_transform(X)

In [111]:
# plt.scatter(X_tsne[:, 0], X_tsne[:, 1])

In [114]:
src_idx = 4
dists = np.linalg.norm(X_tsne[src_idx] - X_tsne, axis=1)
sim_ids = np.argsort(dists)

In [115]:
for idx in range(10):
    print(documents[sim_ids[idx]].split("--")[0], end="\n\n")

multi-task deep cnn model for no-reference image quality assessment on smartphone camera photos 

ams_adrn at semeval-2022 task 5: a suitable image-text multimodal joint modeling method for multi-task misogyny identification 

upb at semeval-2022 task 5: enhancing uniter with image sentiment and graph convolutional networks for multimedia automatic misogyny identification 

generating fact checking explanations 

nlp-cuet@dravidianlangtech-eacl2021: investigating visual and textual features to identify trolls from multimodal social media memes 

identification of social-media platform of videos through the use of shared features 

similarity learning for authorship verification in social media 

question retrieval for community-based question answering via heterogeneous network integration learning 

short text topic modeling: application to tweets about bitcoin 

sexism prediction in spanish and english tweets using monolingual and multilingual bert and ensemble models 



In [93]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words="english"
)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [94]:
# Run NMF
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd'
).fit(tfidf)



In [95]:
# Run LDA
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0
).fit(tf)

In [96]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join(
                [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
            )
        )
    print()

In [98]:
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
learning data detection media neural deep text models based task
Topic 1:
network networks influence diffusion model nodes information dynamics spreading spread
Topic 2:
users user twitter online content media information political platforms tweets
Topic 3:
news fake detection information media spread political content features twitter
Topic 4:
covid 19 spread people public tweets analysis related sentiment data

Topic 0:
data learning media model based neural models network deep text
Topic 1:
news detection graph fake information media based networks network content
Topic 2:
media twitter data analysis network political opinion study information online
Topic 3:
users user content online 19 covid images networks network based
Topic 4:
network networks model information influence diffusion problem nodes spread based

