In [1]:
from glob import glob
import json
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer
)
from sklearn.decomposition import (
    NMF,
    LatentDirichletAllocation
)

In [2]:
# Directory that is scanned for the per-paper *.json files.
docsdir = "../arxiv_papers_infos/"

# Upper bound on the vocabulary size kept by the TF-IDF and count vectorizers.
no_features = 100

# Topic-model settings: how many topics to fit, and how many of the
# highest-weighted terms to print for each of them.
no_topics, no_top_words = 10, 10

In [3]:
# glob() yields matches in arbitrary, filesystem-dependent order. Sort them so
# that positional indices into `documents` / `titles` (e.g. `src_idx` further
# down) refer to the same paper on every run and on every machine.
filenames = sorted(glob(f"{docsdir}/*.json"))

In [4]:
# Build two parallel lists, index-aligned with `filenames`:
#   titles[i]    -> the paper's title
#   documents[i] -> "<title> -- <abstract>", the text that is vectorized/embedded
documents = []
titles = []
for filename in filenames:
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding (which is locale-specific, e.g. cp1252 on Windows).
    with open(filename, "r", encoding="utf-8") as f:
        paper = json.load(f)["result"]
    titles.append(paper["title"])
    documents.append(f"{paper['title']} -- {paper['abstract']}")

In [5]:
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,               # drop terms present in more than 95% of documents
    min_df=2,                  # drop terms present in fewer than 2 documents
    max_features=no_features,  # keep only the most frequent terms
    stop_words="english"
)
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement. It returns an array of
# terms whose positions match the columns of `tfidf`.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()



In [6]:
# https://huggingface.co/sentence-transformers
from sentence_transformers import SentenceTransformer

In [7]:
# Instantiate the pretrained sentence-embedding model identified by the
# name "paraphrase-MiniLM-L6-v2"; it is used below to encode `documents`.
transf = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [8]:
# Encode every "title -- abstract" string in `documents` with the model.
# Presumably yields a 2-D array with one row per document, since the result
# is later passed to TSNE.fit_transform — TODO confirm.
trf_emb = transf.encode(documents)

In [9]:
import numpy as np
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

In [10]:
# Feature matrix handed to t-SNE. The active line uses the sentence-transformer
# embeddings; enable the commented line instead to project the dense TF-IDF
# vectors.
# X = tfidf.toarray()
X = trf_emb

In [11]:
def get_tsne(n_components=3, perplexity=3, random_state=None):
    """Build an unfitted t-SNE estimator with this notebook's settings.

    Called without arguments it is configured exactly as before: a
    3-component embedding, perplexity 3, random initialisation, automatic
    learning rate, and no fixed seed.

    Parameters
    ----------
    n_components : int, default 3
        Dimension of the embedded space.
    perplexity : float, default 3
        t-SNE perplexity; must be smaller than the number of samples fitted.
    random_state : int or None, default None
        Seed forwarded to ``TSNE``. Pass an integer to obtain the same
        embedding on every run; ``None`` keeps the result non-deterministic.

    Returns
    -------
    TSNE
        The configured estimator; the caller is responsible for fitting it.
    """
    return TSNE(
        n_components=n_components,
        learning_rate='auto',
        init='random',
        perplexity=perplexity,
        random_state=random_state,
    )

def plot(X):
    """Scatter-plot the first two columns of ``X`` with matplotlib.

    ``X`` must support 2-D slicing (``X[:, k]``). Column 0 is used as the
    horizontal coordinate and column 1 as the vertical one; any further
    columns are ignored. Nothing is returned.
    """
    xs, ys = X[:, 0], X[:, 1]
    plt.scatter(xs, ys)

In [12]:
# Create the (not yet fitted) t-SNE estimator configured by get_tsne().
tsne = get_tsne()

In [13]:
# Fit t-SNE on X and keep the resulting low-dimensional coordinates.
# NOTE(review): get_tsne() uses init='random' and sets no random_state, so
# these coordinates are expected to differ from run to run.
X_tsne = tsne.fit_transform(X)

In [21]:
# plt.scatter(X_tsne[:, 0], X_tsne[:, 1])

In [18]:
# import plotly.plotly as py
# import plotly.graph_objs as pgo
import plotly.express as px
import pandas as pd

In [17]:
# Tabulate the first two t-SNE coordinates next to each paper's title so the
# title can be shown when hovering over a point.
plt_data = pd.DataFrame(
    X_tsne[:, :2],
    columns=["x_tsne", "y_tsne"],
).assign(title=titles)

# Interactive 2-D scatter of the embedding.
fig = px.scatter(plt_data, x="x_tsne", y="y_tsne", hover_name="title")
fig.show()

In [37]:
# Rank all documents by Euclidean distance, measured in t-SNE space, from the
# query document at position `src_idx`; `sim_ids` lists document indices from
# nearest to farthest.
src_idx = 4
dists = np.linalg.norm(X_tsne - X_tsne[src_idx], axis=1)
sim_ids = dists.argsort()

In [38]:
# Print the titles of the (up to) 10 nearest documents; the query document
# itself is expected to come first, at distance 0.
# - Slicing `sim_ids` instead of iterating over range(10) avoids an IndexError
#   when the corpus holds fewer than 10 documents.
# - Reading from `titles` instead of splitting the combined text on "--" keeps
#   titles that themselves contain "--" intact.
for idx in sim_ids[:10]:
    print(titles[idx], end="\n\n")

multi-task deep cnn model for no-reference image quality assessment on smartphone camera photos 

a counter-forensic method for cnn-based camera model identification 

cnn-based fast source device identification 

the forchheim image database for camera identification in the wild 

analysis of adversarial attacks against cnn-based image forgery detectors 

spatiotemporal cnns for pornography detection in videos 

deep learning methods for event verification and image repurposing detection 

deepfirearm: learning discriminative feature representation for fine-grained firearm retrieval 

evading deepfake-image detectors with white- and black-box attacks 

improving the authentication with built-in camera protocol using built-in motion sensors: a deep learning solution 



In [128]:
# LDA is a probabilistic graphical model over word counts, so it must be fed
# raw term counts rather than tf-idf weights.
tf_vectorizer = CountVectorizer(
    max_df=0.95,               # drop terms present in more than 95% of documents
    min_df=2,                  # drop terms present in fewer than 2 documents
    max_features=no_features,  # keep only the most frequent terms
    stop_words="english"
)
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement. It returns an array of
# terms whose positions match the columns of `tf`.
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [28]:
# Run NMF
# The single `alpha` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of `alpha_W` / `alpha_H`. The new penalties are multiplied by
# n_features (for W) and n_samples (for H), whereas the old `alpha` was applied
# unscaled to both factors. Dividing the old strength by the matching dimension
# therefore keeps the objective that `alpha=.1` used to define.
nmf_alpha = .1
n_docs, n_terms = tfidf.shape
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha_W=nmf_alpha / n_terms,
    alpha_H=nmf_alpha / n_docs,
    l1_ratio=.5,
    init='nndsvd'
).fit(tfidf)



In [130]:
# Run LDA
# Hyper-parameters are collected first, then the estimator is built and fitted
# on the raw term-count matrix in a single expression.
lda_params = {
    "n_components": no_topics,
    "max_iter": 5,
    "learning_method": 'online',
    "learning_offset": 50.,
    "random_state": 0,
}
lda = LatentDirichletAllocation(**lda_params).fit(tf)

In [29]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Object exposing ``components_``: an iterable of per-topic weight
        vectors that support ``argsort()``.
    feature_names
        Sequence mapping a column index to its term.
    no_top_words : int
        Number of terms listed per topic, heaviest first.

    For each topic a ``Topic <n>:`` header line is printed, followed by one
    line of space-separated terms. A single blank line is printed after the
    last topic. Returns None.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort() is ascending, so walk the tail backwards to get the
        # `no_top_words` largest weights in descending order.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = (feature_names[term_id] for term_id in ranked)
        print(" ".join(top_terms))
    print()

In [None]:
# Top terms per topic: first for the NMF model (tf-idf vocabulary), then for
# the LDA model (raw-count vocabulary).
display_topics(
    model=nmf,
    feature_names=tfidf_feature_names,
    no_top_words=no_top_words,
)
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)