In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install scikit-learn --upgrade



In [3]:
import sklearn
# Display the scikit-learn version the kernel actually imported.
sklearn.__version__

'1.4.1.post1'

# Loading and cleaning the graph

In [4]:
import networkx as nx
import json

In [6]:
# Load the data that later cells use as `graph_dict` (they iterate it with .items()
# and pass it to nx.Graph).
# NOTE(review): json.load() parses exactly one JSON document. The "Extra data: line 2"
# error recorded for this cell suggests this file holds one JSON object per line —
# confirm which file actually contains the graph mapping before changing the loader.
with open("./arxiv-metadata-oai-snapshot.json") as file:
    graph_dict = json.load(file)

JSONDecodeError: Extra data: line 2 column 1 (char 1689)

In [None]:
# Read the paper records from the arxiv-pda input dataset.
papers_path = "/kaggle/input/arxiv-pda/papers.json"
with open(papers_path) as papers_file:
    papers = json.load(papers_file)

In [None]:
import pandas as pd

# Transpose so that the top-level keys of `papers` become the row index.
# presumably each key is a paper id mapping to that paper's fields — TODO confirm
df_papers = pd.DataFrame(papers).T
df_papers.head()

In [None]:
# Read the stored collection of "original" paper ids from the arxiv-pda input dataset.
original_ids_path = "/kaggle/input/arxiv-pda/unique_original_ids.json"
with open(original_ids_path) as original_ids_file:
    unique_original_ids = json.load(original_ids_file)

In [None]:
# Keys of the graph mapping that have no entry in `papers`.
missing_ids = {node_id for node_id in graph_dict if node_id not in papers}
len(missing_ids)

In [None]:
# Report every occurrence of a missing id: "k" when it is a key of the graph
# mapping, "r" when it appears inside one of the value collections.
for node_id, referenced in graph_dict.items():
    if node_id in missing_ids:
        print("k", node_id)
    for ref_id in referenced:
        if ref_id not in missing_ids:
            continue
        print("r", ref_id)

In [None]:
# Drop every graph entry whose key has no paper record.
# pop() with a default keeps this cell safe to re-run (the keys are already gone
# the second time), and the loop variable no longer shadows the builtin `id`.
for missing_id in missing_ids:
    graph_dict.pop(missing_id, None)

In [None]:
# Build an undirected networkx graph, passing graph_dict as the adjacency data.
graph = nx.Graph(graph_dict)

# Minimal graph analysis

In [None]:
# (number of nodes, number of connected components)
graph.number_of_nodes(), nx.number_connected_components(graph)

In [None]:
# Size of the largest connected component.
# nx.connected_components() yields components in no particular size order, so
# taking the first one with next() does not reliably give the biggest; select it
# explicitly by length instead.
len(max(nx.connected_components(graph), key=len))

In [None]:
# (number of nodes outside the largest component, number of keys in graph_dict).
# The sizes are sorted in descending order first because connected_components()
# does not promise that the largest component comes first.
component_sizes = sorted((len(component) for component in nx.connected_components(graph)), reverse=True)
sum(component_sizes[1:]), len(graph_dict)

In [None]:
# Order the connected components from largest to smallest, then take the subgraph
# induced by the largest one.
componenets = list(nx.connected_components(graph))
componenets.sort(key=len, reverse=True)
big_component = graph.subgraph(componenets[0])
len(big_component)

In [None]:
# Edge density of the largest component.
nx.density(big_component)  # sparse graph

# Clustering and plotting

In [None]:
# Fix the node order once so it can be passed as the matrix ordering and reused later.
nodes = list(big_component)
# Laplacian matrix of the largest component, laid out according to `nodes`.
L = nx.laplacian_matrix(big_component, nodelist=nodes)
L.shape

In [None]:
# Convert to a set; a later cell tests membership in it for every entry of `nodes`.
unique_original_ids = set(unique_original_ids)

In [None]:
# Disabled alternative selection: the "paperId" values of rows whose "year" is 2023.
#unique_original_ids = set(df_papers[df_papers["year"] == 2023]["paperId"].tolist())
len(unique_original_ids)

In [None]:
# Positions within `nodes` whose node is one of the original paper ids.
indices = [
    position
    for position, node_id in enumerate(nodes)
    if node_id in unique_original_ids
]
len(indices)

In [None]:
from scipy.sparse.linalg import eigsh

# Spectral embedding of the largest component.
# The eigenvectors used for spectral clustering are the ones belonging to the
# *smallest* eigenvalues of the graph Laplacian. eigsh() defaults to which="LM"
# (largest magnitude), so ask for the smallest algebraic eigenvalues explicitly.
# If ARPACK converges too slowly here, shift-invert mode (the `sigma` argument of
# eigsh) is the usual alternative for eigenvalues near zero.
eigenvals, eigenvecs = eigsh(L.astype(float), k=30, which="SA")
eigenvecs = eigenvecs.real
eigenvecs.shape

In [None]:
from sklearn.cluster import KMeans, HDBSCAN

# Cluster only the embedding rows selected by `indices` into 8 groups (seeded for repeatability).
original_embedding = eigenvecs[indices, :]
kmeans = KMeans(n_clusters=8, random_state=42, n_init="auto")
kmeans = kmeans.fit(original_embedding)
#hdbscan = HDBSCAN(n_jobs=-1).fit(eigenvecs[indices, :])

In [None]:
import numpy as np
# Distinct cluster labels together with how many rows were assigned to each.
np.unique(kmeans.labels_, return_counts=True)

In [None]:
import umap

# UMAP projection of the same embedding rows that were clustered; the result is
# transposed so that transformed[0] and transformed[1] are the two plot axes.
# random_state is fixed (same seed as the KMeans cell) so that the layout is
# reproducible from one run to the next.
transformed = umap.UMAP(
    verbose=True,
    random_state=42,
    #metric="cosine"
).fit_transform(eigenvecs[indices, :]).T

In [None]:
import matplotlib.pyplot as plt

# Scatter of the UMAP projection, coloured by KMeans cluster label.
fig, ax = plt.subplots()
ax.scatter(transformed[0], transformed[1], c=kmeans.labels_, s=0.1)
plt.show()

In [None]:
import seaborn as sns

# Same projection drawn with seaborn, coloured by KMeans cluster label.
# The labels are integers, which seaborn treats as a numeric hue (sequential
# colours, abbreviated legend). A qualitative palette and a full legend make
# every cluster individually distinguishable.
sns.scatterplot(
    x=transformed[0],
    y=transformed[1],
    hue=kmeans.labels_,
    palette="tab10",
    legend="full",
)
plt.show()

## Cluster content analysis

In [None]:
# Reorder the paper table to follow `nodes`, then keep only the rows at `indices`
# (the same positions used to select embedding rows for KMeans and UMAP).
# The copy lets later cells add columns without touching df_papers.
df_orig = df_papers.loc[nodes].iloc[indices].copy()
df_orig

In [None]:
# Attach the KMeans label to each paper; rows and labels line up because both
# were selected with `indices` in the same order.
df_orig["cluster"] = kmeans.labels_
df_orig

In [None]:
# Number of papers per cluster.
df_orig["cluster"].value_counts()

In [None]:
# Titles and abstracts of the papers at positions 10-14 within cluster 0,
# accessed through `.style` for display.
df_orig[df_orig["cluster"] == 0][["title", "abstract"]].reset_index(drop=True).iloc[10:15].style

In [None]:
# https://stackoverflow.com/questions/73849624/getting-error-while-submitting-notebook-on-kaggle-even-after-importing-nltk-libr

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import subprocess

# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find() signals a missing resource with LookupError; catching only
    # that keeps unrelated failures (e.g. KeyboardInterrupt) from being swallowed.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    # Register the download location once, even if this cell is run repeatedly.
    if '/kaggle/working/' not in nltk.data.path:
        nltk.data.path.append('/kaggle/working/')

# Resources needed by word_tokenize, WordNetLemmatizer and the stop-word list.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))# + ['ha', 'wa', 'say', 'said'])
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lower-cased and tokenized, tokens that are not purely alphabetic
    are discarded, stop words are removed, and the remaining tokens are lemmatized.

    Stop words are filtered both before and after lemmatization. Filtering only
    afterwards lets stop words slip through in mangled form, because the
    lemmatizer turns e.g. "has" into "ha" and "was" into "wa", which are no
    longer in the stop-word set.

    Args:
        text: The string to clean.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing remains).
    """
    tokens = [token for token in word_tokenize(text.lower()) if token.isalpha()]
    # Remove stop words while they are still in their original surface form.
    tokens = [token for token in tokens if token not in stop_words]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    # Second pass: drop lemmas that collapse onto a stop word.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

# Replace missing titles/abstracts with empty strings. Assigning the result back
# (instead of calling fillna(inplace=True) on a column selection) is the supported
# form: the chained in-place call is deprecated and does not modify df_orig when
# pandas copy-on-write is enabled.
df_orig["title"] = df_orig["title"].fillna("")
df_orig["abstract"] = df_orig["abstract"].fillna("")

# Join title and abstract with a space so that the last word of the title and the
# first word of the abstract are not fused into a single token.
df_orig['raw_text'] = df_orig["title"] + " " + df_orig["abstract"]
# Cleaned text used for the word clouds.
df_orig['text'] = df_orig['raw_text'].map(preprocess)
df_orig

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud

def draw_wordcloud(texts, max_words=1000, width=1000, height=500):
    """Build a word cloud from a collection of strings.

    Args:
        texts: Iterable of strings; they are concatenated with single spaces.
        max_words: Upper bound on the number of words passed to WordCloud.
        width: Canvas width passed to WordCloud.
        height: Canvas height passed to WordCloud.

    Returns:
        The result of ``WordCloud.to_image()`` for the generated cloud.
    """
    corpus = ' '.join(texts)
    cloud = WordCloud(
        background_color='white',
        max_words=max_words,
        width=width,
        height=height,
    )
    cloud.generate(corpus)
    return cloud.to_image()

In [None]:
# Draw one word cloud per cluster. Looping over the labels actually present in
# df_orig replaces eight copy-pasted cells and keeps working if the number of
# clusters changes. `display` is the IPython built-in available in notebook kernels.
for cluster_id in sorted(df_orig["cluster"].unique()):
    print(f"Cluster {cluster_id}")
    display(draw_wordcloud(df_orig.loc[df_orig["cluster"] == cluster_id, "text"]))