In [24]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load data

In [25]:
year = 2024
df_paper = pd.read_csv(f'paperlist_{year}.tsv', index_col=0, sep='\t')
print('# papers:', len(df_paper))
df_paper.head()

# papers: 2401


Unnamed: 0_level_0,title,link,keywords,abstract
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cXs5md5wAq,Modelling Microbial Communities with Graph Neu...,https://openreview.net/forum?id=cXs5md5wAq,"graph neural networks, microbial communities, ...",Understanding the interactions and interplay o...
rhgIgTSSxW,TabR: Tabular Deep Learning Meets Nearest Neig...,https://openreview.net/forum?id=rhgIgTSSxW,"tabular, tabular data, architecture, deep lear...",Deep learning (DL) models for tabular data pro...
kKRbAY4CXv,Neural Evolutionary Kernel Method: A Knowledge...,https://openreview.net/forum?id=kKRbAY4CXv,"Numerical PDE, structure preserving neural net...",Numerical solution of partial differential equ...
ApjY32f3Xr,PINNacle: A Comprehensive Benchmark of Physics...,https://openreview.net/forum?id=ApjY32f3Xr,"PINN, machine learning, physics-informed machi...",While significant progress has been made on Ph...
eUgS9Ig8JG,SaNN: Simple Yet Powerful Simplicial-aware Neu...,https://openreview.net/forum?id=eUgS9Ig8JG,"Graph Neural Networks, Higher-order Representa...",Simplicial neural networks (SNNs) are deep mod...


In [26]:
df_rating = pd.read_csv(f'ratings.tsv', index_col=0, sep='\t')
print('# ratings:', len(df_rating))
df_rating.head()

# ratings: 2401


Unnamed: 0_level_0,rating
paper_id,Unnamed: 1_level_1
cXs5md5wAq,4.5
rhgIgTSSxW,5.75
kKRbAY4CXv,4.25
ApjY32f3Xr,5.25
eUgS9Ig8JG,7.0


In [27]:
# merge paper and rating by paper id
df = pd.merge(df_paper, df_rating, on='paper_id')
print('# merged:', len(df))
df.head()

# merged: 2401


Unnamed: 0_level_0,title,link,keywords,abstract,rating
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cXs5md5wAq,Modelling Microbial Communities with Graph Neu...,https://openreview.net/forum?id=cXs5md5wAq,"graph neural networks, microbial communities, ...",Understanding the interactions and interplay o...,4.5
rhgIgTSSxW,TabR: Tabular Deep Learning Meets Nearest Neig...,https://openreview.net/forum?id=rhgIgTSSxW,"tabular, tabular data, architecture, deep lear...",Deep learning (DL) models for tabular data pro...,5.75
kKRbAY4CXv,Neural Evolutionary Kernel Method: A Knowledge...,https://openreview.net/forum?id=kKRbAY4CXv,"Numerical PDE, structure preserving neural net...",Numerical solution of partial differential equ...,4.25
ApjY32f3Xr,PINNacle: A Comprehensive Benchmark of Physics...,https://openreview.net/forum?id=ApjY32f3Xr,"PINN, machine learning, physics-informed machi...",While significant progress has been made on Ph...,5.25
eUgS9Ig8JG,SaNN: Simple Yet Powerful Simplicial-aware Neu...,https://openreview.net/forum?id=eUgS9Ig8JG,"Graph Neural Networks, Higher-order Representa...",Simplicial neural networks (SNNs) are deep mod...,7.0


In [28]:
# extract papers with rating >= 6, which are highly possible to be accepted
df = df[df['rating'] >= 6]
print('# filtered:', len(df))
df.head()

# filtered: 790


Unnamed: 0_level_0,title,link,keywords,abstract,rating
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
eUgS9Ig8JG,SaNN: Simple Yet Powerful Simplicial-aware Neu...,https://openreview.net/forum?id=eUgS9Ig8JG,"Graph Neural Networks, Higher-order Representa...",Simplicial neural networks (SNNs) are deep mod...,7.0
qBL04XXex6,Boosting of Thoughts: Trial-and-Error Problem ...,https://openreview.net/forum?id=qBL04XXex6,Large Language Models; Prompt Engineering; Boo...,The reasoning performance of Large Language Mo...,6.0
kmn0BhQk7p,Beyond Memorization: Violating Privacy via Inf...,https://openreview.net/forum?id=kmn0BhQk7p,"Privacy, Large Language Models",Current privacy research on large language mod...,7.2
i8PjQT3Uig,Locality Sensitive Sparse Encoding for Learnin...,https://openreview.net/forum?id=i8PjQT3Uig,"model-based rl, online learning, incremental l...",Model-based reinforcement learning (MBRL) is k...,6.666667
E5CMyG6jl0,Unified Language Model Alignment with Demonstr...,https://openreview.net/forum?id=E5CMyG6jl0,"Large Language Model, Alignment, Point-wise pr...",Language model alignment is a cutting-edge tec...,6.0


In [29]:
# df.index
# how to check the index a df?
df.index.values
# how to access index 'eUgS9Ig8JG' of df?
df.loc['eUgS9Ig8JG'].rating

for index in df.index.values:
    print(index, df.loc[index].rating)

eUgS9Ig8JG 7.0
qBL04XXex6 6.0
kmn0BhQk7p 7.2
i8PjQT3Uig 6.666666666666667
E5CMyG6jl0 6.0
7vVWiCrFnd 6.6
bDWXhzZT40 6.666666666666667
DwcV654WBP 6.5
SLw9fp4yI6 7.0
My7lkRNnL9 6.5
B0wJ5oCPdB 6.0
BTKAeLqLMw 6.333333333333333
AJBkfwXh3u 6.0
TPZRq4FALB 8.0
78iGZdqxYY 6.0
elMKXvhhQ9 7.0
Iyve2ycvGZ 6.0
Ebt7JgMHv1 6.333333333333333
YItWKZci78 6.8
lF2aip4Scn 6.5
vESNKdEMGp 6.4
csukJcpYDe 7.5
RzNlECeoOB 7.333333333333333
FI0vOp2asx 7.0
nTwb2vBLOV 6.0
FMMF1a9ifL 6.5
buC4E91xZE 6.166666666666667
LojXXo2xaf 6.0
AqN23oqraW 6.75
hv3SklibkL 6.0
idpV2AqusC 6.25
Dxl0EuFjlf 6.0
OsGUnYOzii 6.5
wkbeqr5XhC 6.0
wfgZc3IMqo 6.0
3SJE1WLB4M 8.0
dCHbFDsCZz 6.0
EmQSOi1X2f 6.0
lhZEodF8Dn 7.0
Ffjc8ApSbt 6.666666666666667
ADDCErFzev 6.0
327tbF3S65 6.0
uz7d2N2zul 6.333333333333333
uREj4ZuGJE 6.75
J44HfH4JCg 7.5
FHqAzWl2wE 6.0
KS8mIvetg2 7.5
JiTVtCUOpS 6.0
GN921JHCRw 7.0
HX5ujdsSon 6.666666666666667
6xfe4IVcOu 7.0
IjMUGuUmBI 6.8
yarUvgEXq3 7.333333333333333
3K3s9qxSn7 6.333333333333333
Ixi4j6LtdX 6.75
k581sTMyPt 6.6666

In [98]:
class PaperNode:
    """PaperNode class for storing paper information
    """
    def __init__(self, paper_id, title, link, keyword_ls, rating=0, abstract=None):
        """__init__ function for PaperNode

        Args:
            paper_id (str): unique id for each paper
            title (str): title of the paper
            link (str): link to the paper
            keyword_ls (list of str): list of keywords
            abstract (str, optional): abstract of the paper. Defaults to None.
        """
        self.paper_id = paper_id
        self.title = title
        self.link = link
        self.keyword_ls = keyword_ls
        self.rating = rating # optional
        self.abstract = abstract # optional
        self.connectedTo = {}
        self.connected_components = []

    def __str__(self):
        return self.title
    
    def addNeighbor(self, neighbor_id, weight=0):
        """add neighbor to the paper

        Args:
            neighbor_id (str): neighbor id
            weight (int, optional): weight of the edge. Defaults to 0.
        """
        self.connectedTo[neighbor_id] = weight
    
    def calWeight(self, another_paper_node):
        """calculate the weight between two papers

        Args:
            another_paper_node (PaperNode): another paper node

        Returns:
            weight (int): weight between two papers
        """
        weight = 0
        for keyword in another_paper_node.keyword_ls:
            if keyword in self.keyword_ls:
                weight += 1
        return weight

class PaperGraph:
    """PaperGraph class for storing papers and their connections
    """
    def __init__(self):
        self.paperDict = {}
        self.numPapers = 0
    
    def addPaper(self, paper_id, paper: PaperNode):
        """add paper to the graph

        Args:
            paper_id (str): paper id
            paper (PaperNode): paper node
        """
        self.numPapers += 1
        self.paperDict[paper_id] = paper
    
    def addPaperDict(self, paper_dict: dict):
        """add paper dictionary to the graph

        Args:
            paper_dict (dict): paper dictionary
        """
        self.paperDict = paper_dict
    
    def getPaper(self, paper_id):
        """get paper by paper id

        Args:
            paper_id (str): paper id

        Returns:
            PaperNode: paper node
        """
        if paper_id in self.paperDict:
            return self.paperDict[paper_id]
        else:
            return None
    
    def addEdge(self, paper1: PaperNode, paper2: PaperNode, weight=0, threshold=1):
        """add edge between two papers

        Args:
            paper1 (PaperNode): paper node 1
            paper2 (PaperNode): paper node 2
            weight (int, optional): weight of the edge. Defaults to 0.
            threshold (int, optional): threshold of the weight, edge with weight below it will be ignored. Defaults to 1.
        """
        if weight >= threshold and paper1.paper_id != paper2.paper_id:
            self.paperDict[paper1.paper_id].addNeighbor(self.paperDict[paper2.paper_id].paper_id, weight)
            self.paperDict[paper2.paper_id].addNeighbor(self.paperDict[paper1.paper_id].paper_id, weight)
    
    def getPaperDict(self):
        """get the paper dictionary"""
        return self.paperDict
    
    def getConnection(self):
        """get the connection of the graph"""
        for paper_id in self.paperDict:
            # also include the weights
            print(self.paperDict[paper_id].paper_id, self.paperDict[paper_id].connectedTo)

    def dfs(self, paper_id, visited):
        """dfs function for finding connected components

        Args:
            paper_id (str): paper id
            visited (set): set of visited paper ids 
        """
        visited.add(paper_id)
        for neighbor in self.paperDict[paper_id].connectedTo:
            if neighbor not in visited:
                self.dfs(neighbor, visited)
    
    def findConnectedComponents(self):
        """find connected components in the graph

        Returns:
            connected_components (list of set): list of connected components
        """
        visited = set()
        connected_components = []

        for paper_id in self.paperDict:
            if paper_id not in visited:
                component = set()
                self.dfs(paper_id, component)
                connected_components.append(component)
        
        return connected_components

In [99]:
paperIDs = df.index.values
paperDict = {}
numPapers = len(df)
paperGraph = PaperGraph()

for id in paperIDs:
    title = df.loc[id].title
    link = df.loc[id].link
    keyword_ls = [keyword.strip() for keyword in df.loc[id].keywords.split(',')]
    rating = df.loc[id].rating
    paperDict[id] = PaperNode(id, title, link, keyword_ls, rating)
    paperGraph.addPaper(id, paperDict[id])

# len(paperDict)
# paperDict['eUgS9Ig8JG'].keyword_ls
print(paperGraph.numPapers == numPapers)

print(len(paperIDs))


for i in tqdm(range(len(paperIDs))):
    for j in range(len(paperIDs)):
        paper1 = paperDict[paperIDs[i]]
        paper2 = paperDict[paperIDs[j]]
        weight = paper1.calWeight(paper2)
        paperGraph.addEdge(paper1, paper2, weight, threshold=1)

paperGraph.connected_components_1_degree = paperGraph.findConnectedComponents()

True
790


100%|██████████| 790/790 [00:00<00:00, 2176.07it/s]


In [92]:
paperGraph.getConnection()

eUgS9Ig8JG {'IjMUGuUmBI': 1, 'samyfu6G93': 1, 'WIzzXCVYiH': 1, 'gppLqZLQeY': 1, 'DfPtC8uSot': 1, '43cYe4oogi': 1, 'HSKaGOi7Ar': 1, '2gwo9cjOEz': 1, 'wYvuY60SdD': 1, 'up6hr4hIQH': 1, 'Re5KnZcXhf': 1, 'Zz594UBNOH': 1, 'FPpLTTvzR0': 1, 'AcSChDWL6V': 1}
qBL04XXex6 {}
kmn0BhQk7p {'B0wJ5oCPdB': 1, 'Ebt7JgMHv1': 1, 'Ixi4j6LtdX': 1, 'xw5nxFWMlo': 1, '62K7mALO2q': 1, '4stB7DFLp6': 1, 'pAoqRlTBtY': 1, 'TTEwosByrg': 1, 'Tigr1kMDZy': 1, 'jenyYQzue1': 1, 'caW7LdAALh': 1, 'sNtDKdcI1f': 1, 'vqIH0ObdqL': 1, 'LXVswInHOo': 1, 'hTEGyKf0dZ': 1, 'IEduRUO55F': 1, 'GPKTIktA0k': 1, '7Jwpw4qKkb': 1, 'krx55l2A6G': 1, 'SQrHpTllXa': 1, 'Y3wpuxd7u9': 1, 'ZS4m74kZpH': 1, 'xU0XRbn3b5': 1, '9OevMUdods': 1, '1vrS1zwekw': 1, 'mM7VurbA4r': 1, 'EnXJfQqy0K': 1, 'yRrPfKyJQ2': 1, 'osoWxY8q2E': 1, '1mjsP8RYAw': 1, 'Yol6nUVIJD': 1, 'jjA4O1vJRz': 1, '3d0OmYTNui': 1, 'fibxvahvs3': 1}
i8PjQT3Uig {'5t57omGVMw': 1, 'MVe2dnWPCu': 1}
E5CMyG6jl0 {'AqN23oqraW': 1, 'mIEHIcHGOo': 1, 'tEAF9LBdgu': 1, 'd94x0gWTUX': 1, 'r42tSSCHPh': 2, '9n

In [93]:
connected_components_degree_1 = paperGraph.connected_components_1_degree
# find the largest and smallest connected component
largest_component = max(connected_components_degree_1, key=len)
smallest_component = min(connected_components_degree_1, key=len)
average_component = sum([len(component) for component in connected_components_degree_1]) / len(connected_components_degree_1)
print('largest component size:', len(largest_component))
print('smallest component size:', len(smallest_component))
print('average component size:', average_component)


largest component size: 566
smallest component size: 1
average component size: 405.8658227848101


In [94]:
paperIDs = df.index.values
paperDict_2 = {}
numPapers_2 = len(df)
paperGraph_2 = PaperGraph()

for id in paperIDs:
    title = df.loc[id].title
    link = df.loc[id].link
    keyword_ls = [keyword.strip() for keyword in df.loc[id].keywords.split(',')]
    rating = df.loc[id].rating
    paperDict_2[id] = PaperNode(id, title, link, keyword_ls, rating)
    paperGraph_2.addPaper(id, paperDict_2[id])

for i in tqdm(range(len(paperIDs))):
    for j in range(len(paperIDs)):
        paper1 = paperDict_2[paperIDs[i]]
        paper2 = paperDict_2[paperIDs[j]]
        weight = paper1.calWeight(paper2)
        paperGraph_2.addEdge(paper1, paper2, weight, threshold=2)

paperGraph_2.connected_components_2_degree = paperGraph_2.findConnectedComponents()

connected_components_degree_2 = paperGraph_2.connected_components_2_degree
# find the largest and smallest connected component
largest_component = max(connected_components_degree_2, key=len)
smallest_component = min(connected_components_degree_2, key=len)
average_component = sum([len(component) for component in connected_components_degree_2]) / len(connected_components_degree_2)
print('largest component size:', len(largest_component))
print('smallest component size:', len(smallest_component))
print('average component size:', average_component)

100%|██████████| 790/790 [00:00<00:00, 2257.06it/s]

largest component size: 11
smallest component size: 1
average component size: 1.4177215189873418





In [100]:
# show the largest component in a table
largest_component_paper_ids = list(largest_component)
largest_component_paper_ids.sort()
# how to show it in the terminal?
print(df.loc[largest_component_paper_ids])

                                                        title  \
paper_id                                                        
0akLDTFR9x           Contrastive Difference Predictive Coding   
5liV2xUdJL  Time-Efficient Reinforcement Learning with Sto...   
EpVe8jAjdx  Privileged Sensing Scaffolds Reinforcement Lea...   
TeeyHEi25C  Value function estimation using conditional di...   
Xkf2EBj4w3  Stabilizing Contrastive RL: Techniques for Rob...   
YCWjhGrJFD  Training Diffusion Models with Reinforcement L...   
eJ0dzPJq1F  Blending Imitation and Reinforcement Learning ...   
kNpSUN0uCc  Maximum Entropy Model Correction in Reinforcem...   
o2IEmeLL9r  Pre-Training Goal-based Models for Sample-Effi...   
o4AydSd3Lp  Harnessing Discrete Representations for Contin...   
rvUq3cxpDF                    Learning to Act without Actions   

                                                  link  \
paper_id                                                 
0akLDTFR9x  https://openreview.net/for