In [None]:
!pip install xmltodict
!pip install networkx
!pip install networkit
!pip install matplotlib
!pip install -U imbalanced-learn

In [None]:
# Mount Google Drive inside the Colab runtime; the DBLP dump is later read
# from a path below /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import xmltodict
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import networkit as nk
import torch
import numpy as np
import ast

In [None]:
# Restrict NetworKit to a single worker thread.
# NOTE(review): presumably done for reproducible/stable results on Colab — confirm.
nk.setNumberOfThreads(1)

## **1. Curating the Dataset from DBLP**

*   First, the DBLP XML is parsed using *xmltodict*, and only publications whose year falls within a configurable range (2019-2025 by default) are extracted.
*   The parser returns a list of publication dictionaries containing **authors**, **year**, **title** and **venue**.
*   Then, the data is loaded into a *DataFrame* and preprocessed.

* Lastly, a surrogate success label is created based on **venue**.
  * If a paper's venue is in the predefined top-tier venues set, label it as 1 else 0.

<br>


In [None]:
def parse_dblp(file_path, min_year=2019, max_year=2025):
    """Parse a DBLP XML dump and return the publications inside a year range.

    Args:
        file_path: Path to the DBLP XML file (read as UTF-8).
        min_year: Earliest publication year to keep (inclusive).
        max_year: Latest publication year to keep (inclusive).

    Returns:
        A list of dicts with the keys "authors" (list of author names),
        "year" (int), "title" and "venue". The venue is the entry's
        ``journal`` if present, otherwise its ``booktitle`` (may be None).
    """
    # The whole file is read and parsed in memory in one go.
    with open(file_path, 'r', encoding='utf-8') as f:
        dblp_dict = xmltodict.parse(f.read())

    publications = []
    pub_types = ['article', 'inproceedings', 'proceedings', 'book', 'incollection']
    for pub_type in pub_types:
        pubs = dblp_dict['dblp'].get(pub_type, [])
        # A record type that occurs only once is not wrapped in a list.
        if not isinstance(pubs, list):
            pubs = [pubs]
        for pub in pubs:
            if not isinstance(pub, dict):
                # Empty elements carry no fields to extract.
                continue

            year = pub.get('year')
            year = int(year) if isinstance(year, str) and year.isdigit() else None
            # Honour the caller-supplied bounds (previously hard-coded to 2019/2025).
            if year is None or year < min_year or year > max_year:
                continue

            venue = _element_text(pub.get('journal') or pub.get('booktitle'))
            title = _element_text(pub.get('title', ''))

            publications.append({
                "authors": _author_names(pub.get('author', [])),
                "year": year,
                "title": title,
                "venue": venue
            })

    return publications


def _element_text(value):
    """Return the '#text' entry of a dict-valued element, or the value unchanged."""
    if isinstance(value, dict):
        return value.get('#text', '')
    return value


def _author_names(raw_authors):
    """Normalise the 'author' field (list, dict, str or anything else) to a list of names."""
    if isinstance(raw_authors, list):
        return [_element_text(a) for a in raw_authors]
    if isinstance(raw_authors, dict):
        return [raw_authors.get('#text', '')]
    if isinstance(raw_authors, str):
        return [raw_authors]
    return []

In [None]:
dblp_file = "/content/drive/MyDrive/dblp.xml"  # Path to DBLP XML file

# Parse DBLP dataset
# The result is bound to the notebook-global `publications`, which the
# sampling and network-building cells below consume.
print("Parsing DBLP...")
publications = parse_dblp(dblp_file, min_year=2020, max_year=2025)
print(f"Total parsed publications: {len(publications)}")

In [None]:
# Limit dataset size
# Seed NumPy's global RNG so the same subset is drawn on every run.
np.random.seed(42)
if len(publications) > 100000:
    # Draw 100,000 publications without replacement; `publications` is rebound
    # to the sample, so every later cell works on the subset.
    publications = np.random.choice(publications, size=100000, replace=False).tolist()
    print(f"Sampled down to {len(publications)} publications.")

# Create DataFrame
df = pd.DataFrame(publications)
df.head(10)

Unnamed: 0,authors,year,title,venue
0,"[Yuanhong Chen, Yu Tian 0001, Guansong Pang, G...",2022,Deep One-Class Classification via Interpolated...,AAAI
1,"[Alexandros Vrochidis, Vasileios G. Vasilopoul...",2021,A Recommendation Specific Human Activity Recog...,AIAI Workshops
2,"[Uppugunduri Vijay Nikhil, Athiya M. Pandiyan,...",2024,Machine Learning-Based Crop Yield Prediction i...,Comput.
3,[],2020,26th IEEE International Symposium on On-Line T...,IOLTS
4,"[Xiangyu Chen, Ruiwen Zhen, Shuai Li, Xiaotian...",2023,MOFA: A Model Simplification Roadmap for Image...,ICCV (Workshops)
5,"[Jinchang Zhang, Guoyu Lu]",2024,Underground Mapping and Localization Based on ...,CoRR
6,"[Aviv Yaish, Maya Dotan, Kaihua Qin, Aviv Zoha...",2023,Suboptimality in DeFi.,IACR Cryptol. ePrint Arch.
7,"[Xuemei Bai, Xiaoqing Gu]",2021,"Group Differences of Teaching Presence, Social...",Int. J. Distance Educ. Technol.
8,"[Hisham Alhulayyil, Kittipat Apicharttrisorn, ...",2020,WOLT: Auto-Configuration of Integrated Enterpr...,ICDCS
9,"[Amir Ghiasi-Noughaby, Pouya Amiri, Robert E. ...",2024,Identification of the Human Postural Sway Resp...,EMBC


In [None]:
# Preprocess the dataset:
# - Remove rows with missing required fields.
# - Remove entries with empty author lists.
# - Remove duplicate publications.
# - Standardize venue names.
# Note: `title` is not part of the required-field check below.
df = df.dropna(subset=['authors', 'venue', 'year']).copy()
df = df[df['authors'].apply(lambda authors: len(authors) > 0)].copy()
# Two rows count as the same publication when both title and year match.
df = df.drop_duplicates(subset=['title', 'year']).copy()
# Upper-casing lets venues be compared against the upper-case names in TOP_TIER_VENUES.
df['venue'] = df['venue'].apply(lambda v: v.strip().upper())

print("Df length after preprocessing:", len(df))
df.head(10)

DataFrame length after preprocessing: 98685


Unnamed: 0,authors,year,title,venue
0,"[Yuanhong Chen, Yu Tian 0001, Guansong Pang, G...",2022,Deep One-Class Classification via Interpolated...,AAAI
1,"[Alexandros Vrochidis, Vasileios G. Vasilopoul...",2021,A Recommendation Specific Human Activity Recog...,AIAI WORKSHOPS
2,"[Uppugunduri Vijay Nikhil, Athiya M. Pandiyan,...",2024,Machine Learning-Based Crop Yield Prediction i...,COMPUT.
4,"[Xiangyu Chen, Ruiwen Zhen, Shuai Li, Xiaotian...",2023,MOFA: A Model Simplification Roadmap for Image...,ICCV (WORKSHOPS)
5,"[Jinchang Zhang, Guoyu Lu]",2024,Underground Mapping and Localization Based on ...,CORR
6,"[Aviv Yaish, Maya Dotan, Kaihua Qin, Aviv Zoha...",2023,Suboptimality in DeFi.,IACR CRYPTOL. EPRINT ARCH.
7,"[Xuemei Bai, Xiaoqing Gu]",2021,"Group Differences of Teaching Presence, Social...",INT. J. DISTANCE EDUC. TECHNOL.
8,"[Hisham Alhulayyil, Kittipat Apicharttrisorn, ...",2020,WOLT: Auto-Configuration of Integrated Enterpr...,ICDCS
9,"[Amir Ghiasi-Noughaby, Pouya Amiri, Robert E. ...",2024,Identification of the Human Postural Sway Resp...,EMBC
10,"[Braslav Rabar, Maja Zagorscak, Strahil Ristov...",2019,IGLOSS: iterative gapless local similarity sea...,BIOINFORM.


In [None]:
# Surrogate success label: 1 when the (upper-cased) venue is in the top-tier set, 0 otherwise.
TOP_TIER_VENUES = {"ICLR", "ICML", "NEURIPS", "SIGCOMM", "IEEE ACCESS", "CORR", "CVPR"} # Can be changed for further evaluation!
df["success"] = df["venue"].isin(TOP_TIER_VENUES).astype(int)

# Class balance of the label
df["success"].value_counts()

Unnamed: 0_level_0,count
success,Unnamed: 1_level_1
0,76442
1,22243


In [None]:
# Preview the author lists next to their surrogate success label.
df[['authors', 'success']].head(10)

Unnamed: 0,authors,success
0,"[Yuanhong Chen, Yu Tian 0001, Guansong Pang, G...",0
1,"[Alexandros Vrochidis, Vasileios G. Vasilopoul...",0
2,"[Uppugunduri Vijay Nikhil, Athiya M. Pandiyan,...",0
4,"[Xiangyu Chen, Ruiwen Zhen, Shuai Li, Xiaotian...",0
5,"[Jinchang Zhang, Guoyu Lu]",1
6,"[Aviv Yaish, Maya Dotan, Kaihua Qin, Aviv Zoha...",0
7,"[Xuemei Bai, Xiaoqing Gu]",0
8,"[Hisham Alhulayyil, Kittipat Apicharttrisorn, ...",0
9,"[Amir Ghiasi-Noughaby, Pouya Amiri, Robert E. ...",0
10,"[Braslav Rabar, Maja Zagorscak, Strahil Ristov...",0


In [None]:
# `size` is only used to build the output file name (here "publications_100K.csv").
size = 100 # data subset
df.to_csv(f"publications_{size}K.csv", index=False)

## **2. Building a Time-Resolved Coauthorship Network**
* We build **coauthorship networks** over sliding time windows of publications from 2019-2025.
    * Nodes represent authors. An edge between two authors is created if they coauthored a paper, with the edge weight being the number of collaborations.
    * Each window has a fixed size of 2 years and is shifted forward by one year at a time.

* Then, we compute network **centrality measures** such as betweenness centrality and k-core centrality for each author in these time slices.

In [None]:
def build_coauthorship_network(publications, start_year, end_year):
    """Build a weighted coauthorship graph from publications in a year range.

    Args:
        publications: Iterable of dicts with a "year" and an "authors" list.
        start_year: First year to include (inclusive).
        end_year: Last year to include (inclusive).

    Returns:
        An undirected ``nx.Graph`` whose nodes are author names. Each edge's
        "weight" counts how many in-range publications list both authors.
    """
    graph = nx.Graph()
    for record in publications:
        if not (start_year <= record["year"] <= end_year):
            continue
        names = record["authors"]
        for idx in range(len(names)):
            first = names[idx]
            # Adding the node explicitly keeps single-author papers in the graph.
            graph.add_node(first)
            for second in names[idx + 1:]:
                if not graph.has_edge(first, second):
                    graph.add_edge(first, second, weight=1)
                else:
                    graph[first][second]["weight"] += 1
    return graph

In [None]:
def compute_centralities(G):
    """Compute approximate betweenness and k-core values for every node of G.

    Args:
        G: A networkx graph whose nodes are author names.

    Returns:
        A dict with two entries, "max_betweenness" and "max_kcore", each
        mapping an original node label to that node's score.
    """
    # Position -> original label, following G's node iteration order.
    # Assumes convert_node_labels_to_integers numbers nodes in that same
    # order and that nx2nk keeps those integer ids — TODO confirm.
    label_of = dict(enumerate(G.nodes()))

    nk_graph = nk.nxadapter.nx2nk(nx.convert_node_labels_to_integers(G))
    node_count = nk_graph.numberOfNodes()

    approx_btw = nk.centrality.ApproxBetweenness(
        nk_graph, epsilon=0.5, delta=0.1, universalConstant=1.0
    )  # Parameters can be changed for further evaluation!
    approx_btw.run()
    btw_by_id = approx_btw.scores()

    decomposition = nk.centrality.CoreDecomposition(nk_graph)
    decomposition.run()
    shells = decomposition.getCover()
    core_by_id = [0] * node_count
    # Subset ids are visited in ascending order, so a node that belongs to
    # several subsets ends up with the largest id it appears in.
    for shell_id in sorted(shells.getSubsetIds()):
        for member in shells.getMembers(shell_id):
            core_by_id[member] = shell_id

    # Translate integer node ids back to author names.
    ids = range(node_count)
    return {
        "max_betweenness": {label_of[i]: btw_by_id[i] for i in ids},
        "max_kcore": {label_of[i]: core_by_id[i] for i in ids},
    }

In [None]:
# Per-window results keyed by the target year t. The papers of year t are
# scored with the network built from the two preceding years (t-2 .. t-1).
centralities_dict = {}

# Loop through time windows (2-year windows)
for t in range(2020, 2026):
    window_start, window_end = t - 2, t - 1

    # Windows that start before 2020 are not evaluated.
    if window_start < 2020:
        print(f"Skipping window {window_start}-{window_end}.")
        continue

    G_window = build_coauthorship_network(publications, window_start, window_end)
    G_window.remove_edges_from(nx.selfloop_edges(G_window))

    if G_window.number_of_nodes() == 0:
        print(f"No authors found in window {window_start}-{window_end}, skipping.")
        continue

    # Keep only the largest connected component
    largest_cc = max(nx.connected_components(G_window), key=len)
    G_window = G_window.subgraph(largest_cc).copy()

    print(f"Computing centralities for {window_start}-{window_end} (Nodes: {G_window.number_of_nodes()}, Edges: {G_window.number_of_edges()})...")
    centralities = compute_centralities(G_window)
    # Keep every window's result; previously only the last window survived
    # the loop because `centralities_dict` was never filled.
    centralities_dict[t] = centralities

    # For each publication of the target year, record the maximum centrality
    # over its authors. Authors absent from the window's graph count as 0.
    centralities_summary = []
    for pub in publications:
        if pub["year"] == t:
            authors = pub["authors"]
            pub_centralities = {}
            for feat in centralities:
                # Use the maximum centrality
                pub_centralities[feat] = max(centralities[feat].get(a, 0) for a in authors) if authors else 0
            centralities_summary.append(pub_centralities)

    # Print a sample of the centrality summaries
    print(f"Centralities computed for window {window_start}-{window_end}:")
    print(centralities_summary[:5])

Skipping window 2018-2019.
Skipping window 2019-2020.
Computing centralities for 2020-2021 (Nodes: 26354, Edges: 91838)...
Centralities computed for window 2020-2021:
[{'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0.0, 'max_kcore': 3}, {'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0, 'max_kcore': 2}]
Computing centralities for 2021-2022 (Nodes: 33449, Edges: 119810)...
Centralities computed for window 2021-2022:
[{'max_betweenness': 0.0, 'max_kcore': 8}, {'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0, 'max_kcore': 3}, {'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0, 'max_kcore': 0}]
Computing centralities for 2022-2023 (Nodes: 43931, Edges: 183223)...
Centralities computed for window 2022-2023:
[{'max_betweenness': 0, 'max_kcore': 13}, {'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0, 'max_kcore': 0}, {'max_betweenness': 0, 'max_kcore': 0

In [None]:
# `size` is only used to build the output file name (here "centralities_100K.csv").
size = 100 # sub dataset size

# NOTE(review): `centralities` is the value left behind by the last iteration
# of the window loop, so only that single window is exported here.
# Both columns are built from dicts keyed by author name.
df_centralities = pd.DataFrame({
    "author": list(centralities["max_betweenness"].keys()),
    "max_betweenness": list(centralities["max_betweenness"].values()),
    "max_kcore": list(centralities["max_kcore"].values())
})

df_centralities.to_csv(f"centralities_{size}K.csv", index=False)

In [None]:
# Reload the artefacts written by the earlier cells.
df = pd.read_csv("publications_100K.csv")
# The export cell writes f"centralities_{size}K.csv" with size = 100, i.e.
# "centralities_100K.csv" (digits, not the letter "O").
df_centralities = pd.read_csv("centralities_100K.csv")

# The CSV round trip turns each author list into its string representation;
# parse it back into a list. Values that are not strings pass through unchanged.
df["authors"] = df["authors"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# One row per (publication, author) so each author can be joined to a score.
df_exploded = df.explode("authors")
df_merged = df_exploded.merge(df_centralities, left_on="authors", right_on="author", how="left")

# Fill missing centrality values with 0 (authors without a centrality row).
# Assigning the result back avoids the chained `inplace=True` call that
# pandas warns about and that stops working in pandas 3.0.
centrality_cols = ["max_betweenness", "max_kcore"]
df_merged[centrality_cols] = df_merged[centrality_cols].fillna(0)

# Collapse back to one row per publication, keeping the best author score.
df_final = df_merged.groupby(["title", "year"]).agg({
    "venue": "first",
    "success": "first",
    "authors": list,  # re-assemble the author list
    "max_betweenness": "max",
    "max_kcore": "max"
}).reset_index()

# NOTE(review): this file name is spelled with two capital letters "O"
# ("1OOK"), not zeros. It is left unchanged here; rename it together with
# every cell that reads it.
df_final.to_csv("merged_data_1OOK.csv", index=False)
print("\nFinal Merged Dataset Saved Successfully!")

df_final.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged["max_betweenness"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged["max_kcore"].fillna(0, inplace=True)



Final Merged Dataset Saved Successfully!


Unnamed: 0,title,year,venue,success,authors,max_betweenness,max_kcore
0,"""Again, Dozens of Refugees Drowned"": A Computa...",2022,NAACL-HLT (STUDENT RESEARCH WORKSHOP),0,[Qi Yu 0007],0.0,0.0
1,"""Am I listening?"", Evaluating the Quality of G...",2023,ICMI COMPANION,0,"[Pieter Wolfert, Gustav Eje Henter, Tony Belpa...",0.0,9.0
2,"""Anti-space"" as a New Approach to Museum desig...",2022,HCI (45),0,[Haoli Huang],0.0,0.0
3,"""Appropriate Technical and Organizational Meas...",2019,AMCIS,0,"[Dominik Huth, Florian Matthes]",0.0,0.0
4,"""Are Crowdsourcing Platforms Reliable for Vide...",2024,CHI PLAY (COMPANION),0,"[Linus Eisele, Giovanni Apruzzese]",0.0,0.0


## **3. Prediction and Evaluation**

* **Baseline Prediction Model:** A Random Forest classifier uses the aggregated network centrality features (max_betweenness, max_kcore) as inputs and the surrogate success label as the target variable. The evaluation is done using **classification_report.**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load Merged Dataset
# The merge step of this notebook writes "merged_data_1OOK.csv" (spelled with
# two capital letters "O", not zeros). That exact name is used here so the
# cell loads the file the notebook actually produces, rather than
# "merged_data_1M.csv", which no cell in this notebook writes.
df = pd.read_csv("merged_data_1OOK.csv")
df = df.dropna(subset=["max_betweenness", "max_kcore", "success"])

# Define feature and target
features = ["max_betweenness", "max_kcore"]
target = "success"

# 80% train, 20% test; stratified so both splits keep the label ratio
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target],
                                                    test_size=0.2, stratify=df[target],
                                                    random_state=42)

print(f"\nTraining on {len(X_train)} samples, testing on {len(X_test)} samples.")

# Train Random Forest Classifier
# class_weight="balanced" reweights the classes to counter the label imbalance.
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

* **Deep Learning Approach:** An MLP is used to capture more nuanced patterns in the network features. Again, evaluation is done using **classification_report**.

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load Merged Dataset
# The merge step of this notebook writes "merged_data_1OOK.csv" (spelled with
# two capital letters "O", not zeros). That exact name is used here so the
# cell loads the file the notebook actually produces, rather than
# "merged_data_1M.csv", which no cell in this notebook writes.
df = pd.read_csv("merged_data_1OOK.csv")
df = df.dropna(subset=["max_betweenness", "max_kcore", "success"])

# Define Features & Target
features = ["max_betweenness", "max_kcore"]
target = "success"

# 80% Train, 20% Test; stratified so both splits keep the label ratio
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target],
                                                    test_size=0.2, stratify=df[target],
                                                    random_state=42)

print(f"\nTraining on {len(X_train)} samples, testing on {len(X_test)} samples.")

# Apply StandardScaler
# Fitted on the training split only, then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE
# Only the training split is oversampled; the test split stays untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"\nResampled Training Data: {len(X_train_resampled)} samples (was {len(X_train)})")

mlp = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam',
                    max_iter=200, random_state=42, early_stopping=True)

mlp.fit(X_train_resampled, y_train_resampled)

y_pred = mlp.predict(X_test_scaled)

print("\nMLP Classification Report:")
print(classification_report(y_test, y_pred))


Training on 78948 samples, testing on 19737 samples.

Resampled Training Data: 122308 samples (was 78948)

MLP Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.66      0.73     15288
           1       0.29      0.47      0.36      4449

    accuracy                           0.62     19737
   macro avg       0.55      0.56      0.54     19737
weighted avg       0.69      0.62      0.64     19737



**NOTE:** MLP Classification report in this notebook is based on **100K data**, further evaluations *(200K, 500K, 1M)* are included in the report.