# Data Cleaning and Preparation

#### 🐢 Team: Terraria

# Initial Data Collection

The dataset is represented in JSON format and consists of 2694879 records.

In [1]:
import json
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup

In [2]:
df = pl.read_ndjson("arxiv-metadata-oai-snapshot.json", low_memory=True)

## Removing publications before 2015

Firstly, we remove all publications published before 2015, since the authors that haven't published any papers for 10 years will not be interested in co-author recommendations.

In [3]:
print(f"The minimum update date is {df['update_date'].min()}")
print(f"The maxium update date is {df['update_date'].max()}")

The minimum update date is 2007-05-23
The maxium update date is 2025-03-29


In [4]:
df = df.filter(pl.col('update_date') >= '2015-01-01')
df = df.with_columns([
    pl.col('update_date').str.strptime(pl.Datetime, format="%Y-%m-%d").alias('update_date'),
    pl.col('update_date').str.strptime(pl.Datetime, format="%Y-%m-%d").dt.year().alias('year')
])

In [5]:
df.shape

(1921751, 15)

As we can see, 778503 papers were published before 2015.

In [6]:
df.head()

id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,year
str,str,str,str,str,str,str,str,str,str,str,list[struct[2]],datetime[μs],list[list[str]],i32
"""0704.0006""","""Yue Hin Pong""","""Y. H. Pong and C. K. Law""","""Bosonic characters of atomic C…","""6 pages, 4 figures, accepted b…",,"""10.1103/PhysRevA.75.043613""",,"""cond-mat.mes-hall""",,""" We study the two-particle wa…","[{""v1"",""Sat, 31 Mar 2007 04:24:59 GMT""}]",2015-05-13 00:00:00,"[[""Pong"", ""Y. H."", """"], [""Law"", ""C. K."", """"]]",2015
"""0704.0020""","""Patrick Roudeau""","""The BABAR Collaboration, B. Au…","""Measurement of the Hadronic Fo…","""21 pages, 13 postscript figure…","""Phys.Rev.D76:052005,2007""","""10.1103/PhysRevD.76.052005""","""BABAR-PUB-07/015, SLAC-PUB-124…","""hep-ex""",,""" The shape of the hadronic fo…","[{""v1"",""Sat, 31 Mar 2007 09:49:10 GMT""}]",2015-06-30 00:00:00,"[[""The BABAR Collaboration"", """", """"], [""Aubert"", ""B."", """"]]",2015
"""0704.0025""","""Andrei Mishchenko S""","""A. S. Mishchenko (1 and 2) and…","""Spectroscopic Properties of Po…","""41 pages, 13 figures, in ""Pola…",,"""10.1007/978-1-4020-6348-0_12""",,"""cond-mat.str-el cond-mat.stat-…",,""" We present recent advances i…","[{""v1"",""Mon, 2 Apr 2007 12:02:36 GMT""}]",2015-05-13 00:00:00,"[[""Mishchenko"", ""A. S."", … ""1 and 2""], [""Nagaosa"", ""N."", … ""1 and 3""]]",2015
"""0704.0030""","""Jim Hague""","""J.P.Hague and N.d'Ambrumenil""","""Tuning correlation effects wit…","""Reprint to improve access. 13 …","""J. Low. Temp. Phys. Vol. 140 p…","""10.1007/s10909-005-6013-6""",,"""cond-mat.str-el""",,""" We investigate the effect of…","[{""v1"",""Sat, 31 Mar 2007 14:14:18 GMT""}]",2015-05-13 00:00:00,"[[""Hague"", ""J. P."", """"], [""d'Ambrumenil"", ""N."", """"]]",2015
"""0704.0033""","""Maxim A. Yurkin""","""Maxim A. Yurkin, Valeri P. Mal…","""Convergence of the discrete di…","""23 pages, 5 figures; added sev…","""J.Opt.Soc.Am.A 23, 2578-2591 (…","""10.1364/JOSAA.23.002578 10.136…",,"""physics.optics physics.comp-ph""","""http://creativecommons.org/lic…",""" We performed a rigorous theo…","[{""v1"",""Sat, 31 Mar 2007 15:34:25 GMT""}, {""v2"",""Tue, 29 Mar 2022 18:21:31 GMT""}]",2022-03-31 00:00:00,"[[""Yurkin"", ""Maxim A."", """"], [""Maltsev"", ""Valeri P."", """"], [""Hoekstra"", ""Alfons G."", """"]]",2022


## Removing publications with less than 2 authors

In [7]:
print(f"Number of records before filtering: {len(df)}")

# Filter to keep only papers with 2 or more authors
df = df.filter(pl.col('authors_parsed').list.len() >= 2)

print(f"Number of records after filtering: {len(df)}")

Number of records before filtering: 1921751
Number of records after filtering: 1630081


# Data Description

The dataset consists of 14 features each of object type.

The number of records presented in the dataset is now 1921728.


In [8]:
df.schema

Schema([('id', String),
        ('submitter', String),
        ('authors', String),
        ('title', String),
        ('comments', String),
        ('journal-ref', String),
        ('doi', String),
        ('report-no', String),
        ('categories', String),
        ('license', String),
        ('abstract', String),
        ('versions', List(Struct({'version': String, 'created': String}))),
        ('update_date', Datetime(time_unit='us', time_zone=None)),
        ('authors_parsed', List(List(String))),
        ('year', Int32)])

### Column meaning

| **Column**      | **Description**                                                                                                       |
|-----------------|-----------------------------------------------------------------------------------------------------------------------|
| **id**          | ArXiv ID                                                                            |
| **submitter**   | The name of the person who submitted the paper                                                                                               |
| **authors**     | Authors of the paper                                                                                                  |
| **title**       | Title of the paper                                                                                                    |
| **comments**    | Additional info, such as number of pages and figures                                                                   |
| **journal-ref** | Information about the journal the paper was published in                                                               |
| **doi**         | Digital Object Identifier                                                                      |
| **abstract**    | The abstract of the paper                                                                                             |
| **categories**  | Categories / tags in the ArXiv system                                                                                 |
| **versions**    | A version history                                                                                                     |

For the task of identifying co-authors, the key article attributes are authors, title, abstract, and categories. These features are essential for analyzing papers presented in the dataset and uncovering collaboration patterns. The absence of missing values in these columns confirms the dataset's suitability for this task.

In [9]:
df.null_count()

id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,year
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,1474,0,0,522555,1209438,925238,1574926,0,29152,0,0,0,0,0


Let's remove the excess columns:

In [10]:
df = df.select(['id', 'authors', 'title', 'doi', 'categories', 'abstract', 'update_date', 'authors_parsed', 'year'])
df.head(5)

id,authors,title,doi,categories,abstract,update_date,authors_parsed,year
str,str,str,str,str,str,datetime[μs],list[list[str]],i32
"""0704.0006""","""Y. H. Pong and C. K. Law""","""Bosonic characters of atomic C…","""10.1103/PhysRevA.75.043613""","""cond-mat.mes-hall""",""" We study the two-particle wa…",2015-05-13 00:00:00,"[[""Pong"", ""Y. H."", """"], [""Law"", ""C. K."", """"]]",2015
"""0704.0020""","""The BABAR Collaboration, B. Au…","""Measurement of the Hadronic Fo…","""10.1103/PhysRevD.76.052005""","""hep-ex""",""" The shape of the hadronic fo…",2015-06-30 00:00:00,"[[""The BABAR Collaboration"", """", """"], [""Aubert"", ""B."", """"]]",2015
"""0704.0025""","""A. S. Mishchenko (1 and 2) and…","""Spectroscopic Properties of Po…","""10.1007/978-1-4020-6348-0_12""","""cond-mat.str-el cond-mat.stat-…",""" We present recent advances i…",2015-05-13 00:00:00,"[[""Mishchenko"", ""A. S."", … ""1 and 2""], [""Nagaosa"", ""N."", … ""1 and 3""]]",2015
"""0704.0030""","""J.P.Hague and N.d'Ambrumenil""","""Tuning correlation effects wit…","""10.1007/s10909-005-6013-6""","""cond-mat.str-el""",""" We investigate the effect of…",2015-05-13 00:00:00,"[[""Hague"", ""J. P."", """"], [""d'Ambrumenil"", ""N."", """"]]",2015
"""0704.0033""","""Maxim A. Yurkin, Valeri P. Mal…","""Convergence of the discrete di…","""10.1364/JOSAA.23.002578 10.136…","""physics.optics physics.comp-ph""",""" We performed a rigorous theo…",2022-03-31 00:00:00,"[[""Yurkin"", ""Maxim A."", """"], [""Maltsev"", ""Valeri P."", """"], [""Hoekstra"", ""Alfons G."", """"]]",2022


## Remove Duplicates
The number of unique ids is 2689065 is less than the number of records 2689088. Therefore, there are duplicates that must be cleaned.

In [11]:
num_of_unique_id = df['id'].n_unique()
print(f"Number of unique ids: {num_of_unique_id}")

Number of unique ids: 1630074


In [12]:
print("Number of ids before cleaning:", df['id'].shape[0])

Number of ids before cleaning: 1630081


In [13]:
df = df.unique(subset=['id'])
print(f"Number of records after deduplication: {len(df)}")

Number of records after deduplication: 1630074


## Remove not connected authors

In [14]:
import networkx as nx
from tqdm import tqdm
from itertools import combinations

G = nx.Graph()

# For each paper, create edges between all pairs of authors
for authors_list, paper_id in tqdm(zip(df['authors_parsed'], df['id']), total=len(df)):
    authors = [' '.join(author) for author in authors_list]
    
    # Add paper ids to each author's papers list
    for author in authors:
        if not G.has_node(author):
            G.add_node(author, papers=[paper_id])
        else:
            G.nodes[author]['papers'].append(paper_id)
    
    # Create edges between all pairs of authors
    for author1, author2 in combinations(authors, 2):
        if G.has_edge(author1, author2):
            G[author1][author2]['weight'] += 1
        else:
            G.add_edge(author1, author2, weight=1)

100%|██████████| 1630074/1630074 [07:29<00:00, 3630.25it/s] 


In [15]:
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
G0 = G.subgraph(Gcc[0])

In [16]:
print(f"Number of nodes (authors) in all graphs: {G.number_of_nodes()}")
print(f"Number of edges (co-authorship connections) in all graphs: {G.number_of_edges()}")
print(f"Number of nodes (authors) in largest graph: {G0.number_of_nodes()}")
print(f"Number of edges (co-authorship connections) in largest graph: {G0.number_of_edges()}")

Number of nodes (authors) in all graphs: 1758247
Number of edges (co-authorship connections) in all graphs: 77561791
Number of nodes (authors) in largest graph: 1538924
Number of edges (co-authorship connections) in largest graph: 68483914


In [17]:
# Get all paper ids from the largest connected component
papers_in_g0 = set()
for author in G0.nodes():
    papers_in_g0.update(G0.nodes[author]['papers'])

print(f"Number of papers in largest connected component: {len(papers_in_g0)}")

Number of papers in largest connected component: 1566986


This way we get only the nodes connected to the main biggest graph, getting read of authors who exist in closed societies.

In [18]:
df = df.filter(pl.col('id').is_in(papers_in_g0))
print(f"Number of papers after filtering to largest connected component: {len(df)}")

Number of papers after filtering to largest connected component: 1566986


## Categories

### Getting category names from [Category Taxonomy page](https://arxiv.org/category_taxonomy)

Since categories are represented as codes, we retrieved the category names from the official arXiv Category Taxonomy page and mapped the codes to the categoty names.

In [18]:
response = requests.get("https://arxiv.org/category_taxonomy")

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    taxonomy_div = soup.find('div', id='category_taxonomy_list')
    category_code_2_name = {}
    
    # h2 tags represent father categories
    next_categories = taxonomy_div.find_all(['h2', 'h3'], class_=['accordion-head', 'column is-one-fifth'])


    # Iterate over each father category
    for next_category in next_categories:
        # get category name
        next_category_name = next_category.text.strip()
        
        # find sub-categories
        sub_categories = next_category.find_next_sibling().find_all('h4')
        for sub_category in sub_categories:
            # get sub-category name
            sub_category_name = sub_category.text.strip()
            # get sub-category code
            sub_category_code = sub_category_name.split()[0]
            
            # append the retrieved data
            category_code_2_name[sub_category_code] = next_category_name
    print('Extracted categories')
    print('Number of extracted categories', len(category_code_2_name))
    print(category_code_2_name)
else:
    print(f"Error getting data {response.status_code}")

Extracted categories
Number of extracted categories 155
{'cs.AI': 'Computer Science', 'cs.AR': 'Computer Science', 'cs.CC': 'Computer Science', 'cs.CE': 'Computer Science', 'cs.CG': 'Computer Science', 'cs.CL': 'Computer Science', 'cs.CR': 'Computer Science', 'cs.CV': 'Computer Science', 'cs.CY': 'Computer Science', 'cs.DB': 'Computer Science', 'cs.DC': 'Computer Science', 'cs.DL': 'Computer Science', 'cs.DM': 'Computer Science', 'cs.DS': 'Computer Science', 'cs.ET': 'Computer Science', 'cs.FL': 'Computer Science', 'cs.GL': 'Computer Science', 'cs.GR': 'Computer Science', 'cs.GT': 'Computer Science', 'cs.HC': 'Computer Science', 'cs.IR': 'Computer Science', 'cs.IT': 'Computer Science', 'cs.LG': 'Computer Science', 'cs.LO': 'Computer Science', 'cs.MA': 'Computer Science', 'cs.MM': 'Computer Science', 'cs.MS': 'Computer Science', 'cs.NA': 'Computer Science', 'cs.NE': 'Computer Science', 'cs.NI': 'Computer Science', 'cs.OH': 'Computer Science', 'cs.OS': 'Computer Science', 'cs.PF': 'Compu

#### Analyzing splitted categories

In the dataset, some papers are assigned multiple categories in a single cell. To understand the frequency of each category, we split them for separate analysis. Then, we examined the original multi-category assignments to understand how categories are grouped in papers.

In [19]:
df_exploded = df.select([
    pl.all(),
    pl.col('categories').str.split(' ').alias('splitted_categories')
]).explode('splitted_categories')
df_exploded.head()

id,authors,title,doi,categories,abstract,update_date,authors_parsed,year,splitted_categories
str,str,str,str,str,str,datetime[μs],list[list[str]],i32,str
"""2308.07367""","""Pavel Fileviez Perez, Clara Mu…","""Finite Naturalness and Quark-L…","""10.1103/PhysRevD.109.015011""","""hep-ph""",""" We study the implications of…",2024-01-19 00:00:00,"[[""Perez"", ""Pavel Fileviez"", """"], [""Murgui"", ""Clara"", """"], … [""Wise"", ""Mark B."", """"]]",2024,"""hep-ph"""
"""1504.03756""","""Gabriel Bujokas and Anand Pate…","""Invariants of a general branch…",,"""math.AG""",""" We investigate the resolutio…",2015-04-16 00:00:00,"[[""Bujokas"", ""Gabriel"", """"], [""Patel"", ""Anand"", """"]]",2015,"""math.AG"""
"""1606.07233""","""Per-Arne Andersen, Christian K…","""Adaptive Task Assignment in On…",,"""cs.AI""",""" With the increasing populari…",2016-06-24 00:00:00,"[[""Andersen"", ""Per-Arne"", """"], [""Kråkevik"", ""Christian"", """"], … [""Yazidi"", ""Anis"", """"]]",2016,"""cs.AI"""
"""1711.08027""","""E. M. Inack, G. Giudici, T. Pa…","""Understanding Quantum Tunnelin…","""10.1103/PhysRevA.97.032307""","""cond-mat.stat-mech quant-ph""",""" In simple ferromagnetic quan…",2018-03-13 00:00:00,"[[""Inack"", ""E. M."", """"], [""Giudici"", ""G."", """"], … [""Pilati"", ""S."", """"]]",2018,"""cond-mat.stat-mech"""
"""1711.08027""","""E. M. Inack, G. Giudici, T. Pa…","""Understanding Quantum Tunnelin…","""10.1103/PhysRevA.97.032307""","""cond-mat.stat-mech quant-ph""",""" In simple ferromagnetic quan…",2018-03-13 00:00:00,"[[""Inack"", ""E. M."", """"], [""Giudici"", ""G."", """"], … [""Pilati"", ""S."", """"]]",2018,"""quant-ph"""


In [20]:
# map category code to category name
df_exploded = df_exploded.with_columns([
    pl.col('splitted_categories').map_elements(lambda x: category_code_2_name.get(x, 'Not-defined')).alias('category_name')
])
df_exploded.head()

  df_exploded = df_exploded.with_columns([


id,authors,title,doi,categories,abstract,update_date,authors_parsed,year,splitted_categories,category_name
str,str,str,str,str,str,datetime[μs],list[list[str]],i32,str,str
"""2308.07367""","""Pavel Fileviez Perez, Clara Mu…","""Finite Naturalness and Quark-L…","""10.1103/PhysRevD.109.015011""","""hep-ph""",""" We study the implications of…",2024-01-19 00:00:00,"[[""Perez"", ""Pavel Fileviez"", """"], [""Murgui"", ""Clara"", """"], … [""Wise"", ""Mark B."", """"]]",2024,"""hep-ph""","""Physics"""
"""1504.03756""","""Gabriel Bujokas and Anand Pate…","""Invariants of a general branch…",,"""math.AG""",""" We investigate the resolutio…",2015-04-16 00:00:00,"[[""Bujokas"", ""Gabriel"", """"], [""Patel"", ""Anand"", """"]]",2015,"""math.AG""","""Mathematics"""
"""1606.07233""","""Per-Arne Andersen, Christian K…","""Adaptive Task Assignment in On…",,"""cs.AI""",""" With the increasing populari…",2016-06-24 00:00:00,"[[""Andersen"", ""Per-Arne"", """"], [""Kråkevik"", ""Christian"", """"], … [""Yazidi"", ""Anis"", """"]]",2016,"""cs.AI""","""Computer Science"""
"""1711.08027""","""E. M. Inack, G. Giudici, T. Pa…","""Understanding Quantum Tunnelin…","""10.1103/PhysRevA.97.032307""","""cond-mat.stat-mech quant-ph""",""" In simple ferromagnetic quan…",2018-03-13 00:00:00,"[[""Inack"", ""E. M."", """"], [""Giudici"", ""G."", """"], … [""Pilati"", ""S."", """"]]",2018,"""cond-mat.stat-mech""","""Physics"""
"""1711.08027""","""E. M. Inack, G. Giudici, T. Pa…","""Understanding Quantum Tunnelin…","""10.1103/PhysRevA.97.032307""","""cond-mat.stat-mech quant-ph""",""" In simple ferromagnetic quan…",2018-03-13 00:00:00,"[[""Inack"", ""E. M."", """"], [""Giudici"", ""G."", """"], … [""Pilati"", ""S."", """"]]",2018,"""quant-ph""","""Physics"""


Now we create a parquet partition fof the original dataset by main categories:

In [21]:
# unique_categories = df_exploded['category_name'].unique()

# for category in unique_categories:
#     (df_exploded
#      .filter(pl.col('category_name') == category)
#      .select(['id', 'authors', 'title', 'doi', 'categories', 'abstract', 'update_date', 'year'])
#      .write_parquet(f'category_{category.replace(" ", "_")}.parquet')
#     )

# print(f"Successfully split and saved {len(unique_categories)} category files")

In [22]:
# df_exploded.write_parquet('arxiv_cleaned.parquet')

In [20]:
import numpy as np

# print("Features of the whole graph:")
# print(f"Number of nodes (authors) in all graphs: {G.number_of_nodes()}")
# print(f"Number of edges (co-authorship connections) in all graphs: {G.number_of_edges()}")
# print(f"Density of all graphs: {nx.density(G)}")
# print(f"Mean betweenness centrality of all graphs: {np.mean(nx.betweenness_centrality(G))}")
# print(f"Mean degree centrality of all graphs: {np.mean(nx.degree_centrality(G))}")
# print(f"Mean eigenvector centrality of all graphs: {np.mean(nx.eigenvector_centrality(G))}")
# print(f"Mean closeness centrality of all graphs: {np.mean(nx.closeness_centrality(G))}")
# print(f"Number of connected components in all graphs: {nx.number_connected_components(G)}")
# print(f"Mean clustering coefficient of all graphs: {nx.clustering(G)}")
# plt.bar(*np.unique(sorted((d for n, d in G.degree()), reverse=True), return_counts=True))
# plt.xlabel("Degree")
# plt.ylabel("Number of nodes")
# plt.title("Histogram of node degrees for all graphs")
# plt.show()

print("\n\nFeatures of the largest subgraph:")
print(f"Number of nodes (authors) in largest graph: {G0.number_of_nodes()}")
print(f"Number of edges (co-authorship connections) in largest graph: {G0.number_of_edges()}")
# print(f"Diameter in largest graph: {nx.diameter(G0)}")
# print(f"Average shortest path length: {nx.average_shortest_path_length(G0)}")
print(f"Density in largest graph: {nx.density(G0)}")
# print(f"Mean betweenness centrality of largest graph: {np.mean(nx.betweenness_centrality(G0))}")
# print(f"Mean degree centrality of largest graph: {np.mean(nx.degree_centrality(G0))}")
# print(f"Mean eigenvector centrality of largest graph: {nx.eigenvector_centrality(G0, max_iter=10)}")
# print(f"Mean closeness centrality of largest graph: {np.mean(nx.closeness_centrality(G0))}")
# print(f"Mean clustering coefficient of largest graph: {nx.clustering(G0)}")
# plt.bar(*np.unique(sorted((d for n, d in G0.degree()), reverse=True), return_counts=True))
# plt.xlabel("Degree")
# plt.ylabel("Number of nodes")
# plt.title("Histogram of node degrees for largest subgraph")
# plt.show()



Features of the largest subgraph:
Number of nodes (authors) in largest graph: 1538924
Number of edges (co-authorship connections) in largest graph: 68483914
Density in largest graph: 5.7834169804707596e-05


In [None]:
eig_centrality = nx.eigenvector_centrality(G0, max_iter=10)

KeyError: 0

In [30]:
# Get the top 10 authors with the highest eigenvector centrality
top_10_authors = sorted(eig_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
top_10_authors

[('Zhang L. ', 0.03277806516111425),
 ('Zhang J. ', 0.031437605241031036),
 ('Sun L. ', 0.03140374996886515),
 ('Wang J. ', 0.03093541072406477),
 ('Zhang Y. ', 0.0296056130821164),
 ('Chen Y. ', 0.02906579415221667),
 ('van Beuzekom M. ', 0.028405027742595703),
 ('Mitselmakher G. ', 0.027244281751351615),
 ('Walker M. ', 0.027127816367376104),
 ('Kumar A. ', 0.027047812766354595)]

In [24]:
degree_distribution = np.unique(sorted((d for n, d in G0.degree()), reverse=True), return_counts=True)

In [25]:
import pandas as pd
degree_distribution_df = pd.DataFrame(degree_distribution).T
degree_distribution_df.columns = ['degree', 'count']

In [27]:
import plotly.express as px

fig = px.histogram(degree_distribution_df, x='degree', y='count',
             title="Histogram of node degrees for largest subgraph")
fig.show()

In [None]:
fig = px.box(degree_distribution_df["degree"],
             title="")
fig.show()

In [None]:
fig = px.box(sorted((d for n, d in G0.degree()), reverse=True),
             title="Boxplot of node degrees for largest subgraph")
fig.show()