In [1]:
# Category labels this notebook works with. Later cells use this list both to
# decide which per-paper category strings to keep and to iterate over the
# per-category indicator columns when sampling.
# NOTE(review): presumably the current arXiv category taxonomy - confirm
# against the official listing before adding or removing entries.
arxiv_categories = [
    'astro-ph.CO',
    'astro-ph.EP',
    'astro-ph.GA',
    'astro-ph.HE',
    'astro-ph.IM',
    'astro-ph.SR',
    'cond-mat.dis-nn',
    'cond-mat.mes-hall',
    'cond-mat.mtrl-sci',
    'cond-mat.other',
    'cond-mat.quant-gas',
    'cond-mat.soft',
    'cond-mat.stat-mech',
    'cond-mat.str-el',
    'cond-mat.supr-con',
    'cs.AI',
    'cs.AR',
    'cs.CC',
    'cs.CE',
    'cs.CG',
    'cs.CL',
    'cs.CR',
    'cs.CV',
    'cs.CY',
    'cs.DB',
    'cs.DC',
    'cs.DL',
    'cs.DM',
    'cs.DS',
    'cs.ET',
    'cs.FL',
    'cs.GL',
    'cs.GR',
    'cs.GT',
    'cs.HC',
    'cs.IR',
    'cs.IT',
    'cs.LG',
    'cs.LO',
    'cs.MA',
    'cs.MM',
    'cs.MS',
    'cs.NA',
    'cs.NE',
    'cs.NI',
    'cs.OH',
    'cs.OS',
    'cs.PF',
    'cs.PL',
    'cs.RO',
    'cs.SC',
    'cs.SD',
    'cs.SE',
    'cs.SI',
    'cs.SY',
    'econ.EM',
    'econ.GN',
    'econ.TH',
    'eess.AS',
    'eess.IV',
    'eess.SP',
    'eess.SY',
    'gr-qc',
    'hep-ex',
    'hep-lat',
    'hep-ph',
    'hep-th',
    'math-ph',
    'math.AC',
    'math.AG',
    'math.AP',
    'math.AT',
    'math.CA',
    'math.CO',
    'math.CT',
    'math.CV',
    'math.DG',
    'math.DS',
    'math.FA',
    'math.GM',
    'math.GN',
    'math.GR',
    'math.GT',
    'math.HO',
    'math.IT',
    'math.KT',
    'math.LO',
    'math.MG',
    'math.MP',
    'math.NA',
    'math.NT',
    'math.OA',
    'math.OC',
    'math.PR',
    'math.QA',
    'math.RA',
    'math.RT',
    'math.SG',
    'math.SP',
    'math.ST',
    'nlin.AO',
    'nlin.CD',
    'nlin.CG',
    'nlin.PS',
    'nlin.SI',
    'nucl-ex',
    'nucl-th',
    'physics.acc-ph',
    'physics.ao-ph',
    'physics.app-ph',
    'physics.atm-clus',
    'physics.atom-ph',
    'physics.bio-ph',
    'physics.chem-ph',
    'physics.class-ph',
    'physics.comp-ph',
    'physics.data-an',
    'physics.ed-ph',
    'physics.flu-dyn',
    'physics.gen-ph',
    'physics.geo-ph',
    'physics.hist-ph',
    'physics.ins-det',
    'physics.med-ph',
    'physics.optics',
    'physics.plasm-ph',
    'physics.pop-ph',
    'physics.soc-ph',
    'physics.space-ph',
    'q-bio.BM',
    'q-bio.CB',
    'q-bio.GN',
    'q-bio.MN',
    'q-bio.NC',
    'q-bio.OT',
    'q-bio.PE',
    'q-bio.QM',
    'q-bio.SC',
    'q-bio.TO',
    'q-fin.CP',
    'q-fin.EC',
    'q-fin.GN',
    'q-fin.MF',
    'q-fin.PM',
    'q-fin.PR',
    'q-fin.RM',
    'q-fin.ST',
    'q-fin.TR',
    'quant-ph',
    'stat.AP',
    'stat.CO',
    'stat.ME',
    'stat.ML',
    'stat.OT',
    'stat.TH'
]

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import json

# Location of the metadata snapshot, relative to the notebook's working
# directory. get_metadata() reads it line by line and the loading loop parses
# each line with json.loads, i.e. the file is consumed as one JSON object per line.
data_file = '../data/arxiv-metadata-oai-snapshot.json'

def get_metadata(path=None):
    """Lazily yield the raw lines of the metadata snapshot.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. When omitted, the module-level ``data_file`` is used,
        so existing ``get_metadata()`` calls behave as before.

    Yields
    ------
    str
        Each line exactly as stored in the file (trailing newline included).
        Parsing is left to the caller.
    """
    if path is None:
        path = data_file

    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform locale, which can fail or silently mis-decode
    # non-ASCII characters in titles and abstracts.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            yield line

def extract_year(paper_id):
    """Derive the four-digit year encoded in an arXiv identifier.

    Identifier formats: https://arxiv.org/help/arxiv_identifier

    * Ids without a slash (March 2007+): the first two characters are the
      year, always counted from 2000.
    * Ids containing a slash (1991 - March 2007): the first two characters
      of the part after the slash are the year; values above 90 are placed
      in the 1900s, everything else in the 2000s.
    """
    # Second naming scheme: nothing to split, the id starts with the year.
    if paper_id.find('/') == -1:
        return 2000 + int(paper_id[:2])

    # First naming scheme: the year digits lead the segment after the slash.
    number_part = paper_id.split('/')[1]
    two_digit_year = int(number_part[:2])
    century = 1900 if two_digit_year > 90 else 2000
    return century + two_digit_year

# Parallel accumulators: position i in each list describes the same paper.
all_ids, all_titles, all_abstracts, all_categories = [], [], [], []

metadata = get_metadata()
for raw_record in tqdm(metadata):
    record = json.loads(raw_record)
    arxiv_id = record.get('id')

    # Only papers whose id encodes a year from 2010 through 2024 are kept.
    if not (2010 <= extract_year(arxiv_id) <= 2024):
        continue

    all_ids.append(arxiv_id)
    all_titles.append(record.get('title'))
    all_abstracts.append(record.get('abstract'))
    # 'categories' is a whitespace-separated string; store it as a list.
    all_categories.append(record.get('categories').split())

2472872it [00:36, 68533.94it/s] 


In [3]:
# Sanity check: the four parallel lists should all report the same length.
print(*map(len, (all_ids, all_titles, all_abstracts, all_categories)))

1893462 1893462 1893462 1893462


In [6]:
def flatten(xss):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; the order of the inner iterables
    and of their items is preserved.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

def _normalize_categories(categories, known_categories, known_lookup):
    """Return one paper's labels restricted to the known categories.

    Parameters
    ----------
    categories : list of str
        Raw labels of one paper. Not modified.
    known_categories : sequence of str
        Accepted labels, scanned in order when replacing an unknown label.
    known_lookup : set of str
        The same labels as a set, for constant-time membership tests.

    Returns
    -------
    list of str
        Known labels are kept. An unknown label is replaced by every known
        label that occurs as a substring of it, or dropped when none does.
        Duplicates are removed, keeping first-seen order.
    """
    normalized = []
    seen = set()
    for category in categories:
        if category in known_lookup:
            replacements = (category,)
        else:
            # Same matching rule as before (known label contained in the
            # unknown one).
            # NOTE(review): confirm this direction is the intended one; a
            # bare archive name such as 'astro-ph' matches nothing here.
            replacements = [x for x in known_categories if x in category]
        for label in replacements:
            if label not in seen:
                seen.add(label)
                normalized.append(label)
    return normalized


# Build the result in a fresh list instead of appending to / removing from the
# list being iterated: mutating it mid-loop makes the iterator skip the element
# after each removal, so consecutive unknown labels could survive.
_known_category_lookup = frozenset(arxiv_categories)
for categories in all_categories:
    # Slice assignment updates the list object stored in all_categories;
    # rebinding the loop variable (as `categories = set(categories)` did)
    # never reached it, so duplicates were not actually removed.
    categories[:] = _normalize_categories(
        categories, arxiv_categories, _known_category_lookup
    )

# Distinct labels that remain across all papers.
labels = set(flatten(all_categories))
display(labels)
display(len(labels))

{'astro-ph.CO',
 'astro-ph.EP',
 'astro-ph.GA',
 'astro-ph.HE',
 'astro-ph.IM',
 'astro-ph.SR',
 'cond-mat.dis-nn',
 'cond-mat.mes-hall',
 'cond-mat.mtrl-sci',
 'cond-mat.other',
 'cond-mat.quant-gas',
 'cond-mat.soft',
 'cond-mat.stat-mech',
 'cond-mat.str-el',
 'cond-mat.supr-con',
 'cs.AI',
 'cs.AR',
 'cs.CC',
 'cs.CE',
 'cs.CG',
 'cs.CL',
 'cs.CR',
 'cs.CV',
 'cs.CY',
 'cs.DB',
 'cs.DC',
 'cs.DL',
 'cs.DM',
 'cs.DS',
 'cs.ET',
 'cs.FL',
 'cs.GL',
 'cs.GR',
 'cs.GT',
 'cs.HC',
 'cs.IR',
 'cs.IT',
 'cs.LG',
 'cs.LO',
 'cs.MA',
 'cs.MM',
 'cs.MS',
 'cs.NA',
 'cs.NE',
 'cs.NI',
 'cs.OH',
 'cs.OS',
 'cs.PF',
 'cs.PL',
 'cs.RO',
 'cs.SC',
 'cs.SD',
 'cs.SE',
 'cs.SI',
 'cs.SY',
 'econ.EM',
 'econ.GN',
 'econ.TH',
 'eess.AS',
 'eess.IV',
 'eess.SP',
 'eess.SY',
 'gr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'math-ph',
 'math.AC',
 'math.AG',
 'math.AP',
 'math.AT',
 'math.CA',
 'math.CO',
 'math.CT',
 'math.CV',
 'math.DG',
 'math.DS',
 'math.FA',
 'math.GM',
 'math.GN',
 'math.G

155

In [7]:
# One row per paper, built from the four parallel lists; the column order
# follows the order of the pairs below.
column_sources = (
    ('ids', all_ids),
    ('titles', all_titles),
    ('abstracts', all_abstracts),
    ('categories', all_categories),
)
data = pd.DataFrame(dict(column_sources))
display(data.shape[0])
data.head()

1893462

Unnamed: 0,ids,titles,abstracts,categories
0,1001.0001,On the structure of non-full-rank perfect codes,The Krotov combining construction of perfect...,"[cs.IT, math.IT]"
1,1001.0002,Gravity duals for logarithmic conformal field ...,Logarithmic conformal field theories with va...,"[hep-th, cond-mat.dis-nn, cond-mat.stat-mech, ..."
2,1001.0003,A landscape of non-supersymmetric AdS vacua on...,We construct new families of non-supersymmet...,[hep-th]
3,1001.0004,The Lie Algebraic Significance of Symmetric In...,Examples of symmetric informationally comple...,"[quant-ph, math-ph, math.CO, math.MP]"
4,1001.0005,Environmental dependence of 8um luminosity fun...,We aim to reveal environmental dependence of...,[astro-ph.CO]


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer
import joblib

# Turn each paper's list of labels into one indicator column per label.
mlb = MultiLabelBinarizer()
output_data = mlb.fit_transform(data['categories'])

# Save the fitted binarizer so the label <-> column mapping can be reloaded.
joblib.dump(mlb, '../src/utils/multilabelbinarizer.pkl')

# Attach the indicator columns by row index, then drop the raw label lists.
multilabel_df = pd.DataFrame(output_data, columns=mlb.classes_)
data = (
    data
    .merge(multilabel_df, how='left', left_index=True, right_index=True)
    .drop(columns='categories')
)
data.head()

Unnamed: 0,ids,titles,abstracts,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,1001.0001,On the structure of non-full-rank perfect codes,The Krotov combining construction of perfect...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001.0002,Gravity duals for logarithmic conformal field ...,Logarithmic conformal field theories with va...,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1001.0003,A landscape of non-supersymmetric AdS vacua on...,We construct new families of non-supersymmet...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1001.0004,The Lie Algebraic Significance of Symmetric In...,Examples of symmetric informationally comple...,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1001.0005,Environmental dependence of 8um luminosity fun...,We aim to reveal environmental dependence of...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Cap on the number of papers drawn for any single category.
MAX_SAMPLES_PER_CATEGORY = 4500
# Fixed seed so a fresh "Restart & Run All" draws the same papers every time.
SAMPLING_SEED = 42

dfs = []

for category in tqdm(arxiv_categories):
    # Select the category's rows once rather than re-filtering the whole
    # frame for the count and again for the selection.
    category_rows = data[data[category] == 1]
    if len(category_rows) > MAX_SAMPLES_PER_CATEGORY:
        category_rows = category_rows.sample(
            n=MAX_SAMPLES_PER_CATEGORY, random_state=SAMPLING_SEED
        )
    dfs.append(category_rows)

# A paper carrying several labels can be selected once per label, so the
# concatenated frame may contain the same row more than once.
data = pd.concat(dfs)
display(len(data))
data.head()

100%|██████████| 155/155 [00:11<00:00, 13.55it/s]


623922

Unnamed: 0,ids,titles,abstracts,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
1853949,2403.0212,Evaluating extensions to LCDM: an application ...,We employ Bayesian Model Averaging (BMA) as ...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26740,1005.4489,Possible use of self-calibration to reduce sys...,"By observing mergers of compact objects, fut...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004752,1909.00003,Properties of the Circumgalactic Medium in Cos...,We investigate the impact of cosmic rays (CR...,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1575662,2211.02045,Fast and robust Bayesian Inference using Gauss...,We present the GPry algorithm for fast Bayes...,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
160133,1203.0063,HerMES: A Statistical Measurement of the Redsh...,The wide-area imaging surveys with the {\it ...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Fixed seed so the shuffled row order (and the exported file) is reproducible.
SHUFFLE_SEED = 42

# Drop duplicates, then shuffle. The same four passes as before, chained:
# drop_duplicates keeps the first occurrence in row order, so the intermediate
# reset_index calls had no effect on which rows survive and a single reset at
# the end is enough.
data = (
    data
    .drop_duplicates(subset=['titles', 'abstracts'])
    .drop_duplicates(subset='titles')
    .drop_duplicates(subset='abstracts')
    .drop_duplicates(subset='ids')
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
display(len(data))
data.head()

536914

Unnamed: 0,ids,titles,abstracts,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,1809.10336,Multi-task Learning for Financial Forecasting,Financial forecasting is challenging and att...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2305.11021,The Wisdom of Strategic Voting,We study the voting game where agents' prefe...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1704.00646,A correlation game for unsupervised learning y...,Much has been learned about plasticity of bi...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2104.03906,Explaining decisions made with AI: A workbook ...,"Over the last two years, The Alan Turing Ins...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1207.6361,Magneto-elastic coupling in the spin-Peierls g...,We report an ultrasonic study of the magneto...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Write the final frame to disk; index=False leaves the row index out of the file.
data.to_csv(path_or_buf='../data/arxiv_data.csv', index=False)