In [1]:
import pandas as pd
import numpy as np
import html
import lxml.etree as etree
from collections import defaultdict

# Reading Data

In [2]:
# dataset_addr = "sample.xml"
dataset_addr = "dblp.xml"

In [3]:
parser = etree.XMLParser(recover=True)

In [4]:
tree = etree.parse(dataset_addr, parser=parser)

# Schema

In [None]:
s = set()
for n in tree.getroot():
    for a in n:
        s.add(a.tag)

In [59]:
s

{<cyfunction Entity at 0x11142ff50>,
 'address',
 'author',
 'booktitle',
 'cdrom',
 'chapter',
 'cite',
 'crossref',
 'editor',
 'ee',
 'isbn',
 'journal',
 'month',
 'note',
 'number',
 'pages',
 'publisher',
 'school',
 'series',
 'title',
 'url',
 'volume',
 'year'}

In [60]:
len(tree.getroot())

7256826

In [65]:
t = set()
for n in tree.getroot():
    t.add(n.tag)

In [66]:
t

{'article',
 'book',
 'incollection',
 'inproceedings',
 'mastersthesis',
 'phdthesis',
 'proceedings',
 'www'}

In [68]:
u = defaultdict(set)
for n in tree.getroot():
    for a in n:
        u[n.tag].add(a.tag)

In [69]:
u

defaultdict(set,
            {'phdthesis': {'author',
              'ee',
              'isbn',
              'month',
              'note',
              'number',
              'pages',
              'publisher',
              'school',
              'series',
              'title',
              'url',
              'volume',
              'year'},
             'book': {<cyfunction Entity at 0x11142ff50>,
              'author',
              'booktitle',
              'cdrom',
              'cite',
              'crossref',
              'editor',
              'ee',
              'isbn',
              'month',
              'note',
              'pages',
              'publisher',
              'school',
              'series',
              'title',
              'url',
              'volume',
              'year'},
             'mastersthesis': {'author',
              'ee',
              'note',
              'school',
              'title',
              'year'},
             

# Creating Dataframe

In [5]:
rows = []
for node in tree.getroot():
    if node.tag in ["inproceedings", "article"]:
        year = node.find("year")
        if year != None:
            year = year.text
        rows.append({
            "title": node.find("title").text,
            "authors": [author.text for author in node.findall("author")],
            "year": year,
            "type": node.tag,
        })

In [6]:
len(rows)

4667239

In [7]:
df = pd.DataFrame(rows)

In [8]:
df

Unnamed: 0,title,authors,year,type
0,Generative Artificial Intelligence.,"[Tijn van der Zant, Matthijs Kouw, Lambert Sch...",2011,inproceedings
1,Practical Introspection as Inspiration for AI.,[Sam Freed],2011,inproceedings
2,'Quantum Linguistics' and Searle's Chinese Roo...,"[John Mark Bishop, Slawomir J. Nasuto, Bob Coe...",2011,inproceedings
3,Feasibility of Whole Brain Emulation.,[Anders Sandberg],2011,inproceedings
4,The New Experimental Science of Physical Cogni...,[Fabio Bonsignorio],2011,inproceedings
...,...,...,...,...
4667234,Interactive Support for Non-Programmers: The R...,"[E. F. Codd, C. J. Date]",1974,article
4667235,Common Subexpression Identification in General...,[Patrick A. V. Hall],1974,article
4667236,Catchment classification by runoff behaviour w...,"[Rita Ley, Markus Casper, Hugo Hellebrand, Ral...",2011,article
4667237,Relational Completeness of Data Base Sublangua...,[E. F. Codd],1972,article


In [9]:
df["num_of_authors"] = df["authors"].map(lambda x: len(x))

# Cleaning up Data

In [10]:
df[df["num_of_authors"] == 0]

Unnamed: 0,title,authors,year,type,num_of_authors
57,(error),[],,article,0
58,(was never published),[],,article,0
59,…,[],,article,0
72,"The 1995 SQL Reunion: People, Project, and Pol...",[],1997,article,0
315,18. Workshop,[],1992,article,0
...,...,...,...,...,...
4661974,Continuous monitoring and the status quo effect.,[],2010,article,0
4662118,"Review of: ""Comparisons of three different met...",[],2010,article,0
4663561,The Next Killer Ap.,[],2006,article,0
4663799,Human-Computer Interaction in Radiotherapy Tar...,[],2011,article,0


In [11]:
df[df["year"].map(lambda x: x == None or len(x) != 4)]

Unnamed: 0,title,authors,year,type,num_of_authors
57,(error),[],,article,0
58,(was never published),[],,article,0
59,…,[],,article,0


In [12]:
df[df["title"] == None]

Unnamed: 0,title,authors,year,type,num_of_authors


In [13]:
df[df["title"].map(lambda x: type(x) != str)]

Unnamed: 0,title,authors,year,type,num_of_authors
420,,[Jutta Krei],1991,article,1
585,,[Noam Paz],1989,article,1
594,,[Carola Eschenbach],1988,article,1
7149,,"[Kaori Kobayashi, Daisuke Kitayama, Kazutoshi ...",2011,inproceedings,3
7201,,"[Junjun Yin, James D. Carswell]",2011,inproceedings,2
...,...,...,...,...,...
4661939,,"[Steve G. Sutton, Matthew Holt, Vicky Arnold]",2016,article,3
4664576,,"[Ladislav Marsik, Petr Martisek, Jaroslav Poko...",2018,article,9
4664689,,"[Mira Kim, Shao-Ting Wang, David A. Ostrowski,...",2016,article,5
4664736,,"[Shao-Ting Wang, Jennifer Jin, Pete Rivett, At...",2015,article,4


In [14]:
df = df[df["year"].map(lambda x: x != None and len(x) == 4)]

In [15]:
df = df[df["num_of_authors"] > 0]

In [16]:
df = df[df["title"] != None]

In [17]:
df = df[df["title"].map(lambda x: type(x) == str)]

In [18]:
df

Unnamed: 0,title,authors,year,type,num_of_authors
0,Generative Artificial Intelligence.,"[Tijn van der Zant, Matthijs Kouw, Lambert Sch...",2011,inproceedings,3
1,Practical Introspection as Inspiration for AI.,[Sam Freed],2011,inproceedings,1
2,'Quantum Linguistics' and Searle's Chinese Roo...,"[John Mark Bishop, Slawomir J. Nasuto, Bob Coe...",2011,inproceedings,3
3,Feasibility of Whole Brain Emulation.,[Anders Sandberg],2011,inproceedings,1
4,The New Experimental Science of Physical Cogni...,[Fabio Bonsignorio],2011,inproceedings,1
...,...,...,...,...,...
4667234,Interactive Support for Non-Programmers: The R...,"[E. F. Codd, C. J. Date]",1974,article,2
4667235,Common Subexpression Identification in General...,[Patrick A. V. Hall],1974,article,1
4667236,Catchment classification by runoff behaviour w...,"[Rita Ley, Markus Casper, Hugo Hellebrand, Ral...",2011,article,4
4667237,Relational Completeness of Data Base Sublangua...,[E. F. Codd],1972,article,1


# Save Dataset

In [19]:
pd_addr = "articles.pkl"

In [20]:
df.to_pickle(pd_addr)