## Parse papers

In [3]:
#!/usr/bin/python

# code modified from http://web.stanford.edu/~zlotnick/TextAsData/Web_Scraping_with_Beautiful_Soup.html

from bs4 import BeautifulSoup
from glob import glob

# Every arXiv API response saved under arxiv_metadata/. glob() yields paths in
# arbitrary, platform-dependent order, so sort them to make filenames[0] and
# the parse order below reproducible between runs.
filenames = sorted(glob("arxiv_metadata/*.xml"))


In [8]:
# Dump the first metadata file verbatim to eyeball the feed structure.
first_path = filenames[0]
with open(first_path, "r") as handle:
    print(handle.read())

<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3D0812.4044%20OR%201103.1778%20OR%201104.2373%20OR%201107.3342%20OR%201112.5309%20OR%201206.1106%20OR%201206.5533%20OR%201207.0580%20OR%201209.1557%20OR%201210.7495%20OR%201211.5063%20OR%201211.6581%20OR%201211.6950%20OR%201212.3023%20OR%201301.0802%20OR%201301.1942%20OR%201301.2444%20OR%201303.5960%20OR%201303.7220%20OR%201304.0806%20OR%201305.0445%20OR%201305.1422%20OR%201305.4778%20OR%201305.5306%20OR%201306.0386%20OR%201306.1849%20OR%201306.4793%20OR%201306.5860%20OR%201307.0048%20OR%201310.5568%20OR%201310.5796%20OR%201311.0989%20OR%201311.2492%20OR%201311.2524%20OR%201311.2901%20OR%201311.3959%20OR%201311.6531%20OR%201312.6034%20OR%201312.6229%20OR%201312.6947%20OR%201312.7219%20OR%201402.0240%20OR%201402.1298%20OR%201402.1754%20OR%201402.3044%20OR%201402.3337%20OR%201402.4303%20OR%201402.4893%20OR%201402.5874%20OR%201402.5876%26id_list%3D%26star

In [9]:
# Parse every metadata file into its own BeautifulSoup tree.
lesoups = []

for filename in filenames:
    # Open in binary mode and hand the file object to the parser: the XML
    # parser then decodes the bytes itself (the feeds declare UTF-8) instead
    # of relying on the platform's default text encoding.
    with open(filename, "rb") as f:
        lesoups.append(BeautifulSoup(f, "xml"))

In [132]:
import re

In [173]:
#for soup in lesoups:


# Turn every <entry> of every parsed feed into a plain dict and collect the
# dicts in `papers`, keyed by the arXiv id found in the entry's <id> text
# (e.g. "1602.03742" from "http://arxiv.org/abs/1602.03742v1").
arxiv_id_re = re.compile(r"([019][0-9][0-1][0-9]\.[0-9]+)")

papers = {}
for soup in lesoups:
    for entry in soup.find_all("entry"):
        res = arxiv_id_re.findall(entry.id.string)
        if len(res) != 1:
            # Skip only this entry; the remaining entries of the feed are
            # still usable.
            print("ERROR wrong match in ", entry)
            continue
        arxivid = res[0]
        # Compare the id string itself: `res` is a list and can never be
        # equal to a key of `papers`. The first occurrence of an id wins.
        if arxivid in papers:
            print("ERROR RES ALREADY IN HERE", entry)
            continue

        paper = {}
        paper["arxivid"] = arxivid
        paper["link"] = entry.id.string
        paper["title"] = entry.title.string
        paper["summary"] = entry.summary.string
        paper["published"] = entry.published.string

        # Optional elements: absent tags are stored as None.
        paper["comment"] = entry.comment.string if entry.comment else None
        paper["doi"] = entry.doi.string if entry.doi else None
        paper["journalref"] = entry.journal_ref.string if entry.journal_ref else None

        # One [name, affiliation-or-None] pair per <author>.
        paper["authorsaffil"] = [
            [author.find("name").string,
             author.affiliation.string if author.affiliation else None]
            for author in entry.find_all("author")
        ]

        # When the first two category elements carry the same term, keep only
        # one copy. Entries with fewer than two category elements are kept
        # as they are instead of raising IndexError.
        terms = [x["term"] for x in entry.find_all(["category", "primary_category"])]
        if len(terms) > 1 and terms[0] == terms[1]:
            terms = terms[1:]
        paper["categoryterms"] = terms

        papers[arxivid] = paper

papers
    #    for child in entry.descendants:
#        if child.name not in (None, "id", "updated", "published", "title", "summary", "author",
#                              "name", "link", "category", "primary_category", "comment"):
#            print(child.parent.name, child.name)
#            print("attrs",child.attrs)

# entry: doi
# entry: journal_ref
# author: affiliation
# author: name


{u'1602.03742': {'arxivid': u'1602.03742',
  'authorsaffil': [[u'Carlos Palma', None],
   [u'Augusto Salazar', None],
   [u'Francisco Vargas', None]],
  'categoryterms': [u'cs.HC', u'cs.CV'],
  'comment': None,
  'doi': None,
  'journalref': None,
  'link': u'http://arxiv.org/abs/1602.03742v1',
  'published': u'2016-02-11T14:22:26Z',
  'summary': u'  Automatic recognition of the quality of movement in human beings is a\nchallenging task, given the difficulty both in defining the constraints that\nmake a movement correct, and the difficulty in using noisy data to determine if\nthese constraints were satisfied. This paper presents a method for the\ndetection of deviations from the correct form in movements from physical\ntherapy routines based on Hidden Markov Models, which is compared to Dynamic\nTime Warping. The activities studied include upper an lower limbs movements,\nthe data used comes from a Kinect sensor. Correct repetitions of the activities\nof interest were recorded, as well

[<journal_ref>IEEE Transactions on Information Theory 62(4):2092--2099, 2016</journal_ref>,
 <journal_ref>THEORIA 31/1 (2016): 7-25</journal_ref>,
 <journal_ref>JLCL - Journal for Language Technology and Computational\n  Linguistics, 2015, 30 (1)</journal_ref>,
 <journal_ref>Annals of Probability 2016, Vol. 44, No. 2, 1107-1133</journal_ref>,
 <journal_ref>Bernoulli 2016, Vol. 22, No. 3, 1535-1571</journal_ref>]

In [222]:
# test = {"a":papers["1509.08535"], "b":papers["1602.03742"], "c":papers["1602.03742"]}
# tuple(test.values())

import json
import psycopg2




# Load the "ML" section of the local credentials file and open the
# PostgreSQL connection used by the insert cell.
with open("login_info.json", "r") as credentials_file:
    login_obj = json.load(credentials_file)["ML"]

connection_settings = {
    "host": login_obj['hostname'],
    "user": login_obj['username'],
    "password": login_obj['password'],
    "dbname": login_obj['database'],
}
myConnection = psycopg2.connect(**connection_settings)


# cur = conn.cursor()
# cur.executemany("""INSERT INTO bar(first_name,last_name) VALUES (%(first_name)s, %(last_name)s)""", namedict)

In [223]:
# Insert every parsed paper, commit, and always release the connection.
# Each dict in `papers` supplies the named parameters of the statement.
insert_sql = ("INSERT INTO paper (title, summary, comment, arxivid, doi, journalref,"
              "link, authorsaffil, categoryterms, published) "
              "VALUES (%(title)s, %(summary)s, %(comment)s, %(arxivid)s, %(doi)s, "
              "%(journalref)s, %(link)s, %(authorsaffil)s, %(categoryterms)s, "
              "%(published)s )")

try:
    with myConnection.cursor() as cur:
        cur.executemany(insert_sql, tuple(papers.values()))
    myConnection.commit()
finally:
    # Runs on failure too, so a failed insert does not leave the connection
    # open; closing without a commit discards the uncommitted rows.
    myConnection.close()

In [210]:
# fix a dumb duplicate
# Blank out the DOI of every paper that carries this particular value and
# print a marker per hit. Presumably the value is shared by more than one
# paper and would clash in the database -- confirm before reusing.
duplicated_doi = "10.1145/1235"
for record in papers.values():
    if record["doi"] != duplicated_doi:
        continue
    papers[record["arxivid"]]["doi"] = None
    print("woop")
        


#    print(("INSERT INTO paper (title, summary, comment, arxivid, doi, journalref," +
#                                        "link, authorsaffil, categoryterms, published) " +
#                    "VALUES (%(title)s, %(summary)s, %(comment)s, %(arxivid)s, %(doi)s, " +
#                            "%(journalref)s, %(link)s, %(authorsaffil)s, %(categoryterms)s, " +
#                            "%(published)s )") % item)

In [10]:
# arXiv API query for the extra batch of papers. The string is not used by the
# code below, which reads newquery.xml instead -- presumably that file is the
# saved response to this query (TODO confirm). Note that id:1602.03822 appears
# four times in the list.
extraurl = "http://export.arxiv.org/api/query?search_query=id:1510.00331 OR id:1512.01124 OR id:1502.02590 OR id:1505.05007 OR id:1601.00909 OR id:1511.06085 OR id:1502.06922 OR id:1602.02697 OR id:1602.03822 OR id:1509.01277 OR id:1602.07362 OR id:1508.00330 OR id:1112.5309 OR id:1406.3284 OR id:1406.4729 OR id:1411.4389 OR id:1411.4952 OR id:1505.05612 OR id:1511.06881 OR id:1502.05698 OR id:1602.03822 OR id:1602.03822 OR id:1602.03822&start=0&max_results=30"

# Parse the saved feed for the extra batch. The file is opened in binary mode
# and passed straight to the parser so that decoding follows the XML document
# itself rather than the platform's default text encoding.
with open("newquery.xml", "rb") as f:
    extrasoup = BeautifulSoup(f, "xml")

In [12]:
entries = extrasoup.find_all("entry")
len(entries)


# Report every descendant whose tag name is not in the list of names already
# known, to spot elements the parser does not handle yet. Text nodes have the
# name None and are therefore ignored.
# so, the only special addition here is doi... good to know
known_names = (None, "id", "updated", "published", "title", "summary", "author",
               "name", "link", "category", "primary_category", "comment")
for entry in entries:
    unexpected = (node for node in entry.descendants if node.name not in known_names)
    for node in unexpected:
        print(node.parent.name, node.name)
        print("attrs",node.attrs)



20

In [15]:
import re
# Rebuild `papers` from the extra batch: one plain dict per <entry>, keyed by
# the arXiv id found in the entry's <id> text. No journal reference is
# collected for this batch.
arxiv_id_re = re.compile(r"([019][0-9][0-1][0-9]\.[0-9]+)")

papers = {}

for entry in entries:
    res = arxiv_id_re.findall(entry.id.string)
    if len(res) != 1:
        # Skip only this entry; the remaining entries are still usable.
        print("ERROR wrong match in ", entry)
        continue
    arxivid = res[0]
    # Compare the id string itself: `res` is a list and can never be equal to
    # a key of `papers`. The first occurrence of an id wins.
    if arxivid in papers:
        print("ERROR RES ALREADY IN HERE", entry)
        continue

    paper = {}
    paper["arxivid"] = arxivid
    paper["link"] = entry.id.string
    paper["title"] = entry.title.string
    paper["summary"] = entry.summary.string
    paper["published"] = entry.published.string

    # Optional elements: absent tags are stored as None.
    paper["comment"] = entry.comment.string if entry.comment else None
    paper["doi"] = entry.doi.string if entry.doi else None

    # One [name, affiliation-or-None] pair per <author>.
    paper["authorsaffil"] = [
        [author.find("name").string,
         author.affiliation.string if author.affiliation else None]
        for author in entry.find_all("author")
    ]

    # When the first two category elements carry the same term, keep only one
    # copy. Entries with fewer than two category elements are kept as they
    # are instead of raising IndexError.
    terms = [x["term"] for x in entry.find_all(["category", "primary_category"])]
    if len(terms) > 1 and terms[0] == terms[1]:
        terms = terms[1:]
    paper["categoryterms"] = terms

    papers[arxivid] = paper

In [17]:
# test = {"a":papers["1509.08535"], "b":papers["1602.03742"], "c":papers["1602.03742"]}
# tuple(test.values())

import json
import psycopg2

# Read the database credentials (the "ML" section of login_info.json) and
# connect to PostgreSQL for the second batch of inserts.
with open("login_info.json", "r") as login_file:
    login_obj = json.load(login_file)["ML"]

myConnection = psycopg2.connect(
    host=login_obj['hostname'],
    user=login_obj['username'],
    password=login_obj['password'],
    dbname=login_obj['database'],
)

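# journalref is left out for this batch: the parse loop above does not collect
# it and the INSERT below has no journalref column.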


In [18]:
# Queue one INSERT per parsed paper; each dict in `papers` supplies the named
# parameters. Nothing is committed in this cell.
insert_sql = (
    "INSERT INTO paper (title, summary, comment, arxivid, doi,"
    "link, authorsaffil, categoryterms, published) "
    "VALUES (%(title)s, %(summary)s, %(comment)s, %(arxivid)s, %(doi)s, "
    "%(link)s, %(authorsaffil)s, %(categoryterms)s, "
    "%(published)s )"
)
rows = tuple(papers.values())

with myConnection.cursor() as cur:
    cur.executemany(insert_sql, rows)

In [19]:
# Commit the pending transaction on myConnection (the inserts issued in the
# previous cell), then close the connection.
myConnection.commit()
myConnection.close()