In [1]:
import requests
from bs4 import BeautifulSoup

In [41]:
def get_years():
    _TOP = "https://www.speech.kth.se/qpsr/"
    _TOP_HTML = requests.get(_TOP)
    assert _TOP_HTML.status_code == 200

    _TOP_SOUP = BeautifulSoup(_TOP_HTML.text, 'html.parser')
    by_years = _TOP_SOUP.find_all("select", {"name": "year"})
    years = [opt.text for by_year in by_years for opt in by_year.find_all("option")]
    return years

In [120]:
def read_page(page):
    if page.startswith("http"):
        url = page
        year = page[len(page)-4:]
    else:
        url = f"https://www.speech.kth.se/qpsr/show_by_year.php?year={page}"
        year = page

    req = requests.get(url)
    assert req.status_code == 200

    soup = BeautifulSoup(req.text, 'html.parser')
    pubs = []
    for pub in soup.find_all("p", class_="publications_apa_entry"):
        data = {}
        data["year"] = year
        raw_text = pub.text
        author = pub.find("span", class_="publications_apa_author")
        if author.text == ", . (Ed.).":
            data["author"] = ""
        else:
            data["author"] = author.text
        raw_text = raw_text.replace(author.text, "").lstrip()
        if not raw_text.startswith(f"({year})."):
            raise Exception(f"Expected year {year}, but got {raw_text[1:5]} - " + pub.text)
        raw_text = raw_text[8:]
        pub_title = pub.find("span", class_="publications_apa_title")
        data["publication_full"] = pub_title.text
        pub_pieces = pub_title.text.split(", ")
        if pub_pieces[-1].isdigit():
            data["volume"] = pub_pieces[-1]
            data["publication"] = ", ".join(pub_pieces[0:-1])
        pub_title_start = raw_text.find(pub_title.text)
        pub_title_end = pub_title_start + len(pub_title.text)
        data["title"] = raw_text[0:pub_title_start].strip()
        if data["title"].endswith(". In"):
            data["title"] = data["title"][0:-3]
        for pdf_link in pub.find_all("a"):
            if pdf_link is None or not pdf_link.has_attr("href"):
                print("Missing link: " + pub.text)
            else:
                if pdf_link["href"].endswith("pdf"):
                    data["pdf"] = pdf_link["href"]
                else:
                    if pdf_link.has_attr("onclick"):
                        abs_start = pdf_link["onclick"].find("abstract_")
                        abs_end = pdf_link["onclick"][abs_start:].find("'")
                        abs_id = pdf_link["onclick"][abs_start:abs_start+abs_end]
                        abs_soup = soup.find("p", {"id": abs_id})
                        abs_text = abs_soup.text.strip()
                        if abs_text.startswith("Abstract:"):
                            abs_text = abs_text[9:].strip()
                        abs_text = abs_text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
                        data["abstract"] = abs_text
        raw_text = raw_text[pub_title_end:].strip()
        if raw_text.endswith(" [pdf]"):
            raw_text = raw_text[0:-6]
        if raw_text[-1:] == ".":
            raw_text = raw_text[0:-1]
        if ", " in raw_text:
            if raw_text.startswith("(pp."):
                parts = raw_text.split("). ")
                data["pages"] = parts[0][5:]
                # manual fix
                if data["pdf"].endswith("/3597.pdf"):
                    data["volume"] = "51"
                    data["editiion"] = "1"
            else:
                parts = raw_text.split(", ")
                if parts[0].startswith("("):
                    to_mark = parts[0].find(")")
                    data["edition"] = parts[0][1:to_mark]
                if " [abstract]" in parts[1]:
                    data["pages"] = parts[1].replace(" [abstract]", "")
                    if data["pages"].endswith("."):
                        data["pages"] = data["pages"][0:-1]
                else:
                    data["pages"] = parts[1]
        pubs.append(data)
    return pubs


In [121]:
read_page("2011")

[{'year': '2011',
  'author': 'Blomberg, M.',
  'publication_full': 'TMH-QPSR Vol. 51, Fonetik 2011',
  'title': 'Model space size scaling for speaker adaptation.',
  'pdf': 'http://www.speech.kth.se/prod/publications/files/3597.pdf',
  'pages': '77-80',
  'volume': '51',
  'editiion': '1'},
 {'year': '2011',
  'author': 'Eklund, R., Peters, G., Ananthakrishnan, G., & Mabiza, E.',
  'publication_full': 'TMH-QPSR, 51',
  'volume': '51',
  'publication': 'TMH-QPSR',
  'title': 'An acoustic analysis of lion roars. I: Data collection and spectrogram and waveform analyses.',
  'pdf': 'http://www.speech.kth.se/prod/publications/files/3576.pdf',
  'edition': '1',
  'pages': '1-4'},
 {'year': '2011',
  'author': '',
  'publication_full': 'TMH-QPSR, 51',
  'volume': '51',
  'publication': 'TMH-QPSR',
  'title': 'Parent-child interaction: Relationship between pause duration and infant vocabulary at 18 months.',
  'pdf': 'http://www.speech.kth.se/prod/publications/files/3603.pdf',
  'edition': '1

In [115]:
import json
all = []
for year in get_years():
    all += read_page(year)
with open("qpsr.json", "w") as out:
    out.write(json.dumps(all, indent=4))

In [86]:
read_page("2011")

(1), 53-55. [abstract] [pdf]
(1), 57-60. [abstract] [pdf]
Missing link: Suomi, K., Meister, E., & Ylitalo, R. (2011). Non-contrastive durational patterns in two quantity languages. TMH-QPSR, 51(1), 61-64.


[{'year': '2011',
  'author': 'Blomberg, M.',
  'publication_full': 'TMH-QPSR Vol. 51, Fonetik 2011',
  'title': 'Model space size scaling for speaker adaptation. In',
  'pdf': 'http://www.speech.kth.se/prod/publications/files/3597.pdf',
  'edition': 'pp. 77-80',
  'pages': 'Music and Hearing and Centre for Speech Technology (CTT)'},
 {'year': '2011',
  'author': 'Eklund, R., Peters, G., Ananthakrishnan, G., & Mabiza, E.',
  'publication_full': 'TMH-QPSR, 51',
  'volume': '51',
  'publication': 'TMH-QPSR',
  'title': 'An acoustic analysis of lion roars. I: Data collection and spectrogram and waveform analyses.',
  'pdf': 'http://www.speech.kth.se/prod/publications/files/3576.pdf',
  'edition': '1',
  'pages': '1-4'},
 {'year': '2011',
  'author': '',
  'publication_full': 'TMH-QPSR, 51',
  'volume': '51',
  'publication': 'TMH-QPSR',
  'title': 'Parent-child interaction: Relationship between pause duration and infant vocabulary at 18 months.',
  'pdf': 'http://www.speech.kth.se/prod/pu