# Extracting SOUs from KB-labb's API
This notebook shows different ways to extract Swedish Government Official Reports (SOU, *Statens offentliga utredningar*) from KB-labb's API.

In [1]:
from kblab import Archive
import json
import requests
import pandas as pd
import re
import multiprocessing
import time
from requests.auth import HTTPBasicAuth

In [2]:
# Connect to the betalab archive. NOTE(review): ('username', 'password') look
# like placeholders — supply real credentials before running.
a = Archive('https://betalab.kb.se/', auth=('username', 'password'))
# Keep every id yielded by the archive that contains the substring "sou".
sou_ids = [sou_id for sou_id in a if "sou" in sou_id]

In [3]:
# How many ids matched the "sou" filter.
len(sou_ids)

7830

In [4]:
# "sou-cover" carries no "<year>-<number>" part, which the request-based
# extractors parse from the id. The membership test keeps this cell safe to
# re-run: list.remove raises ValueError when the value is already gone.
if "sou-cover" in sou_ids:
    sou_ids.remove("sou-cover")

Extract SOU text, year and issue number using the kblab-client package:

In [None]:
def get_sou(sou_id):
    """Fetch one SOU through the kblab client and return its text, year and issue.

    The package is read from the module-level archive ``a``. The text is every
    part's ``content`` from ``content.json``, each preceded by a newline. Year
    and issue are parsed from the ``SOU <year>:<issue>`` identifier found in
    ``meta.json`` under ``seriesEnumeration`` or, when that key is absent,
    under ``title``.

    Args:
        sou_id: Id of the package, passed to ``a.get``.

    Returns:
        A dict with the keys ``'text'``, ``'year'`` and ``'issue'``; year and
        issue are the digit strings found in the identifier.

    Raises:
        AttributeError: If the identifier does not contain ``SOU <year>:<issue>``.
        KeyError: If ``meta.json`` has neither ``seriesEnumeration`` nor ``title``.
    """
    package = a.get(sou_id)

    pieces = []
    for part in json.load(package.get_raw('content.json')):
        content = part.get('content', '')
        # A part's content can be a list of strings rather than one string
        # (the request-based extractors handle that case too); join it.
        if not isinstance(content, str):
            content = " ".join(content)
        pieces.append("\n" + content)

    meta = json.load(package.get_raw('meta.json'))
    identifier = meta['seriesEnumeration'] if 'seriesEnumeration' in meta else meta['title']
    # One search yields both fields: group 1 is the year, group 2 the issue.
    year, issue = re.search(r"SOU (\d{4}):(\d+)", identifier).groups()

    return {'text': "".join(pieces), 'year': year, 'issue': issue}

Extract the same information as above using the Python `requests` library:

In [None]:
def get_sou_re(sou_id):
    """Fetch one SOU published up to 1999 over HTTP and return text, year and issue.

    The year embedded in the id (``sou-<year>-<number>``) only decides whether
    the document is in scope. The year and issue that are returned come from
    the ``SOU <year>:<issue>`` identifier in ``meta.json`` (key
    ``seriesEnumeration``, falling back to ``title``).

    Args:
        sou_id: Archive id of the report, e.g. ``"sou-1990-1"``.

    Returns:
        A dict with the keys ``'text'``, ``'year'`` and ``'issue'``, or ``None``
        when the id holds no year, the year is after 1999, or either file
        still cannot be downloaded after five attempts.

    Raises:
        AttributeError: If the metadata identifier does not contain
            ``SOU <year>:<issue>``.
        KeyError: If ``meta.json`` has neither ``seriesEnumeration`` nor ``title``.
    """
    # The id never changes between retries, so parse it once up front.
    id_year = re.search(r"(?<=sou-)\d{4}(?=-\d+)", sou_id)
    if id_year is None:
        print(f"{sou_id} skipped. No year found in the id")
        return None
    if int(id_year.group()) > 1999:
        return None

    auth = HTTPBasicAuth("username", "password")
    base_url = f"https://betalab.kb.se/{sou_id}"

    for attempt in range(5):
        # Exponential backoff: 0.1 s, 0.2 s, 0.4 s, ...
        backoff_time = 0.1 * (2 ** attempt)

        content_response = requests.get(f"{base_url}/content.json", auth=auth)
        if content_response.status_code != 200:
            print(f"{sou_id} failed. Status code {content_response.status_code}")
            time.sleep(backoff_time)
            continue

        meta_response = requests.get(f"{base_url}/meta.json", auth=auth)
        # Check the metadata response as well, so an error page is retried
        # instead of being handed to the JSON parser.
        if meta_response.status_code != 200:
            print(f"{sou_id} failed. Status code {meta_response.status_code}")
            time.sleep(backoff_time)
            continue

        pieces = []
        for part in json.loads(content_response.text):
            content = part.get("content", "")
            # A part's content can be a list of strings; join it with spaces.
            if not isinstance(content, str):
                content = " ".join(content)
            pieces.append("\n" + content)

        meta = json.loads(meta_response.text)
        identifier = meta['seriesEnumeration'] if 'seriesEnumeration' in meta else meta['title']
        # One search yields both fields: group 1 is the year, group 2 the issue.
        year, issue = re.search(r"SOU (\d{4}):(\d+)", identifier).groups()

        return {'text': "".join(pieces), 'year': year, 'issue': issue}

    return None

        

Extract SOUs page by page (one record per page instead of one per issue), using the Python `requests` library:

In [54]:
def get_sou_pages(sou_id):
    """Fetch one SOU from 1990-1999 over HTTP and return one row per page.

    The year embedded in the id (``sou-<year>-<number>``) only decides whether
    the document is in scope. The year and issue stored in the rows come from
    the ``SOU <year>:<issue>`` identifier in ``meta.json`` (key
    ``seriesEnumeration``, falling back to ``title``). Consecutive parts of
    ``content.json`` that share a page number, taken from each part's ``@id``,
    are merged into one row, with every part's content preceded by a newline.

    Args:
        sou_id: Archive id of the report, e.g. ``"sou-1990-1"``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``year``, ``issue``, ``page``
        and ``text`` (all strings), or ``None`` when the id holds no year, the
        year is outside 1990-1999, or either file still cannot be downloaded
        after five attempts. A document without parts gives an empty frame.

    Raises:
        AttributeError: If the metadata identifier does not contain
            ``SOU <year>:<issue>`` or a part's ``@id`` holds no page number.
        KeyError: If ``meta.json`` has neither ``seriesEnumeration`` nor
            ``title``, or a part has no ``@id``.
    """
    # The id never changes between retries, so parse it once up front.
    id_year = re.search(r"(?<=sou-)\d{4}(?=-\d+)", sou_id)
    if id_year is None:
        print(f"{sou_id} skipped. No year found in the id")
        return None
    if not 1990 <= int(id_year.group()) < 2000:
        return None

    auth = HTTPBasicAuth("username", "password")
    base_url = f"https://betalab.kb.se/{sou_id}"

    for attempt in range(5):
        # Exponential backoff: 0.1 s, 0.2 s, 0.4 s, ...
        backoff_time = 0.1 * (2 ** attempt)

        meta_response = requests.get(f"{base_url}/meta.json", auth=auth)
        status = meta_response.status_code
        if status == 200:
            content_response = requests.get(f"{base_url}/content.json", auth=auth)
            status = content_response.status_code
        # Both files must arrive before anything is parsed, so an error page
        # is retried instead of being handed to the JSON parser.
        if status != 200:
            print(f"{sou_id} failed. Status code {status}")
            time.sleep(backoff_time)
            continue

        meta = json.loads(meta_response.text)
        identifier = meta['seriesEnumeration'] if 'seriesEnumeration' in meta else meta['title']
        # One search yields both fields: group 1 is the year, group 2 the issue.
        year, issue = re.search(r"SOU (\d{4}):(\d+)", identifier).groups()

        rows = []
        current_page = None
        chunks = []
        for part in json.loads(content_response.text):
            # NOTE(review): the pattern keeps at most three digits after a
            # single-digit prefix, so a page number above 999 would be cut
            # short — confirm against the real '@id' format.
            page_nr = re.search(r"(?<=#[0-9]-)[0-9]{1,3}", part['@id']).group()
            if page_nr != current_page:
                # Starting from None rather than '1' avoids an empty page-1
                # row when the first part sits on another page.
                if current_page is not None:
                    rows.append({'year': year, 'issue': issue,
                                 'page': current_page, 'text': "".join(chunks)})
                current_page = page_nr
                chunks = []
            content = part.get("content", "")
            # A part's content can be a list of strings; join it with spaces.
            if not isinstance(content, str):
                content = " ".join(content)
            chunks.append("\n" + content)

        # Flush the last page, if the document had any parts at all.
        if current_page is not None:
            rows.append({'year': year, 'issue': issue,
                         'page': current_page, 'text': "".join(chunks)})

        # Explicit columns keep the layout stable even when there are no rows.
        return pd.DataFrame(rows, columns=['year', 'issue', 'page', 'text'])

    return None

In [55]:
# Some ids have no file attached to them: the server answers 404 (page not
# found), so there is nothing to do about those.

# The context manager shuts the worker pool down even if map() raises;
# map() itself blocks until every id has been processed.
with multiprocessing.Pool() as pool:
    sous = pool.map(get_sou_pages, sou_ids)

In [56]:
# Drop the entries for which the extractor returned None.
sous_old = [x for x in sous if x is not None]

In [None]:
# Use this when collecting by issue (get_sou / get_sou_re): each element of
# sous_old is a dict, so the list becomes one row per issue.
df = pd.DataFrame(sous_old)

In [57]:
# Use this when collecting by page: each element of sous_old is a DataFrame.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and keeps each frame's own row index. The guard is needed because
# pd.concat raises ValueError on an empty list.
df = pd.concat(sous_old) if sous_old else pd.DataFrame()

In [58]:
# Display the combined frame.
df

Unnamed: 0,year,issue,page,text
0,1990,1,1,\nFÖRETAGSFÖRVÄRV\nI\nSVENSKT NÄRINGSLIV\n\nBI...
1,1990,1,3,\n&Ö Statens offentliga utredningar\n\nBYyEYs|...
2,1990,1,4,\n \n\nAllmänna Förlaget har utgivit en biblio...
3,1990,1,5,\n \n\nSOU 1990:1\n\nÄgarutredningens förord\n...
4,1990,1,7,\n \n\nSOU 1990:1\n\nInnehåll\nBilaga I Skatte...
...,...,...,...,...
95,1999,98,101,\n \n\nStatens offentliga utredningar 1999\n\n...
96,1999,98,102,\nStatens offentliga utredningar 1999\n\nKrono...
97,1999,98,103,\n \n\nStatens offentliga utredningar 1999\n\n...
98,1999,98,104,\n \n\nStatens offentliga utredningar 1999\n\n...


In [59]:
# Non-null values per column for each year; when collecting by page every row
# is one page, so this is the page count per year.
df.groupby('year').count()

Unnamed: 0_level_0,issue,page,text
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1990,24501,24501,24501
1991,24940,24940,24940
1992,29288,29288,29288
1993,27395,27395,27395
1994,34879,34879,34879
1995,31123,31123,31123
1996,38589,38589,38589
1997,41570,41570,41570
1998,34151,34151,34151
1999,34869,34869,34869


In [60]:
# Write the frame to CSV; with the default settings the row index is written
# as the first, unnamed column.
df.to_csv("sou_1990-1999_pages.csv")