<a href="https://colab.research.google.com/github/scarfboy/wetsuite-dev/blob/main/examples/datacollect_koop_repos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook's goal


Figuring out how to get data out of EUR-Lex. 

Currently aimed specifically at the court judgments, and then mainly the text.

There are [a few different ways to access different parts of EUR-Lex data](https://eur-lex.europa.eu/content/welcome/data-reuse.html),
including a RESTful API, a SOAP API (requires registration), and a SPARQL endpoint.

Probably the most flexible is the SPARQL endpoint,
particularly when looking for specific selections of documents, specific relations, and such.
At the same time, SPARQL presents a bit of a learning curve unless you're already hardcore into RDF.

<!-- -->

SPARQL results refer to a work that is mostly the content text as HTML, e.g. http://publications.europa.eu/resource/cellar/1e3100ce-8a71-433a-8135-15f5cc0e927c.0002.02/DOC_1
Actually, the public-facing web page describing the thing (by CELEX), e.g. https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX%3A61996CJ0080
gives even better detail,
- links to the underlying document
- ...for all translated languages
- the text
- more metadata, like classification, related documents

...so for first experiments, and before learning SPARQL, we could read of details from there.
If we do, we still need a source of CELEX identifers to know what to fetch. The SPARQL endpoint is still quite useful for that.

In [1]:
import random, pprint, json, random, importlib

import tqdm

import wetsuite.helpers.localdata
import wetsuite.helpers.etree
import wetsuite.helpers.eurlex
import wetsuite.helpers.net

# Judgments

In [2]:
judgment_celexes = wetsuite.helpers.localdata.LocalKV('eurlex_judg_celex_workid.db', key_type=str,value_type=str)   # stores CELEX -> work id       (mostly just for the CELEX)
judgment_docs_en = wetsuite.helpers.localdata.LocalKV('eurlex_judg_en.db', key_type=str,value_type=bytes)           # stores url -> html document
judgment_docs_nl = wetsuite.helpers.localdata.LocalKV('eurlex_judg_nl.db', key_type=str,value_type=bytes)           # stores url -> html document

## Fetch identifiers and documents

In [None]:
judg_dict = wetsuite.helpers.eurlex.fetch_by_resource_type('JUDG') # as of this writing there are 27K results (roughly 4GB worth of HTML)
judg_dict

In [5]:
# fetch just the fact that the CELEX identifiers exist   (also the workid they point to, though we don't use that yet)
for work in judg_dict['results']['bindings']:
    try:
        celex  = work['celex']['value']
        workid = work['work']['value']
        judgment_celexes.put(celex, workid)
        #judgment_celexes.put(celex, workid, commit=False)  
    except KeyError as ke:
        print( 'missing %s: %s'%(str(ke), work) )
#judgment_celexes.commit()

In [1]:
# fetch the web pages for all those CELEXes, for EN

pbar = wetsuite.helpers.notebook.progress_bar( max=len(judgment_celexes), description='fetching pages...')
count_cached, count_fetched = 0, 0

for celex in judgment_celexes:
    # the /ALL/ page gives more metadata than e.g. AUTO, TXT, though we might be interested in fetching specific-language 
    if 1:
        url = 'https://eur-lex.europa.eu/legal-content/NL/ALL/?uri=CELEX:%s'%celex
        try:
            _, was_cached = wetsuite.helpers.localdata.cached_fetch( judgment_docs_nl, url )
            if was_cached:
                count_cached += 1
            else:
                count_fetched += 1
        except Exception as e:
            print( e, url )

    if 0:
        url = 'https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:%s'%celex
        try:
            wetsuite.helpers.localdata.cached_fetch( judgment_docs_en, url )
        except Exception as e: # it seems the server will report overloads as 404, so running it another time should
            print( e, url )
    
    pbar.value += 1
    pbar.description = f'{count_fetched} fetched, {count_cached} cached'

NameError: name 'wetsuite' is not defined

### Test parsing 

In [4]:
# debug, don't store yet
importlib.reload(wetsuite.helpers.eurlex)

for url in random.sample( judgment_docs_nl.keys(), 10 ): # pick a bunch of random documents, 
    random_doc = judgment_docs_nl[ url ]
    try:
        #print(url)
        parsed = wetsuite.helpers.eurlex.extract_html(random_doc)   # that function is where most of the scraping code sits
        #pprint.pprint( parsed['text'] )
    except Exception as e:
        print( url )
        raise

Did that give good text and not error out?   Then we can probably run it on the whole set and store the results.

In [6]:
parsed_store = wetsuite.helpers.localdata.LocalKV('eurlex_parsed.db', key_type=str,value_type=str)    # stores CELEX -> json as str

In [None]:
# parse and store
importlib.reload(wetsuite.helpers.eurlex)

keys = judgment_docs_nl.keys()

force = False
random.shuffle(keys)
lk = len(keys)
for url in tqdm.tqdm( keys, unit='page' ):
    if url not in parsed_store  or  force:
        docbytes = judgment_docs_nl[ url ]
        try:
            parsed = wetsuite.helpers.eurlex.extract_html( docbytes )
            parsed_store.put( url, json.dumps( parsed ) )
        except Exception as e:
            print( url )
            pprint.pprint( parsed )
            raise

# Regulations

In [5]:
reg_celexes = wetsuite.helpers.localdata.LocalKV('eurlex_reg_celex_workid.db', key_type=str,value_type=str)     # stores CELEX -> work id       (mostly just for the CELEX)
reg_docs_en = wetsuite.helpers.localdata.LocalKV('eurlex_reg_en.db', key_type=str,value_type=bytes)   # stores url -> html document
reg_docs_nl = wetsuite.helpers.localdata.LocalKV('eurlex_reg_nl.db', key_type=str,value_type=bytes)   # stores url -> html document

In [3]:
# Fetch current list
reg_dict = wetsuite.helpers.eurlex.fetch_by_resource_type('REG') # as of this writing there are 130K results (roughly GB worth of HTML)

In [None]:
# take that fetched state and update (mainly) the fact that the CELEX identifiers exist   (also the workid they point to, though we don't use that yet)
for work in reg_dict['results']['bindings']:
    try:
        celex  = work['celex']['value']
        workid = work['work']['value']
        reg_celexes.put(celex, workid, commit=False)  
    except KeyError as ke:
        print( 'missing %s: %s'%(str(ke), work) )
reg_celexes.commit()

In [3]:
len(reg_docs_nl), len(reg_celexes)

(123220, 130031)

In [5]:
# fetch the web pages for all those CELEXes, for EN
import tqdm, random

kk = list(reg_celexes.keys())
#random.shuffle(kk)

for celex in tqdm.tqdm(kk):
    # the /ALL/ page gives more metadata than e.g. AUTO, TXT, though we might be interested in fetching specific-language 
    if 1:
        url = 'https://eur-lex.europa.eu/legal-content/NL/ALL/?uri=CELEX:%s'%celex
        try:
            wetsuite.helpers.localdata.cached_fetch( reg_docs_nl, url )
            #print(url)
        except Exception as e:
            print( e, url )

    if 0:
        url = 'https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:%s'%celex
        try:
            wetsuite.helpers.localdata.cached_fetch( reg_docs_en, url )
        except Exception as e: # it seems the server will report overloads as 404, so running it another time should
            print( e, url )

100%|██████████| 130031/130031 [4:54:41<00:00,  7.35it/s]   


In [10]:
# test parsing again
#importlib.reload(wetsuite.helpers.eurlex)

selection = random.sample( reg_docs_nl.keys(), 1000 )

for url in tqdm.tqdm(selection): # pick 100 random documents
    random_doc = reg_docs_nl.get( url )
    try:
        #print(url)
        parsed = wetsuite.helpers.eurlex.extract_html(random_doc)   # that function is where most of the scraping code sits
        if random.uniform(0,1)<0.05:
            pprint.pprint( parsed )
        #pprint.pprint( parsed['text'] )
    except Exception as e:
        print( url )
        raise

  0%|          | 0/1000 [00:00<?, ?it/s]

https://eur-lex.europa.eu/legal-content/NL/ALL/?uri=CELEX:31973R0879





TypeError: uniform() missing 2 required positional arguments: 'a' and 'b'