# Step 01 - Term in office downloader

This first notebook downloads the links to all the publication pages on the https://www.congreso.es site.

The process does not fetch the pages with the actual content, only the links to those pages. This first step retrieves the JSON files, returned by the site's search engine, that point to the pages. The HTML pages themselves will need to be downloaded in a following step.

It will generate many files: a CSV file with the documents of all the terms in the `./data` folder, and one CSV file for each term in the `./data/terms` folder, along with all the downloaded JSON files.

In [101]:
# FUNCTIONS block

import pandas as pd

def _congress_doc_id(doc):
    """
    Return the identifier used to build a document URL, or None when the
    document carries neither a usable `cve` nor a `texto_integro` value.
    """
    cve = doc.get('cve')
    if cve not in (None, ''):
        return f'{cve}'
    texto_integro = doc.get('texto_integro')
    if not texto_integro:
        return None
    # The identifier is whatever follows the last '+'; when there is no '+',
    # rfind returns -1 and the whole string is used.
    return texto_integro[texto_integro.rfind('+') + 1:]


def get_df_from_congress_json(list)->pd.DataFrame:
    """
    get_df_from_congress_json(list) takes one page of search results from the
    congress site (the parsed json object, a dict) and returns a pandas
    dataframe with one row per document.

    Every entry whose key starts with `documento` becomes a row. Two columns
    are added:
      - `url`: link to the full-text page of the document, built from the
        term and the document `cve` (or, when `cve` is missing, from the tail
        of `texto_integro`). It is None when neither is available.
      - `term`: the value of the `legislatura` field of the page.

    A page without documents yields an empty dataframe that still has the
    `url` and `term` columns.

    Raises KeyError if the page lacks `paginaActual` or `legislatura`.
    As a side effect it prints the page number (progress indicator).
    """
    page_base = 'https://www.congreso.es/busqueda-de-publicaciones?p_p_id=publicaciones&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_publicaciones_mode=mostrarTextoIntegro&_publicaciones_legislatura={}&_publicaciones_texto=&_publicaciones_id_texto={}'

    # The parameter keeps its original name for backward compatibility; use a
    # descriptive alias inside the body.
    page_json = list

    docs = [doc for key, doc in page_json.items() if key.startswith('documento')]

    page_no = page_json['paginaActual']
    #doc_len = page_json['publicaciones_encontradas']
    term = page_json['legislatura']

    print(page_no, end='')

    # Build the URLs from the raw documents rather than from dataframe rows:
    # in a dataframe a document without `cve` shows up as NaN as soon as any
    # other document on the page has one, which would produce a "nan" URL.
    urls = []
    for doc in docs:
        doc_id = _congress_doc_id(doc)
        urls.append(None if doc_id is None else page_base.format(term, doc_id))

    df = pd.DataFrame(docs)
    df['url'] = urls
    df['term'] = term

    return df

def get_term_json(term:str, page:int):
    """
    get_term_json(term, page) requests one page of the publication listing of
    a term (legislature) from the congress site and returns the parsed json.

    The request is a form-encoded POST to the `publicaciones-organo` endpoint
    with the term and the page number as form fields. The raw response body
    is also saved, on a best-effort basis, to
    `data/terms/term_{term}_{page}.json`; a failure to write that file is
    printed and does not interrupt the download.

    Raises json.JSONDecodeError if the response body is not valid json.
    """
    import json
    import urllib3
    import os
    import io

    pagequery = "https://www.congreso.es/publicaciones-organo?p_p_id=publicaciones&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=filtrarListado&p_p_cacheability=cacheLevelPage&_publicaciones_seccion=Congreso&_publicaciones_descOrg=Pleno-y-Diputacion-Permanente&_publicaciones_publicacion=D"
    http = urllib3.PoolManager()
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'accept': 'application/json, text/javascript, */*; q=0.01',
    }

    pageform = {
        "_publicaciones_legislatura": term,
        "_publicaciones_comision": '',
        "_publicaciones_seccion": '',
        "_publicaciones_fromOrganos": 1,
        "_publicaciones_paginaActual": page,
    }

    r = http.request_encode_body('POST', pagequery, headers=headers, fields=pageform, encode_multipart=False)
    data = r.data.decode('utf-8')

    json_path = f"data/terms/term_{term}_{page}.json"
    try:
        os.makedirs('data/terms', exist_ok=True)
        # Write with the same encoding the body was decoded with, so the
        # saved copy does not depend on the platform default encoding.
        with io.open(file=json_path, mode='w', encoding='utf-8') as file:
            file.write(data)
    except OSError as e:
        print(f"could not write file {json_path}: {e}")

    return json.loads(data)

def get_all_pages_for_a_term(term:str)->pd.DataFrame:
    """
    get_all_pages_for_a_term(term)->pd.DataFrame takes a term (legislature)
    and returns a dataframe with all the documents in that term.

    Pages are requested one after another, starting at page 1, until the
    last document of the current page (`paginacion.docs_fin`) reaches the
    total reported by the site (`publicaciones_encontradas`). A '.' is
    printed between pages as a progress indicator.

    The resulting dataframe is also saved, on a best-effort basis, to
    `data/terms/term_{term}.csv`; a failure to write that file is printed
    and the dataframe is still returned.
    """
    import pandas as pd
    import os

    # Collect the per-page frames and concatenate once at the end instead of
    # re-copying the accumulated frame on every page.
    page_frames = []
    next_page = 1
    while next_page:
        page_json = get_term_json(term, next_page)
        page_frames.append(get_df_from_congress_json(page_json))
        last_doc = int(page_json['paginacion']['docs_fin'])
        total_docs = int(page_json['publicaciones_encontradas'])
        if last_doc < total_docs:
            next_page += 1
            print('.', end="")
        else:
            # 0 is falsy and ends the loop.
            next_page = 0

    # The loop always runs at least once, so there is at least one frame.
    term_ds = page_frames[0] if len(page_frames) == 1 else pd.concat(page_frames)

    csv_path = f"data/terms/term_{term}.csv"
    try:
        os.makedirs('data/terms', exist_ok=True)
        term_ds.to_csv(csv_path, index=False)
    except OSError as e:
        print(f"could not write file {csv_path}: {e}")
    return term_ds

def get_all_terms(max:int=14, first_term:int=5)->pd.DataFrame:
    """
    get_all_terms(max:int=14, first_term:int=5)->pd.DataFrame downloads the
    documents of every term from `first_term` to `max` (both included) and
    returns them in a single dataframe.

    `first_term` defaults to 5, the first term that has raw text data.
    When the range is empty (`max` < `first_term`) an empty dataframe is
    returned.

    Progress is printed for each term, followed by the running total of
    documents downloaded so far.
    """
    import pandas as pd

    # Collect the per-term frames and concatenate once at the end instead of
    # re-copying the accumulated frame on every term.
    term_frames = []
    total_docs = 0
    for i in range(first_term, max + 1):
        print(f"Getting term {i}...", end="")
        ds = get_all_pages_for_a_term(i)
        term_frames.append(ds)
        total_docs += len(ds)
        print(f"Done. {total_docs} documents.")

    if not term_frames:
        # Nothing was requested; honour the declared return type.
        return pd.DataFrame()
    return pd.concat(term_frames)
    

In [102]:
##### Constants

# Highest term (legislature) number to download; it is passed to
# get_all_terms below and is part of the name of the combined CSV file.
max_term=14

In [103]:
import os

# Download the document listings of every term up to max_term.
full_ds = get_all_terms(max_term)

# Save the combined listing next to the per-term files.
os.makedirs('data', exist_ok=True)
full_csv_path = f"data/full_term_up_to_{max_term}.csv"
full_ds.to_csv(full_csv_path, index=False)


Getting term 5...1.2.3.4.5.6.7.8.9.10Done. 197 documents.
Getting term 6...1.2.3.4.5.6.7.8.9.10.11.12.13.14.15Done. 483 documents.
Getting term 7...1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16Done. 793 documents.
Getting term 8...1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16Done. 1108 documents.
Getting term 9...1.2.3.4.5.6.7.8.9.10.11.12.13.14.15Done. 1390 documents.
Getting term 10...1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16Done. 1705 documents.
Getting term 11...1Done. 1720 documents.
Getting term 12...1.2.3.4.5.6.7.8.9.10Done. 1905 documents.
Getting term 13...1Done. 1920 documents.
Getting term 14...1.2.3.4.5.6.7.8.9.10.11.12Done. 2141 documents.
