In [None]:
%pip install tqdm # for progress bar
%pip install requests-html lxml_html_clean

## Utils 

In [78]:
from requests_html import HTMLSession, Element

session = HTMLSession()

def parse_element(element: Element) -> dict[str, str]:
    '''
    Parse an Element and returns a dict
    '''
    data = {}
    # if there are a link must be an image link
    data['image'] = element.links.pop() if len(element.links) > 0 else None

    # now we get a list of key value pais as a string: (eg: "Fecha: 2011")
    # we have to again split them by ':' and create an dict
    k_v_list = element.text.split("\n")
    for kv_str in k_v_list:
        k, v = kv_str.split(':', 1)
        k, v = k.strip(), v.strip()
        data[k] = v
    return data


def get_total_pages(url: str) -> int:
    '''
    Extract the amount of pages from the first page (eg: '{url}/0')
    '''
    response = session.get(f'{url}/0')
    assert response.status_code == 200, '!error: webpage could not be loaded'

    pages_str = response.html.search('Registros encontrados : {} en {} páginas')[1]
    assert type(pages_str) is str, '!error: page number not extracted'
    return int(pages_str)


def parse_page(url: str, page: int) -> dict[str, str]:
    '''
    this returns a list of items present in the page
    '''
    # gets the page
    response = session.get(f'{url}/{page}')
    assert response.status_code == 200, '!error: webpage could not be loaded' 

    # gets list of items
    sel = 'ul.list-unstyled > li > div > div:first-child'
    items = response.html.find(sel)
    assert len(items) > 0, '!warn: no items present in the page'

    # parse list of items
    parsed_items = map(parse_element, items)
    return parsed_items

In [82]:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import os
from tqdm import tqdm

MAX_THREADS = os.cpu_count()

def get_all(url: str, pages: int):
    results = [] 
    
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        parse_page_partial = partial(parse_page, url)
        for result in tqdm(executor.map(parse_page_partial, range(pages)), total=pages):
            results.append(result)

    flatted_results = [dic for item in results for dic in item]
    return flatted_results

In [10]:
import json

ENCODING = 'utf-8'

def save_json(filename: str, list: list[dict]):
    with open(f'data/{filename}.json', 'w', encoding=ENCODING) as f:
        json.dump(list, f, ensure_ascii=False, indent=4)

def load_json(filename: str):
    with open(f'data/{filename}.json', 'r', encoding=ENCODING) as f:
        return json.load(f)

In [36]:
%%time 

posters_url = 'https://bnjm.cu/catalogos/simple_catalog/todos/aproximada/0/todos/CARTELES'
posters_pages = get_total_pages(posters_url)
posters = get_all(url=posters_url, pages=posters_pages)

save_json(filename="posters", list=posters)

  0%|          | 1/649 [00:00<05:09,  2.10it/s]

In [13]:
%%time 

pictures_url = 'https://bnjm.cu/catalogos/simple_catalog/todos/aproximada/0/todos/FOTOS'
pictures_pages = get_total_pages(f'{pictures_url}/0')
pictures = get_all(url=pictures_url, pages=pictures_pages)

save_json(filename="pictures", list=pictures)

100%|██████████| 2251/2251 [04:57<00:00,  7.57it/s]


CPU times: total: 1min 47s
Wall time: 4min 58s


In [None]:
%%time

books_url   ='https://bnjm.cu/catalogos/simple_catalog/todos/aproximada/0/todos/LIBROS'
books_pages = get_total_pages(f'{books_url}/0')
books = get_all(url=books_url, pages=books_pages)

save_json(filename="books", list=books)

In [11]:
%%time

maps_url  = 'https://bnjm.cu/catalogos/simple_catalog/todos/aproximada/0/todos/MAPAS'
maps_pages = get_total_pages(f'{maps_url}/0')
maps = get_all(url=maps_url, pages=maps_pages)

save_json(filename="maps", list=maps)

ConnectTimeout: HTTPSConnectionPool(host='bnjm.cu', port=443): Max retries exceeded with url: /catalogos/simple_catalog/todos/aproximada/0/todos/MAPAS/0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000024117C57A70>, 'Connection to bnjm.cu timed out. (connect timeout=None)'))

In [12]:
%%time

art_url = 'https://bnjm.cu/catalogos/simple_catalog/todos/aproximada/0/todos/ART'
art_pages = get_total_pages(f'{art_url}/0')
art = get_all(url=art_url, pages=art_pages)

save_json(filename="art", list=art)

  0%|          | 1/864 [00:21<5:02:59, 21.07s/it]


KeyboardInterrupt: 

In [None]:
%%time

music_url = 'https://bnjm.cu/catalogos/simple_catalog/todos/aproximada/0/todos/MUS'
music_pages = get_total_pages(f'{music_url}/0')
musics = get_all(url=music_url, pages=music_pages)

save_json(filename="musics", list=musics)

In [None]:
%%time

serial_url = 'https://bnjm.cu/catalogos/simple_catalog/todos/aproximada/0/todos/SERIADAS'
serial_pages = get_total_pages(f'{serial_url}/0')
serials = get_all(url=serial_url, pages=serial_pages)

save_json(filename="musics", list=musics)