In [2]:
import os
import io
import csv
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd

In [3]:
# Silence the InsecureRequestWarning that urllib3 emits for requests made
# with certificate verification turned off.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Catalogue search filters.
publisher = "Generalitat+de+Catalunya"
type_file = "CSV"
# For multi-word searches, join the words with "+" (e.g. covid+19).
words_search = "covid"

domain = "https://datos.gob.es"
# Search-results URL assembled from the filters above.
url_list = "".join([
    "https://datos.gob.es/es/catalogo?theme_id=salud&publisher_display_name=",
    publisher,
    "&sort=metadata_created+desc&res_format_label=",
    type_file,
    "&q=",
    words_search,
    "&_publisher_display_name_limit=0",
])

# Output file; dirname() of a bare file name is "", so the path is relative
# to the current working directory.
filename = "gob_dataset.csv"
currentDir = os.path.dirname(filename)
filePath = os.path.join(currentDir, filename)

# Column names of the generated CSV, in output order.
header_list = [
    'title',
    'publish',
    'entity',
    'license_type',
    'url_csv',
    'tags',
    'creation_date',
    'update_date',
    'insert_date_time',
]
# header_list = ['title', 'publish', 'desc_short', 'entity', 'license_type', 'desc_long', 'url_csv', 'tags', 'creation_date', 'update_date', 'insert_date_time']

In [4]:
def _clear_word(word):
    word = word.replace(" ", "")
    word = word.replace("\n", "")
    return word.strip()


def _clear_salto_linea(word):
    word = word.replace("\n", "")
    return word.strip()


def _replace_word(word):
    return word.replace('?accessType=DOWNLOAD', '')


def _connect_url_bs4(url_con, timeout=30):
    """Download ``url_con`` and return its content parsed with BeautifulSoup.

    Args:
        url_con: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The BeautifulSoup tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or a 4xx/5xx response.
    """
    # NOTE(review): verify=False disables TLS certificate validation; it is
    # kept as in the original code - confirm whether it is still required.
    page = requests.get(url_con, verify=False, timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later on a missing element.
    page.raise_for_status()
    return BeautifulSoup(page.content, features="html.parser")


def _write_file_csv(rows_all):
    """Write ``header_list`` followed by ``rows_all`` to ``filePath``.

    The file is pipe-delimited and any existing file is overwritten.

    Args:
        rows_all: Iterable of rows, each an iterable of cell values.
    """
    # UTF-8 is written explicitly because pandas reads the file back as UTF-8
    # by default; relying on the platform encoding breaks accented text on
    # non-UTF-8 locales. newline='' is what the csv module expects so that it
    # controls the line terminator itself.
    with open(filePath, 'w', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile, delimiter='|')
        writer.writerow(header_list)
        writer.writerows(rows_all)


def _read_csv_pandas():
    """Load the pipe-delimited file at ``filePath`` into a pandas DataFrame."""
    dataset = pd.read_csv(filePath, sep='|')
    return dataset


def _read_csv_pandas_url(url, timeout=30):
    """Download the CSV at ``url`` and load it into a pandas DataFrame.

    Args:
        url: Address of a comma-separated file; the body is decoded as UTF-8.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        DataFrame parsed from the downloaded text with pandas' defaults.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or a 4xx/5xx response.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    response = requests.get(url, timeout=timeout)
    # Without this check an HTML error page would be parsed as if it were CSV.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))


def _read_csv_test():
    """Print every row of the pipe-delimited file at ``filePath``.

    Debug helper: each parsed row is printed as a list of strings.
    """
    print("\n TEST read CSV")
    # Read as UTF-8, the same encoding pandas assumes by default in
    # _read_csv_pandas, rather than the platform default. newline='' lets the
    # csv module handle line endings itself, as its documentation recommends.
    with open(filePath, "r", newline='', encoding='utf-8') as f:
        for line in csv.reader(f, delimiter='|'):
            print(line)


def _first_resource_url(detail, wanted_format):
    """Return the link of the first resource whose format is ``wanted_format``.

    Args:
        detail: Parsed detail-page container holding the resources section.
        wanted_format: Format label to match after whitespace removal.

    Returns:
        The resource href with ``?accessType=DOWNLOAD`` removed, or ``""``
        when no resource with that format and a link is found.
    """
    resources = detail.find('section', attrs={'class': 'resources', 'id': 'dataset-resources'})
    for resource_list in resources.find_all('ul', attrs={'class': 'resource-list'}):
        for item in resource_list.find_all('li', attrs={'class': 'resource-item'}):
            kind = item.find('div', attrs={'class': 'resource-item format'})
            # Check the format first so resources in other formats are never
            # required to carry a download button.
            if kind is None or _clear_word(kind.get_text()) != wanted_format:
                continue
            button = item.find('div', attrs={'class': 'btn-group'})
            link = button.find('a', href=True) if button is not None else None
            if link is not None:
                return _replace_word(link['href'])
    return ""


def _load_web_scraping():
    """Scrape the catalogue search results and each dataset's detail page.

    For every dataset listed at ``url_list`` the detail page is fetched and
    one row is built. Progress is printed field by field.

    Returns:
        list[list[str]]: one row per dataset, holding exactly one value per
        ``header_list`` column, in that order: title, publish, entity,
        license_type, url_csv, tags, creation_date, update_date,
        insert_date_time.

    Raises:
        AttributeError, IndexError, TypeError: if a page lacks one of the
            expected elements (no defensive handling beyond the resources).
    """
    soup = _connect_url_bs4(url_list)
    dataset_items = soup.find_all('li', attrs={'class': 'dataset-item dge-list--elm'})

    rows_all = []

    for row in dataset_items:
        # --- Fields available on the search-results page -----------------
        heading = row.find('strong', attrs={'class': 'dge-list__title dataset-heading'})
        title = _clear_salto_linea(heading.get_text())
        print("[title          ] "+title)

        detail_path = _clear_word(heading.find('a', href=True)['href'])
        publisher_tag = row.find('span', attrs={'class': 'publisher-title'})
        publisher_text = _clear_salto_linea(publisher_tag.get_text())
        print("[publish       ]  " + publisher_text)

        # The short and long descriptions are not part of header_list, so
        # they are not extracted.

        # --- Fields that require the dataset detail page ------------------
        soup_detail = _connect_url_bs4(domain + detail_path)
        detail = soup_detail.find('div', attrs={'class': 'module-content'})

        publisher_section = detail.find('section', attrs={'class': 'publisher'})
        # The entity is read from the second 'dataset-metadata' block.
        entity_block = publisher_section.find_all('div', attrs={'class': 'dataset-metadata'})[1]
        entity = _clear_salto_linea(entity_block.find('span').get_text())
        print("[entity ]  " + entity)

        license_section = detail.find('section', attrs={'class': 'license'})
        license_block = license_section.find_all('div', attrs={'class': 'dataset-metadata'})[0]
        license_type_text = _clear_word(license_block.find('span').get_text())
        print("[license_type    ]  " + license_type_text)

        # Exactly one url_csv value per row: appending one value per matching
        # resource would shift the remaining columns whenever a dataset has
        # zero or several resources of the wanted format.
        url_csv = _first_resource_url(detail, type_file)
        print("[url_csv    ] "+url_csv)

        tags_section = detail.find('section', attrs={'class': 'tags'})
        # Only newlines and surrounding whitespace are removed so that
        # multi-word tags keep their inner spaces.
        tags_name = [
            _clear_salto_linea(tag_item.find('a').get_text())
            for tag_list in tags_section.find_all('ul', attrs={'class': 'tag-list'})
            for tag_item in tag_list.find_all('li')
        ]
        print("[tags        ] "+str(tags_name))

        info_section = detail.find('section', attrs={'class': 'additional-info'})
        info_content = info_section.find('div', attrs={'class': 'additional-info__content'})
        info_rows = info_content.find('table').find('tbody').find_all('tr')
        # First table row is taken as the creation date, second as the update date.
        creation_date = _clear_salto_linea(info_rows[0].find('td').get_text())
        print("[creation_date       ]" + creation_date)

        update_date = _clear_salto_linea(info_rows[1].find('td').get_text())
        print("[update_date         ]" + update_date)

        # Local scrape timestamp, month first (MM/DD/YYYY-HH:MM:SS).
        insert_date_time = datetime.now().strftime("%m/%d/%Y-%H:%M:%S")
        print("[insert_date_time    ]" + insert_date_time)

        rows_all.append([
            title,
            publisher_text,
            entity,
            license_type_text,
            url_csv,
            str(tags_name),
            creation_date,
            update_date,
            insert_date_time,
        ])
    return rows_all

# Pruebas sobre el dataset generado.

In [5]:
from IPython.display import display, HTML

# Show up to 200 characters per cell so long titles and URLs stay readable
# in the rendered tables.
pd.set_option("display.max_colwidth", 200)

In [6]:
# Scrape only when the cached CSV is missing: a fresh "Restart & Run All"
# then works, while later runs reuse the file instead of hitting the site.
# Delete the file to force a new scrape.
if not os.path.exists(filePath):
    _write_file_csv(_load_web_scraping())

df = _read_csv_pandas()
display(df)
# display(HTML(df.to_html()))

Unnamed: 0,title,publish,entity,license_type,url_csv,tags,creation_date,update_date,insert_date_time
0,Datos semanales de COVID-19 por áreas de gestión asistenciales (AGA),Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/k7cw-sg3w/rows.csv,"['coronavirus', 'covid19', 'covid-19', 'epidemiologia', 'salut']",2/08/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:09
1,Datos diarios de COVID-19 por áreas de gestión asistenciales (AGA),Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/dmzh-fz47/rows.csv,"['coronavirus', 'covid19', 'covid-19', 'epidemiologia', 'salut']",2/08/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:09
2,Datos semanales de COVID-19 por comarca,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/jvut-jxu8/rows.csv,"['coronavirus', 'covid19', 'covid-19', 'epidemiologia', 'salut']",2/08/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:09
3,Registro de defunciones por COVID-19 en Catalunya. Segregación por sexo y com...,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/uqk7-bf9s/rows.csv,"['coronavirus', 'covid19', 'covid-19', 'mortalitat', 'morts']",28/04/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:09
4,Restricciones para prevenir el contagio del COVID-19 por país,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/cd4m-r23x/rows.csv,"['coronavirus', 'covid-19', 'epidemia', 'restriccions']",15/03/2020 23:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:09
5,Registro de test de COVID-19 realizados en Catalunya. Segregación por sexo i ABS,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/xuwf-dxjd/rows.csv,"['coronavirus', 'covid19', 'covid-19', 'sars-cov-2']",22/04/2020 22:00 (UTC),27/10/2020 23:00 (UTC),11/07/2020-15:11:10
6,Incidencia del COVID-19 en Cataluña,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/623z-r97q/rows.csv,"['altes hospitalàries', 'coronavirus', 'covid19', 'covid-19', 'defuncions', 'sars-cov-2']",4/06/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:10
7,Mortalidad por todo tipo de causa en Cataluña,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/7dt9-azyt/rows.csv,"['defuncions', 'estadística', 'model', 'momo', 'mortalitat', 'relacionat covid']",7/05/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:10
8,Registro de test de COVID-19 realizados en Catalunya. Segregación por sexo y ...,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/qwj8-xpvk/rows.csv,"['coronavirus', 'covid19', 'covid-19', 'sars-cov-2']",22/04/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:10
9,Datos diarios de COVID-19 por comarca,Generalitat de Catalunya,Administración Autonómica,http://opendatacommons.org/licenses/by/1.0/,https://analisi.transparenciacatalunya.cat/api/views/c7sd-zy9j/rows.csv,"['coronavirus', 'covid19', 'covid-19', 'epidemiologia', 'salut']",2/08/2020 22:00 (UTC),28/10/2020 23:00 (UTC),11/07/2020-15:11:10


## Prueba de lectura sobre el campo url_csv

In [7]:
# Download and show the CSV referenced by the catalogue row labelled 0.
first_csv_url = df.loc[0, "url_csv"]
df_url = _read_csv_pandas_url(first_csv_url)
display(df_url)

Unnamed: 0,NOM,CODI,DATA_INI,DATA_FI,RESIDENCIA,IEPG_CONFIRMAT,R0_CONFIRMAT_M,TAXA_CASOS_CONFIRMAT,CASOS_CONFIRMAT,TAXA_PCR,PCR,PERC_PCR_POSITIVES,INGRESSOS_TOTAL,INGRESSOS_CRITIC,EXITUS
0,BAIX LLOBREGAT CENTRE I FONTSANTA -L'H N,24,16/04/2020,22/04/2020,No,84.8521,0.792896,45.0712,195,447.2457,1935,10.7101,0,0,41
1,BAIX PENEDÃS,8,15/04/2020,21/04/2020,No,38.5573,0.848810,17.0344,15,265.7369,234,7.0352,0,0,4
2,LLEIDA,5,18/05/2020,24/05/2020,No,112.5250,1.847040,43.7531,158,993.8607,3589,4.3814,5,1,3
3,L'HOSPITALET SUD I EL PRAT DE LLOBREGAT,25,26/03/2020,01/04/2020,No,402.0380,0.906753,196.5155,402,498.1326,1019,48.0573,0,0,79
4,BAIX CAMP I PRIORAT,7,22/10/2020,28/10/2020,No,1194.8700,1.151370,544.1022,1090,3618.0302,7248,15.2331,72,12,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21323,CERDANYA,2,25/06/2020,01/07/2020,No,0.0000,0.000000,0.0000,0,361.9472,60,0.0000,0,0,0
21324,ARAN,4,20/08/2020,26/08/2020,Si,,,0.0000,0,0.0000,0,0.0000,0,0,0
21325,ALTA RIBAGORÃA,37,05/10/2020,11/10/2020,Si,,,0.0000,0,2439.0243,1,0.0000,0,0,0
21326,CERDANYA,2,25/03/2020,31/03/2020,Si,,,1212.1212,2,1818.1818,3,66.6667,0,0,0
