In [20]:
import re
import json
import copy
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from dataclasses import dataclass

In [2]:
class bcolors:
    """ANSI escape sequences used to colour the progress lines printed while scraping."""
    # Usage pattern in this notebook: f"{bcolors.OKBLUE}text{bcolors.ENDC}".
    # Each constant switches a terminal style on; ENDC resets the styling.
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [3]:
# Site root; the relative hrefs scraped from the listing tables are appended to it.
BASE_URL = "https://www.floracatalana.cat"

In [4]:
def extract_rows(url, verbose=False):
    """Collect the data rows of a paginated listing table.

    ``url`` is requested repeatedly with ``&page=0``, ``&page=1``, ... appended.
    On every page the rows (``<tr>``) of the first ``<tbody>`` that follows the
    first ``<table>`` are gathered. Pagination stops at the first page that has
    no table, no ``<tbody>`` or no rows at all.

    Args:
        url: Listing URL that already ends in a query string (the page
            parameter is appended with ``&``).
        verbose: When true, print the URL, page number and row count of every
            page plus the final total.

    Returns:
        A list with the ``<tr>`` elements of all pages, in page order.
    """
    rows = []
    page = 0
    while True:
        page_url = f"{url}&page={page}"
        if verbose:
            print("URL: " + page_url)
            print("Page: " + str(page))
        html = requests.get(page_url).content
        tables = BeautifulSoup(html, "html.parser").find_all("table")
        if not tables:
            break
        body = tables[0].find_next("tbody")
        # Search only inside the tbody: find_all_next() would also return every
        # <tr> that appears later in the document (other tables, footers, ...).
        page_rows = body.find_all("tr") if body is not None else []
        if not page_rows:
            # An empty page marks the end; continuing could loop forever on a
            # site that keeps answering with an empty table.
            break
        rows.extend(page_rows)
        if verbose:
            print(f"Rows: {len(page_rows)}\n-----------")
        page += 1
    if verbose:
        print("***********\nTotal rows: " + str(len(rows)))
    return rows

def extract_images(url):
    """Download an image gallery page and return its image cells.

    Looks for the first ``div.view-content`` on the page and returns the
    ``div`` elements inside it whose class matches ``columna views-col col-<digit>``.
    An empty list is returned when the page has no ``div.view-content``.
    """
    page = BeautifulSoup(requests.get(url).content, "html.parser")
    galleries = page.find_all("div", class_="view-content")

    if len(galleries) == 0:
        return list()
    return galleries[0].find_all("div", class_=re.compile(r"columna views-col col-\d"))

In [26]:
# Every row of the paginated family listing (one <tr> per family).
# The URL ends in "?" because extract_rows appends "&page=N" to it.
family_rows = extract_rows(f"{BASE_URL}/flora/vasculars/familiesnocodi?")

In [5]:
def extract_family_info(family_row):
    """Turn one row of the family listing into a family record.

    Reads the first three cells of the row: cell 0 carries the Catalan name and
    the link to the family page, cell 1 the Latin name and cell 2 the link to
    the family's genera listing (from which the ``Fam<digits>`` code is taken).

    Returns:
        A dict with the keys ``name_cat``, ``name_latin``, ``url``, ``node_id``,
        ``code``, ``rank`` (always ``"family"``), ``children`` (empty list) and
        ``genera_url``.
    """
    cells = family_row.find_all("td")
    catalan_name = cells[0].text.strip()
    family_link = BASE_URL + cells[0].a["href"].strip()
    # First run of digits in the absolute URL.
    family_node = re.search(r"\d+", family_link)[0]
    latin_name = cells[1].text.strip()
    genera_link = BASE_URL + cells[2].a["href"].strip()
    family_code = re.search(r"Fam\d+", genera_link)[0]

    return dict(
        name_cat=catalan_name,
        name_latin=latin_name,
        url=family_link,
        node_id=family_node,
        code=family_code,
        rank="family",
        children=[],
        genera_url=genera_link,
    )

def extract_genera_info(genus_row):
    """Turn one row of a genera listing into a genus record.

    Cell 0 holds the genus code, cell 1 the genus name and the link to the
    genus page, cell 4 the link to the genus' species listing.

    Note: ``name_cat`` and ``name_latin`` are both filled from cell 1.
    NOTE(review): this may be a copy-paste leftover or the listing may simply
    have no Catalan name column; verify against the site.

    Returns:
        A dict with the keys ``name_cat``, ``name_latin``, ``url``, ``node_id``,
        ``code``, ``rank`` (always ``"genus"``), ``children`` (empty list) and
        ``species_url``.
    """
    cells = genus_row.find_all("td")
    genus_name = cells[1].text.strip()
    genus_code = cells[0].text.strip()
    genus_link = BASE_URL + cells[1].a["href"].strip()
    # First run of digits in the absolute URL.
    genus_node = re.search(r"\d+", genus_link)[0]
    species_link = BASE_URL + cells[4].a["href"].strip()

    return dict(
        name_cat=genus_name,
        name_latin=genus_name,
        url=genus_link,
        node_id=genus_node,
        code=genus_code,
        rank="genus",
        children=[],
        species_url=species_link,
    )

def extract_species_summary(species_row):
    """Turn one row of a species listing into a species record.

    Cell 0 holds the taxon code, cell 1 the link to the image gallery and
    cell 2 the Latin name together with the link to the taxon page.

    Returns:
        A dict with the keys ``name_cat`` (always ``None``), ``name_latin``,
        ``url``, ``node_id``, ``code``, ``rank`` (always ``"species"``),
        ``images`` (empty list) and ``images_url``.
    """
    cells = species_row.find_all("td")
    latin_name = cells[2].text.strip()
    taxon_code = cells[0].text.strip()
    taxon_link = BASE_URL + cells[2].a["href"].strip()
    # First run of digits in the absolute URL.
    taxon_node = re.search(r"\d+", taxon_link)[0]
    gallery_link = BASE_URL + cells[1].a["href"].strip()

    return dict(
        name_cat=None,
        name_latin=latin_name,
        url=taxon_link,
        node_id=taxon_node,
        code=taxon_code,
        rank="species",
        images=[],
        images_url=gallery_link,
    )

def extract_species_images(image_div):
    """Extract the image URL and its tags from one gallery cell.

    Args:
        image_div: One of the ``div`` elements returned by ``extract_images``.

    Returns:
        ``{"url": <href of the image link>, "tags": <list of tag names>}``.
        ``tags`` is ``None`` when the cell is labelled ``"NO ETIQUETAT"``.
    """
    url = image_div.div.div.a["href"]
    # Must be Tag.find(): any other attribute name is treated by BeautifulSoup
    # as a child-tag lookup, yields None and fails when called.
    tags_field = image_div.find("div", class_="views-field views-field-field-etiquetes")
    tags_text = tags_field.div.text
    tags = tags_text.split(", ") if tags_text != "NO ETIQUETAT" else None

    return {
        "url": url,
        "tags": tags
    }
    

In [32]:
# Crawl the whole catalogue: families -> genera -> species -> images.
# The result is a nested list of dicts (family["children"] holds genera,
# genus["children"] holds species, species["images"] holds image dicts).
taxa = []
for i, family_row in enumerate(family_rows):
    family = extract_family_info(family_row)
    print(f"{bcolors.OKBLUE}FAMILY {i+1}/{len(family_rows)} ({((i+1)/len(family_rows)*100):.1f} %){bcolors.ENDC}: {family['name_cat']}")

    genera_rows = extract_rows(family["genera_url"])
    # The listing URL is only needed for crawling; drop it from the stored record.
    del family["genera_url"]
    for j, genera_row in enumerate(genera_rows):
        genus = extract_genera_info(genera_row)
        print(f"|——{bcolors.OKCYAN}GENUS {j+1}/{len(genera_rows)} ({((j+1)/len(genera_rows)*100):.1f} %){bcolors.ENDC}: {genus['name_latin']}")
        
        species_rows = extract_rows(genus["species_url"])
        # Same as above: crawl-only key, not part of the output.
        del genus["species_url"]
        for k, species_row in enumerate(species_rows):
            species = extract_species_summary(species_row)
            print(f"|\t|——{bcolors.OKGREEN}SPECIES {k+1}/{len(species_rows)} ({((k+1)/len(species_rows)*100):.1f} %){bcolors.ENDC}: {species['name_latin']}")
            
            images_container = extract_images(species["images_url"])
            # Same as above: crawl-only key, not part of the output.
            del species["images_url"]
            for image_div in images_container:
                image = extract_species_images(image_div)
                species["images"].append(image)
            genus["children"].append(species)
        family["children"].append(genus)
    
    taxa.append(family)
    
# The file is written only once, after the full crawl has finished.
with open('result.json', 'w') as fp:
    json.dump(taxa, fp)

[94mFAMILY 1/163 (0.6 %)[0m: Acantàcies
|——[96mGENUS 1/1 (100.0 %)[0m: Acanthus
|	|——[92mSPECIES 1/1 (100.0 %)[0m: Acanthus mollis
[94mFAMILY 2/163 (1.2 %)[0m: Aceràcies
|——[96mGENUS 1/1 (100.0 %)[0m: Acer
|	|——[92mSPECIES 1/9 (11.1 %)[0m: Acer campestre
|	|——[92mSPECIES 2/9 (22.2 %)[0m: Acer monspessulanum
|	|——[92mSPECIES 3/9 (33.3 %)[0m: Acer negundo
|	|——[92mSPECIES 4/9 (44.4 %)[0m: Acer opalus
|	|——[92mSPECIES 5/9 (55.6 %)[0m: Acer opalus subsp. granatense
|	|——[92mSPECIES 6/9 (66.7 %)[0m: Acer opalus subsp. opalus
|	|——[92mSPECIES 7/9 (77.8 %)[0m: Acer platanoides
|	|——[92mSPECIES 8/9 (88.9 %)[0m: Acer platanoides subsp. platanoides
|	|——[92mSPECIES 9/9 (100.0 %)[0m: Acer pseudoplatanus
[94mFAMILY 3/163 (1.8 %)[0m: Agavàcies
|——[96mGENUS 1/2 (50.0 %)[0m: Agave
|	|——[92mSPECIES 1/1 (100.0 %)[0m: Agave americana
|——[96mGENUS 2/2 (100.0 %)[0m: Yucca
|	|——[92mSPECIES 1/2 (50.0 %)[0m: Yucca aloifolia
|	|——[92mSPECIES 2/2 (100.0 %)[0m: Yucca glor

In [21]:
# Reload the crawl result from disk so the cells below do not need a new crawl.
with open("result.json", 'r') as f:
    catalog = json.load(f)

In [7]:
# Sanity check: URL of the first species of the first genus of the first family.
catalog[0]["children"][0]["children"][0]["url"]

'https://www.floracatalana.cat/flora/vasculars/taxons/VTax2770'

In [141]:
@dataclass
class Species:
    """Details scraped from one taxon page, as assembled by ``extract_species_info``.

    ``extract_species_info`` fills a field with ``None`` (or an empty list/dict,
    see the individual comments) when the page section it comes from is absent,
    so the annotations describe the populated case only.
    """
    # Texts of the <summary> elements returned by get_sections.
    _sections: list
    # Result of is_subsp(): True when the page is labelled "SUBESPÈCIE".
    is_subsp: bool
    # NOTE: get_category_images returns a dict (category text -> image href or None),
    # not a list as the annotation suggests.
    category_images: list
    # href returned by get_distribution_url, or None.
    distribution_map_url: str
    # The three values below come from get_description_data, or None.
    life_form: str
    min_size: str
    max_size: str
    # Dict returned by get_nomenclature_data, or None.
    nomenclature: dict
    # Dict returned by get_taxonomy_data, or None.
    taxonomy: dict
    # List of {"latin_name", "code"} dicts from get_subspecies; empty list when absent.
    subspecies: list
    # Dict month name -> bool from get_flowering_data, or None.
    flowering: dict
    # The three values below come from get_ecology_data, or None.
    frequency: str
    habitat: str
    phytosociology: str
    # Dicts from get_ecology_data; empty dict when the "Ecologia" section is absent.
    territory: dict
    altitude: dict

In [142]:
def safe_select(select):
    """Return the ``.text`` of the first element of ``select``, or ``None`` if it is empty."""
    if len(select) > 0:
        return select[0].text
    return None


def safe_subscript(dict, key, return_object=None):
    """Return ``dict[key]``, or ``return_object`` when ``dict`` itself is ``None``.

    Only a missing mapping is tolerated; a missing key still raises ``KeyError``.
    """
    if dict is None:
        return return_object
    return dict[key]


def extract_species_info(url):
    """Download a taxon page and parse it into a ``Species`` record.

    Each part of the record is parsed only when the matching section title
    ("Distribució", "Descripció", "Nomenclatura", "Taxonomia", "Floració",
    "Ecologia") is among the titles returned by ``get_sections``. Missing
    sections yield ``None``, except ``subspecies`` (empty list) and
    ``territory``/``altitude`` (empty dicts).
    """
    html = requests.get(url).content
    soup = BeautifulSoup(html, "html.parser")
    sections = get_sections(soup)

    def when_listed(title, parser, missing=None):
        # Run the parser only for sections the page actually lists.
        if title in sections:
            return parser(soup)
        return missing

    category_images = get_category_images(soup)
    is_subspecies = is_subsp(soup)
    distribution_map_url = when_listed("Distribució", get_distribution_url)
    description = when_listed("Descripció", get_description_data)
    nomenclature = when_listed("Nomenclatura", get_nomenclature_data)
    taxonomy = when_listed("Taxonomia", get_taxonomy_data)
    subspecies = when_listed("Taxonomia", get_subspecies, missing=list())
    flowering = when_listed("Floració", get_flowering_data)
    ecology = when_listed("Ecologia", get_ecology_data)

    details = dict(
        _sections=sections,
        category_images=category_images,
        is_subsp=is_subspecies,
        distribution_map_url=distribution_map_url,
        life_form=safe_subscript(description, "life_form"),
        max_size=safe_subscript(description, "max_size"),
        min_size=safe_subscript(description, "min_size"),
        nomenclature=nomenclature,
        taxonomy=taxonomy,
        subspecies=subspecies,
        flowering=flowering,
        frequency=safe_subscript(ecology, "frequency"),
        habitat=safe_subscript(ecology, "habitat"),
        phytosociology=safe_subscript(ecology, "phytosociology"),
        territory=safe_subscript(ecology, "territory", return_object=dict()),
        altitude=safe_subscript(ecology, "altitude", return_object=dict()),
    )
    return Species(**details)


def is_subsp(soup):
    """Tell whether the taxon page is labelled as a subspecies.

    Reads the text of the first element matched by a fixed CSS path and returns
    ``True`` only when it equals ``"SUBESPÈCIE"``. Raises ``IndexError`` when
    the path matches nothing.
    """
    rank_selector = "body > div.dialog-off-canvas-main-canvas > div > div.clearfix.main-content.region--dark-typography.region--white-background.region--no-separator > div > div > div > section > div > div > article > div > div > div > div:nth-child(1) > div:nth-child(3) > div"
    rank_label = soup.select(rank_selector)[0].text
    return rank_label == "SUBESPÈCIE"


def get_sections(soup):
    """Return the section titles (``<summary>`` texts) of a taxon page.

    The container of the sections is looked up as the 11th child of the article
    body and, if that matches nothing, as the 12th child. Raises ``IndexError``
    when neither position matches.
    """
    article_body = "body > div.dialog-off-canvas-main-canvas > div > div.clearfix.main-content.region--dark-typography.region--white-background.region--no-separator > div > div > div > section > div > div > article > div > div > div"
    for child_position in (11, 12):
        containers = soup.select(f"{article_body} > div:nth-child({child_position}) > div > div > div")
        if len(containers) != 0:
            break

    return [str(summary.text) for summary in containers[0].find_all("summary")]


def get_category_images(soup):
    """Map each category label of a taxon page to its image link.

    Every ``div.field--type-image`` except the last one on the page is
    considered. The category label is the text of the ``field--type-link``
    element found under the image's grandparent. Links ending in ``png`` are
    stored as ``None``.
    """
    image_fields = soup.find_all("div", class_="field--type-image")
    images_by_category = {}
    for image_field in image_fields[:-1]:
        href = image_field.a["href"]
        category = image_field.parent.parent.find(class_="field--type-link").text
        images_by_category[category] = None if href.endswith("png") else href
    return images_by_category


def get_distribution_url(soup):
    """Return the ``href`` of the first link in the distribution block, or ``None`` if there is none."""
    links = soup.select('#edit-group-distribucio > div > fieldset > div > div > div.field__items > div > a')
    if len(links) > 0:
        return links[0]["href"]
    return None


def get_description_data(soup):
    """Read life form and minimum/maximum size from the description block.

    Returns:
        ``{"life_form": ..., "min_size": ..., "max_size": ...}``; each value is
        the text of the matching element or ``None`` when it is not found.
    """
    first_column = '#edit-group-descripcio > div > div > div:nth-child(1)'
    size_item = first_column + ' > fieldset > div > div.field.field--name-field-mida-{}.field--type-string.field--label-inline.clearfix > div.field__item'
    return {
        "life_form": safe_select(soup.select(first_column + ' > div > div.field__item')),
        "min_size": safe_select(soup.select(size_item.format("minima"))),
        "max_size": safe_select(soup.select(size_item.format("maxima"))),
    }


def _vernacular_names(soup, language_slug):
    """Return the names stored in the ``field--name-field-nom-<language_slug>`` block.

    The block holds a single ", "-separated string; an absent block yields ``[]``.
    """
    block = soup.find("div", class_=f"field field--name-field-nom-{language_slug} field--type-string field--label-above")
    if block is None:
        return []
    return block.find(class_="field__item").text.split(", ")


def get_nomenclature_data(soup):
    """Read the scientific name, synonyms and vernacular names of a taxon page.

    Returns:
        A dict with the keys ``scientific_name`` (text or ``None``),
        ``synonyms`` (list of stripped texts, possibly empty),
        ``catalan_names``, ``spanish_names``, ``occitan_names``,
        ``english_names``, ``french_names`` (lists, possibly empty) and
        ``termcat_url`` (href string, or ``None`` when the page has no such
        link; it is a single URL, so the "missing" value is ``None`` rather
        than an empty list).
    """
    scientific_name = safe_select(soup.select("#edit-group-nomenclatura > div > div > div:nth-child(1) > div > div.field__item"))

    synonym_lists = soup.select("#edit-group-nomenclatura > div > div > div:nth-child(2) > div > div > div.field__items")
    if len(synonym_lists) > 0:
        synonyms = [synonym.text.strip() for synonym in synonym_lists[0].find_all(class_="field__item")]
    else:
        synonyms = []

    catalan_names = _vernacular_names(soup, "catala")

    termcat_block = soup.find("div", class_="field field--name-field-termcat field--type-link field--label-hidden field__items")
    termcat_url = termcat_block.div.a["href"] if termcat_block is not None else None

    spanish_names = _vernacular_names(soup, "castella")
    occitan_names = _vernacular_names(soup, "occita")
    french_names = _vernacular_names(soup, "frances")
    english_names = _vernacular_names(soup, "angles")

    return {
        "scientific_name": scientific_name,
        "synonyms": synonyms,
        "catalan_names": catalan_names,
        "termcat_url": termcat_url,
        "spanish_names": spanish_names,
        "occitan_names": occitan_names,
        "english_names": english_names,
        "french_names": french_names,
    }


def get_taxonomy_data(soup):
    """Read the TPL name and the APG order/family/genus from the relations block.

    The APG entries are the 1st, 2nd and 3rd children of the APG item list;
    the text after "Ord. ", "Fam. " and "Gen. " respectively is kept.

    Returns:
        ``{"tpl_name", "apg_order", "apg_family", "apg_genus"}``; a value is
        ``None`` when its element is not found.
    """
    tpl_name = safe_select(soup.select('#edit-group-relacions > div > div.field.field--name-field-nom-cientific-tpl.field--type-string.field--label-inline.clearfix > div.field__item'))

    apg_items = '#edit-group-relacions > div > div.field.field--name-field-taxonomia-apg.field--type-entity-reference.field--label-inline.field--entity-reference-target-type-taxonomy-term.clearfix > div.field__items'

    def apg_level(position, prefix):
        # Text after the rank prefix of the position-th APG entry, if present.
        matches = soup.select(f"{apg_items} > div:nth-child({position})")
        if len(matches) > 0:
            return matches[0].text.split(prefix)[-1]
        return None

    taxonomy = {"tpl_name": tpl_name}
    levels = (("apg_order", "Ord. "), ("apg_family", "Fam. "), ("apg_genus", "Gen. "))
    for position, (key, prefix) in enumerate(levels, start=1):
        taxonomy[key] = apg_level(position, prefix)
    return taxonomy


def get_subspecies(soup):
    """List the subspecies linked from a taxon page.

    Pages that are themselves subspecies (per ``is_subsp``) yield an empty
    list. Otherwise every link inside the 5th child of the relations block is
    returned as ``{"latin_name": <link text>, "code": <last path segment of href>}``.
    NOTE(review): the page is not checked for the "ESPÈCIE AMB SUBSP." label, so
    this assumes the 5th child only ever contains subspecies links; confirm.
    """
    if is_subsp(soup):
        return []

    containers = soup.select('#edit-group-relacions > div > div:nth-child(5) > div > div.field__items')
    links = containers[0].find_all("a") if len(containers) > 0 else list()
    return [
        {"latin_name": link.text, "code": link["href"].split("/")[-1]}
        for link in links
    ]


def get_flowering_data(soup):
    """Build a month -> flowering flag mapping from the flowering block.

    The block is read as 4 groups (children 1-4) holding 3 months each
    (children 2-4 of the group). The month label comes from the cell's ``h7``
    element; the abbreviations "Desem.", "Setem." and "Novem." are expanded,
    any other label is upper-cased. A month flowers when its item text is "SI".
    Raises ``IndexError`` when a cell is missing.
    """
    expanded = {"Desem.": "DESEMBRE", "Setem.": "SETEMBRE", "Novem.": "NOVEMBRE"}
    flowering = {}
    for season_number in range(1, 5):
        for month_number in range(2, 5):
            cell = soup.select(f"#edit-group-floracio > div > div > div:nth-child({season_number}) > div:nth-child({month_number})")[0]
            label = cell.h7.text
            month = expanded[label] if label in expanded else label.upper()
            flowering[month] = cell.find(class_="field__item").text == "SI"
    return flowering


def _labelled_fields(fieldset):
    """Yield ``(label_tag, item_tag)`` for every div of the fieldset wrapper that has both."""
    wrapper = fieldset.find("div", "fieldset-wrapper")
    if wrapper is None:
        return
    for field in wrapper.find_all("div"):
        label_tag = field.find(class_="field__label")
        item_tag = field.find(class_="field__item")
        if label_tag is not None and item_tag is not None:
            yield label_tag, item_tag


def get_ecology_data(soup):
    """Read frequency, habitat, phytosociology, territory and altitude data.

    Territory and altitude come from the fieldsets of the ecology block, told
    apart by the text of their ``span`` ("TERRITORI" / "ALTITUD"). Unknown
    fieldsets and unknown territory labels are reported with ``print``.

    Returns:
        A dict with the keys ``altitude`` (label -> int), ``territory``
        (label -> text), ``frequency``, ``habitat`` and ``phytosociology``
        (text or ``None``).

    Raises:
        IndexError: if the ``#edit-group-mapa-distribucio`` block is missing.
        ValueError: if an altitude value is not an integer.
    """
    territory_keys = {
        "Territori fisiogràfic a Catalunya": "fisiografic_catalunya",
        "Zones fitogeogràfiques": "zones_fitogeografiques",
        "Àrea de distribució general": "distribució_general",
    }

    frequency = safe_select(soup.select("#edit-group-mapa-distribucio > div > div.field.field--name-field-frequencia.field--type-string.field--label-inline.clearfix > div.field__item"))
    habitat = safe_select(soup.select("#edit-group-mapa-distribucio > div > div.clearfix.text-formatted.field.field--name-field-habitat.field--type-text-long.field--label-inline > div.field__item > p"))

    phytosociology_find = soup.find("div",
                     class_="field field--name-field-adscripcio-fitosociologica field--type-entity-reference field--label-inline field--entity-reference-target-type-taxonomy-term clearfix")
    phytosociology = phytosociology_find.find(class_="field__item").text if phytosociology_find is not None else None

    fieldsets = soup.select("#edit-group-mapa-distribucio")[0].find_all("fieldset", class_="js-form-item form-item js-form-wrapper form-wrapper")
    territory = {}
    altitude = {}
    for fieldset in fieldsets:
        fieldset_text = fieldset.span.text
        if fieldset_text == "TERRITORI":
            for label_tag, item_tag in _labelled_fields(fieldset):
                label = label_tag.text
                if label in territory_keys:
                    label = territory_keys[label]
                else:
                    # Keep the raw label as key, but make the new case visible.
                    print("New label found for TERRITORY: ", label)
                territory[label] = item_tag.text
        elif fieldset_text == "ALTITUD":
            for label_tag, item_tag in _labelled_fields(fieldset):
                # ASCII-fold, lower-case and drop the last word of the label.
                label = "_".join(unidecode(label_tag.text).lower().split()[:-1])
                altitude[label] = int(item_tag.text)
        else:
            print("New fieldset found for ECOLOGY: ", fieldset_text)

    return {"altitude": altitude, "territory": territory, "frequency": frequency, "habitat": habitat, "phytosociology": phytosociology}

In [40]:
# Flat family table: a deep copy of the catalogue (so `catalog` stays intact)
# in which each family's nested genera are replaced by their count.
families = copy.deepcopy(catalog)
for family in families:
    family["n_genera"] = len(family["children"])
    del family["children"]

In [41]:
# Flat genus table: every genus of every family, with the nested species
# replaced by their count and a back-reference to the owning family.
families_genera = copy.deepcopy(catalog)
genera = []
for family in families_genera:
    family_genera = family["children"]
    for genus in family_genera:
        # FIXME: This considers subsp. as species
        # (the count is simply the number of rows scraped from the species listing).
        genus["n_species"] = len(genus["children"])
        del genus["children"]
        genus["family_code"] = family["code"]
    genera.extend(family_genera)

In [42]:
# Flat species table: every species record of the catalogue, annotated with
# the codes of the family and genus it was found under.
families_species = copy.deepcopy(catalog)
species_catalog = []
for family in families_species:
    family_genera = family["children"]
    for genus in family_genera:
        genus_species =  genus["children"]
        for species in genus_species:
            species["family_code"] = family["code"]
            species["genus_code"] = genus["code"]
            species_catalog.append(species)

In [143]:
# Enrich every species record in place with the details of its taxon page
# (one HTTP request per species), then persist the enriched list.
for i, species in enumerate(species_catalog):
    print(f"{bcolors.OKGREEN}{i+1}/{len(species_catalog)} ({((i+1)/len(species_catalog)*100):.2f} %){bcolors.ENDC} - {species['name_latin']}")
    species_details = extract_species_info(species["url"])
    species["is_subsp"] = species_details.is_subsp
    species["nomenclature"] = species_details.nomenclature
    species["category_images"] = species_details.category_images
    species["life_form"] = species_details.life_form
    species["size"] = {"min_size": species_details.min_size, "max_size": species_details.max_size}
    # Must be an assignment: with ":" this line is a bare annotation and the
    # map URL is silently never stored.
    species["distribution_map_url"] = species_details.distribution_map_url
    species["taxonomy"] = species_details.taxonomy
    species["flowering"] = species_details.flowering
    species["frequency"] = species_details.frequency
    species["habitat"] = species_details.habitat
    species["phytosociology"] = species_details.phytosociology
    species["territory"] = species_details.territory
    species["altitude"] = species_details.altitude
    species["subspecies"] = species_details.subspecies
    
    print(species)
    
with open("species.json", "w", encoding="utf-8") as f:
    json.dump(species_catalog, f)

[92m1/4829 (0.02 %)[0m - Acanthus mollis
{'name_cat': None, 'name_latin': 'Acanthus mollis', 'url': 'https://www.floracatalana.cat/flora/vasculars/taxons/VTax2770', 'node_id': '2770', 'code': 'VTax2770', 'rank': 'species', 'images': [{'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2022-08/mgc9/VTax2770.Ex1_.Img1_.jpg', 'tags': ['Infructescència', 'Calze (fruit)', 'Bec', 'Raquis (flor)']}, {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2022-08/mgc9/VTax2770.Ex1_.Img2_.jpg', 'tags': ['Fruit', 'Calze (fruit)', 'Bec', 'Secció (fruit)']}, {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc1/VTax2770.ExN.Img1.jpg', 'tags': None}, {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc1/VTax2770.ExN.Img10.jpg', 'tags': None}, {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc1/VTax2770.ExN.Img11.jpg', 'tags': None}, {'url

KeyboardInterrupt: 

In [116]:
# Spot check: parse the taxon page of the second species in the flat list.
test_species = extract_species_info(species_catalog[1]["url"])

In [128]:
# Inspect one record of the flat species list.
species_catalog[3]

{'name_cat': None,
 'name_latin': 'Acer negundo',
 'url': 'https://www.floracatalana.cat/flora/vasculars/taxons/VTax1451',
 'node_id': '1451',
 'code': 'VTax1451',
 'rank': 'species',
 'images': [{'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc6/VTax1451.ExN.Img1.jpg',
   'tags': None},
  {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc6/VTax1451.ExN.Img10.jpg',
   'tags': None},
  {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc6/VTax1451.ExN.Img11.jpg',
   'tags': None},
  {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc6/VTax1451.ExN.Img12.jpg',
   'tags': None},
  {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2019-06/mgc6/VTax1451.ExN.Img13.jpg',
   'tags': None},
  {'url': 'https://www.floracatalana.cat/flora/sites/default/files/imgvasculars/2021-02/mgc9/VTax1451.ExN.Img14.jpg',
   'tags'

In [43]:
# Persist the three flat tables built above, one JSON file per rank.
with open("families.json", "w") as f:
    json.dump(families, f)

with open("genera.json", "w") as f:
    json.dump(genera, f)

# NOTE: same file name as the one written by the species detail loop; whichever
# cell runs last determines the content of species.json.
with open("species.json", "w") as f:
    json.dump(species_catalog, f)

In [83]:
# Scratch area: fetch one taxon page and keep it in `soup` for the exploratory
# cells below (they mirror the get_* helpers defined earlier).
r = requests.get("https://www.floracatalana.cat/flora/vasculars/taxons/VTax2770").content
# r = requests.get("https://www.floracatalana.cat/flora/vasculars/taxons/VTax1004").content
soup = BeautifulSoup(r, "html.parser")

In [79]:
# Rank label of the page ("ESPÈCIE AMB SUBSP.", "SUBESPÈCIE", ...).
subsp_div = soup.select("body > div.dialog-off-canvas-main-canvas > div > div.clearfix.main-content.region--dark-typography.region--white-background.region--no-separator > div > div > div > section > div > div > article > div > div > div > div:nth-child(1) > div:nth-child(3) > div")[0].text
contains_subsp = subsp_div == "ESPÈCIE AMB SUBSP."
# Deliberately not named `is_subsp`: that name belongs to the helper function
# used by extract_species_info, and rebinding it to a bool would break every
# later call of that function.
is_subspecies = subsp_div == "SUBESPÈCIE"

In [9]:
# Scratch version of get_category_images. Unlike the function, this builds a
# list of single-entry dicts instead of one dict.
category_images_divs = soup.find_all("div", class_="field--type-image")
category_images = []
# The last image div of the page is skipped.
for image_div in category_images_divs[:-1]:
    image = image_div.a["href"]
    category = image_div.parent.parent.find(class_="field--type-link").text
    image_extension = image[-3:]
    # Links ending in "png" are recorded as None.
    if image_extension != "png":
        category_images.append({category: image})
    else:
        category_images.append({category: None})

In [52]:
# Section titles of the page (<summary> texts). Only the nth-child(11) position
# is tried here; get_sections also falls back to nth-child(12).
summaries = soup.select("body > div.dialog-off-canvas-main-canvas > div > div.clearfix.main-content.region--dark-typography.region--white-background.region--no-separator > div > div > div > section > div > div > article > div > div > div > div:nth-child(11) > div > div > div")[0].find_all("summary")
summaries = [str(summary.text) for summary in summaries]

In [60]:
# Link to the distribution map, or None when the section or the link is absent.
distribution_map_url = None
if "Distribució" in summaries:
    distribution_map_url_select = soup.select('#edit-group-distribucio > div > fieldset > div > div > div.field__items > div > a')
    # select() returns a (possibly empty) list, never None, so test its length.
    distribution_map_url = distribution_map_url_select[0]["href"] if len(distribution_map_url_select) > 0 else None

In [12]:
# Scratch version of get_description_data: life form plus minimum/maximum size.
# Each value is None when its element is not found (see safe_select).
if "Descripció" in summaries:
    life_form = safe_select(soup.select('#edit-group-descripcio > div > div > div:nth-child(1) > div > div.field__item'))
    
    min_size = safe_select(soup.select('#edit-group-descripcio > div > div > div:nth-child(1) > fieldset > div > div.field.field--name-field-mida-minima.field--type-string.field--label-inline.clearfix > div.field__item'))
    
    max_size = safe_select(soup.select(
        '#edit-group-descripcio > div > div > div:nth-child(1) > fieldset > div > div.field.field--name-field-mida-maxima.field--type-string.field--label-inline.clearfix > div.field__item'))

In [108]:
# Scratch version of get_nomenclature_data. Here missing vernacular-name blocks
# yield None, whereas the function returns empty lists.
if "Nomenclatura" in summaries:
    scientific_name = safe_select(soup.select("#edit-group-nomenclatura > div > div > div:nth-child(1) > div > div.field__item"))
    
    synonyms_select = soup.select("#edit-group-nomenclatura > div > div > div:nth-child(2) > div > div > div.field__items")
    synonyms = synonyms_select[0].find_all(class_="field__item") if len(synonyms_select) != 0 else list()
    # `synonyms` is always a list at this point, so this check is always true.
    if synonyms is not None:
        synonyms = [synonym.text.strip() for synonym in synonyms]
    
    # Each vernacular-name block holds one ", "-separated string.
    catalan_names_find = soup.find("div", class_='field field--name-field-nom-catala field--type-string field--label-above')
    catalan_names = catalan_names_find.find(class_="field__item").text.split(", ") if catalan_names_find is not None else None
    
    catalan_names_url_find = soup.find("div", class_="field field--name-field-termcat field--type-link field--label-hidden field__items")
    catalan_names_url = catalan_names_url_find.div.a["href"] if catalan_names_url_find is not None else None
    
    spanish_names_find = soup.find("div", class_='field field--name-field-nom-castella field--type-string field--label-above')
    spanish_names = spanish_names_find.find(class_="field__item").text.split(", ") if spanish_names_find is not None else None
    
    occitan_names_find = soup.find("div", class_='field field--name-field-nom-occita field--type-string field--label-above')
    occitan_names = occitan_names_find.find(class_="field__item").text.split(", ") if occitan_names_find is not None else None
    
    french_names_find = soup.find("div", class_='field field--name-field-nom-frances field--type-string field--label-above')
    french_names = french_names_find.find(class_="field__item").text.split(", ") if french_names_find is not None else None
    
    english_names_find = soup.find("div", class_='field field--name-field-nom-angles field--type-string field--label-above')
    english_names = english_names_find.find(class_="field__item").text.split(", ") if english_names_find is not None else None

In [56]:
# Scratch version of get_taxonomy_data / get_subspecies.
# Unlike the functions, the four lookups below index [0] unguarded and raise
# IndexError when an element is missing.
if "Taxonomia" in summaries:
    tpl_name = soup.select('#edit-group-relacions > div > div.field.field--name-field-nom-cientific-tpl.field--type-string.field--label-inline.clearfix > div.field__item')[0].text
    apg_order = soup.select('#edit-group-relacions > div > div.field.field--name-field-taxonomia-apg.field--type-entity-reference.field--label-inline.field--entity-reference-target-type-taxonomy-term.clearfix > div.field__items > div:nth-child(1)')[0].text.split("Ord. ")[-1]
    apg_family = soup.select('#edit-group-relacions > div > div.field.field--name-field-taxonomia-apg.field--type-entity-reference.field--label-inline.field--entity-reference-target-type-taxonomy-term.clearfix > div.field__items > div:nth-child(2)')[0].text.split("Fam. ")[-1]
    apg_genus = soup.select('#edit-group-relacions > div > div.field.field--name-field-taxonomia-apg.field--type-entity-reference.field--label-inline.field--entity-reference-target-type-taxonomy-term.clearfix > div.field__items > div:nth-child(3)')[0].text.split("Gen. ")[-1]
    
    subspecies = []
    # `contains_subsp` comes from the rank-label scratch cell above.
    if contains_subsp:
        subsp_select = soup.select('#edit-group-relacions > div > div:nth-child(5) > div > div.field__items')
        subsp_a = subsp_select[0].find_all("a") if len(subsp_select) != 0 else list()
        for subsp in subsp_a:
            latin_name = subsp.text
            # The taxon code is the last path segment of the link.
            code = subsp["href"].split("/")[-1]
            subspecies.append({"latin_name": latin_name, "code": code})

[]


In [15]:
# Scratch version of get_flowering_data: month name -> True/False.
if "Floració" in summaries:
    flowering = {}
    # 4 groups (children 1-4) of 3 months each (children 2-4 of the group).
    seasons = range(1,5)
    for season_number in seasons:
        for month_number in range(2,5):
            selector = soup.select(f"#edit-group-floracio > div > div > div:nth-child({season_number}) > div:nth-child({month_number})")[0]
            month = selector.h7.text
            # Expand the abbreviated labels; all other labels are just upper-cased.
            if month == "Desem.":
                month = "DESEMBRE"
            elif month == "Setem.":
                month = "SETEMBRE"
            elif month == "Novem.":
                month = "NOVEMBRE"
            else:
                month = month.upper()
            flowering_value = selector.find(class_="field__item").text
            # Anything other than "SI" counts as not flowering.
            if flowering_value == "SI":
                flowering_value = True
            else:
                flowering_value = False
            flowering[month] = flowering_value

In [76]:
# Scratch version of get_ecology_data. Territory and altitude are located by
# position here (1st / 2nd fieldset), not by their title as in the function.
if "Ecologia" in summaries:
    frequency = safe_select(soup.select("#edit-group-mapa-distribucio > div > div.field.field--name-field-frequencia.field--type-string.field--label-inline.clearfix > div.field__item"))
    habitat = safe_select(soup.select("#edit-group-mapa-distribucio > div > div.clearfix.text-formatted.field.field--name-field-habitat.field--type-text-long.field--label-inline > div.field__item > p"))
    territory_fields_select = soup.select("#edit-group-mapa-distribucio > div > div:nth-child(3) > fieldset:nth-child(1) > div")
    # select() returns a (possibly empty) list, never None, so test its length.
    territory_fields = territory_fields_select[0].find_all("div") if len(territory_fields_select) > 0 else list()
    territory = {}
    for field in territory_fields:
        label = field.find(class_="field__label")
        item = field.find(class_="field__item")
        if label is not None and item is not None:
            item = item.text
            label = label.text
            if label == "Territori fisiogràfic a Catalunya":
                label = "fisiografic_catalunya"
            elif label == "Zones fitogeogràfiques":
                label = "zones_fitogeografiques"
            elif label == "Àrea de distribució general":
                label = "distribució_general"
            territory[label] = item
    
    altitude_fields_select = soup.select("#edit-group-mapa-distribucio > div > div:nth-child(3) > fieldset:nth-child(2) > div")
    # Same guard as above: an empty result must not be indexed.
    altitude_fields = altitude_fields_select[0].find_all("div") if len(altitude_fields_select) > 0 else list()
    altitude = {}
    for field in altitude_fields:
        label = field.find(class_="field__label")
        item = field.find(class_="field__item")
        if label is not None and item is not None:
            # ASCII-fold, lower-case and drop the last word of the label.
            label = "_".join(unidecode(label.text).lower().split()[:-1])
            item = item.text
            altitude[label] = int(item)

In [119]:
# Scratch lookup of the phytosociological value; unguarded, so it raises
# AttributeError when the page has no such block.
soup.find("div", class_="field field--name-field-adscripcio-fitosociologica field--type-entity-reference field--label-inline field--entity-reference-target-type-taxonomy-term clearfix").find(class_="field__item").text

'Silybo-Urticion'

In [None]:
# TODO: the "Bibliografia i crèdits" section is not parsed yet (placeholder).
if "Bibliografia i crèdits" in summaries:
    pass