# Scrape species data

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

# Pick exactly one list article to scrape; the alternatives are kept for reference.
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_culinary_fruits'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_birds_of_Denmark'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_mammals_of_Denmark'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_vegetables'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_leaf_vegetables'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_culinary_nuts'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/Plants_used_as_herbs_or_spices'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_edible_seeds'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_edible_flowers'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_forageable_plants'
urls_to_scrape = 'https://en.wikipedia.org/wiki/Root_vegetable'

# The last path segment of the URL doubles as the base name of the output file.
name = urls_to_scrape.rsplit('/', 1)[-1]

ranking_metric_choice = None  # Options: 'backlink', 'page_views', or None
top_n_articles = 10000

# Slice of the scraped entries to process; a window narrower than 50 is a "tiny" test run.
start_index = 0
end_index = 10000
tiny = end_index - start_index < 50
output_filename = f"{name}_tiny.json" if tiny else f"{name}.json"

# Selects the page-layout parser used by scrape_initial_wiki_page (10 = root vegetables).
article_type = 10

# Canonical taxon name -> lower-cased aliases that should be normalised to it.
_synonym_groups = {
    'Tracheophyta': ('tracheophytes', 'tracheophyta'),
    'Spermatophyta': ('seed plant', 'seed plants', 'spermatophytes'),
    'Magnoliophyta': ('angiosperms', 'angiosperm', 'magnoliophyta'),
    'Plantae': ('plantae', 'plant'),
    'Eudicotyledons': ('eudicots', 'eudicot'),
    'Rosids': ('rosids', 'rosid'),
    'Liliopsida': ('monocots', 'monocot', 'liliopsida'),
    'Magnoliidae': ('magnoliids', 'magnoliid'),
    'Eukaryota': ('eukaryota',),
    'Archaeplastida': ('archaeplastida',),
    'Viridiplantae': ('viridiplantae',),
    'Streptophyta': ('streptophyta',),
    'Diaphoretickes': ('diaphoretickes',),
    'Euphyllophyta': ('euphyllophyta',),
    'Ericales': ('ericales',),
    'Animalia': ('animal',),
}

# Flat alias -> canonical lookup used when normalising scraped taxon names.
taxon_synonyms = {
    alias: canonical
    for canonical, aliases in _synonym_groups.items()
    for alias in aliases
}

# Full rank ladder; it is immediately replaced by the plant-specific list below.
standard_ranks = [
    'superkingdom', 'kingdom', 'subkingdom', 'infrakingdom',
    'superphylum', 'phylum', 'subphylum',
    'superclass', 'class', 'subclass', 'infraclass',
    'superorder', 'order', 'suborder', 'infraorder',
    'superfamily', 'family', 'subfamily', 'tribe', 'subtribe',
    'genus', 'subgenus', 'species', 'subspecies',
]

# Wikipedia doesn't have good data on the other ranks :( These ones work for plants.
standard_ranks = [
    'superkingdom', 'kingdom', 'subkingdom', 'infrakingdom',
    'superdivision', 'division', 'subdivision',
    'order', 'family', 'genus', 'species',
]
# These work for animals, because apparently division and phylum are the same taxonomic rank,
# but which one to use depends on the kingdom.
# If I had known this I would have fixed it earlier on, but it's not worth it anymore.
# standard_ranks = ['superkingdom', 'kingdom', 'subkingdom', 'infrakingdom','superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'order', 'family', 'genus', 'species']

def _is_article_href(href):
    """Return True when *href* is a relative link to a regular Wikipedia article.

    Any href containing a colon is rejected, which also covers the
    ``/wiki/Special:`` prefix.
    """
    return href.startswith('/wiki/') and ':' not in href


def scrape_initial_wiki_page(url, start=30, end=40, article_type=article_type):
    """Scrape a Wikipedia list page into title/link/species entries.

    Args:
        url: Address of the list article to download.
        start: First index (inclusive) of the slice of scraped entries to return.
        end: Last index (exclusive) of the slice of scraped entries to return.
        article_type: Selects the parser matching the page layout:
            0 fruits, 1 Danish birds, 2 Danish mammals, 3 herbs or spices,
            4 (pseudo)cereals, 5 vegetables, 6 leaf vegetables, 7 nuts,
            8 flowers, 9 forageable plants, 10 root vegetables.

    Returns:
        ``title_data[start:end]``, a list of dicts with the keys ``'title'``
        (common name), ``'link'`` (absolute article URL) and ``'species'``
        (scientific name). The slice is also printed.

    Raises:
        ValueError: If the article body container cannot be found in the page.
    """
    # Send the same User-Agent as the other page scrapers in this file.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    if article_type in [7, 8]:  # The articles in this list depend on time of request
        content = soup.find('div', {'id': 'bodyContent'})
    else:
        content = soup.find('div', {'class': 'mw-parser-output'})
    if content is None:
        # Fail with a clear message instead of an AttributeError on None further down.
        raise ValueError(f"Could not find the article body in {url}")

    title_data = []

    if article_type == 0:  # Fruits
        tables = content.find_all('table', {'class': 'wikitable'})
        for table in tables:
            rows = table.find_all('tr')
            for row in rows[1:]:
                cells = row.find_all('td')
                if len(cells) >= 2:
                    a_tag = cells[0].find('a', href=True)
                    i_tag = cells[1].find('i')
                    if a_tag and i_tag:
                        text = a_tag.text.strip()
                        href = a_tag.get('href')
                        species_name = i_tag.text.strip()
                        if text and _is_article_href(href):
                            title_data.append({
                                'title': text,
                                'link': f"https://en.wikipedia.org{href}",
                                'species': species_name
                            })

    elif article_type == 1:  # Danish birds
        list_items = content.find_all('li')
        for item in list_items:
            i_tags = item.find_all('i')
            if i_tags:
                # The last italic run in the list item is taken as the scientific name.
                species_name = i_tags[-1].text.strip()
                title_text = item.get_text(separator=' ', strip=True)

                # Escape the scientific name so regex metacharacters in it are matched literally.
                common_name_text = re.sub(rf'\b{re.escape(species_name)}\b', '', title_text).strip()
                common_name_text = re.sub(r'\(\s*', '(', common_name_text)
                common_name_text = re.sub(r'\s*\)', ')', common_name_text)

                a_tag = item.find('a', href=True)
                if a_tag:
                    href = a_tag.get('href')
                    if _is_article_href(href):
                        title_data.append({
                            'title': common_name_text,
                            'link': f"https://en.wikipedia.org{href}",
                            'species': species_name
                        })

    elif article_type == 2:  # Danish Mammals
        list_items = content.find_all('li')
        for item in list_items:
            a_tag = item.find('a', href=True)
            i_tag = item.find('i')

            if a_tag and i_tag:
                common_name_with_status = a_tag.text.strip()
                species_name = i_tag.text.strip()

                href = a_tag.get('href')
                if _is_article_href(href):
                    title_data.append({
                        'title': common_name_with_status,
                        'link': f"https://en.wikipedia.org{href}",
                        'species': species_name
                    })

    elif article_type == 3:  # Herbs or Spices
        table = content.find('table', {'class': 'wikitable'})
        if table:
            rows = table.find_all('tr')[1:]
            for row in rows:
                cells = row.find_all('td')
                if len(cells) >= 2:
                    title = cells[0].get_text(strip=True)
                    species = cells[1].get_text(strip=True)
                    a_tag = cells[0].find('a', href=True)
                    href = a_tag.get('href') if a_tag else ''
                    if species and _is_article_href(href):
                        title_data.append({
                            'title': title,
                            'link': f"https://en.wikipedia.org{href}",
                            'species': species
                        })

    elif article_type == 4:  # (pseudo)cereals
        tables = content.find_all('table', {'class': 'wikitable'})
        cereals_table, pseudocereals_table = None, None

        # Identify the two tables by the text of the first header cell of their first row.
        for table in tables:
            title_row = table.find('tr')
            if title_row:
                th = title_row.find('th')
                if th and "Cereals" in th.get_text(strip=True):
                    cereals_table = table
                if th and "Pseudocereals" in th.get_text(strip=True):
                    pseudocereals_table = table
                if cereals_table and pseudocereals_table:
                    break

        def extract_table_data(table, has_tribe, is_pseudocereal=False):
            """Walk a family/(tribe)/genus/species/seed-name table, honouring rowspans."""
            entries = []
            rows = table.find_all('tr')
            current_family = None
            current_tribe = None
            current_genus = None
            # Number of following rows still covered by a rowspan cell of each column.
            family_rowspan = 0
            tribe_rowspan = 0
            genus_rowspan = 0

            for index, row in enumerate(rows):
                if index < 2:
                    # The first two rows are skipped (they are not data rows).
                    continue
                cells = row.find_all(['td', 'th'])
                cell_ptr = 0
                family = current_family
                tribe = current_tribe
                genus = current_genus

                if family_rowspan > 0:
                    family_rowspan -= 1
                else:
                    if cell_ptr < len(cells):
                        family_cell = cells[cell_ptr]
                        family = family_cell.get_text(strip=True)
                        if 'rowspan' in family_cell.attrs:
                            family_rowspan = int(family_cell['rowspan']) - 1
                        current_family = family
                        cell_ptr += 1

                if has_tribe:
                    if tribe_rowspan > 0:
                        tribe_rowspan -= 1
                    else:
                        if cell_ptr < len(cells):
                            tribe_cell = cells[cell_ptr]
                            tribe = tribe_cell.get_text(strip=True)
                            if 'rowspan' in tribe_cell.attrs:
                                tribe_rowspan = int(tribe_cell['rowspan']) - 1
                            current_tribe = tribe
                            cell_ptr += 1

                if genus_rowspan > 0:
                    genus_rowspan -= 1
                else:
                    if cell_ptr < len(cells):
                        genus_cell = cells[cell_ptr]
                        genus = genus_cell.get_text(strip=True)
                        if 'rowspan' in genus_cell.attrs:
                            genus_rowspan = int(genus_cell['rowspan']) - 1
                        current_genus = genus
                        cell_ptr += 1

                if cell_ptr < len(cells):
                    species_cell = cells[cell_ptr]
                    species_link = species_cell.find('a', href=True)
                    if species_link:
                        species = species_link.get_text(strip=True)
                        species_href = species_link['href']
                    else:
                        species = species_cell.get_text(strip=True)
                        species_href = ''
                    cell_ptr += 1
                else:
                    species = ''
                    species_href = ''

                if cell_ptr < len(cells):
                    seed_names_cell = cells[cell_ptr]
                    seed_names = seed_names_cell.get_text(strip=True)
                    cell_ptr += 1
                else:
                    seed_names = ''

                if species and seed_names and species_href:
                    if is_pseudocereal:
                        # Pseudocereals are marked with a trailing asterisk in the title.
                        seed_names += '*'
                    entry = {
                        'title': seed_names,
                        'species': species,
                        'link': f"https://en.wikipedia.org{species_href}"
                    }
                    entries.append(entry)
            return entries

        if cereals_table:
            table_data = extract_table_data(cereals_table, has_tribe=True, is_pseudocereal=False)
            title_data.extend(table_data)
        if pseudocereals_table:
            table_data = extract_table_data(pseudocereals_table, has_tribe=False, is_pseudocereal=True)
            title_data.extend(table_data)

        def extract_specific_ul_data(content):
            """Collect entries from the list following the 'Other grasses ...' paragraph."""
            entries = []
            specific_p = content.find('p', string=lambda text: text and "Other grasses with edible seeds include:" in text)
            if specific_p:
                specific_ul = specific_p.find_next_sibling('ul')
                if specific_ul:
                    li_elements = specific_ul.find_all('li')
                    for index, li in enumerate(li_elements):
                        try:
                            species_link = li.find('a', href=True)
                            species_href = species_link['href'] if species_link else ''
                            species = species_link.get_text(strip=True) if species_link else ''
                            parts = li.get_text(separator=" ", strip=True).split('–')
                            seed_name = parts[-1].strip() if len(parts) >= 2 else parts[0].strip()
                            print(f"List item {index +1}: Species='{species}', Seed names='{seed_name}', Link='{species_href}'")
                            if seed_name and species and species_href and len(seed_name) < 25:
                                entry = {
                                    'title': seed_name,
                                    'species': species,
                                    'link': f"https://en.wikipedia.org{species_href}"
                                }
                                entries.append(entry)
                            else:
                                print(f"Skipping list item {index +1}: Incomplete data.")
                        except Exception as e:
                            # Best effort: a malformed list item is reported and skipped.
                            print(f"Error processing list item {index +1}: {e}")
                            continue
            print(f"Total entries extracted from list: {len(entries)}")
            return entries

        specific_ul_data = extract_specific_ul_data(content)
        title_data.extend(specific_ul_data)

        # Entries added by hand on top of what the parsers above collect.
        manual_entries = [
            {
                'title': 'Wild rice',
                'species': 'Zizania palustris',
                'link': 'https://en.wikipedia.org/wiki/Wild_rice'
            },
            {
                'title': 'love-lies-bleeding*',
                'species': 'Amaranthus caudatus',
                'link': 'https://en.wikipedia.org/wiki/Amaranthus_caudatus'
            },
            {
                'title': 'prince-of-Wales feather*',
                'species': 'Amaranthus hypochondriacus',
                'link': 'https://en.wikipedia.org/wiki/Amaranthus_hypochondriacus'
            },
        ]

        title_data.extend(manual_entries)

    elif article_type == 5:  # Veggies
        tables = content.find_all('table')
        for table in tables:
            rows = table.find_all('tr')
            for row in rows[1:]:
                cells = row.find_all('td')
                if len(cells) >= 2:
                    a_tag = cells[0].find('a', href=True)
                    species_cell = cells[1]

                    species_name = species_cell.get_text(separator=" ", strip=True)

                    if a_tag:
                        text = a_tag.text.strip()
                        href = a_tag.get('href')
                        if text and _is_article_href(href):
                            title_data.append({
                                'title': text,
                                'link': f"https://en.wikipedia.org{href}",
                                'species': species_name
                            })

    elif article_type == 6:  # Leafy veggies
        tables = content.find_all('table', {'class': 'wikitable'})
        for table in tables:
            rows = table.find_all('tr')
            for row in rows[1:]:
                cells = row.find_all('td')
                if len(cells) >= 2:
                    # Here the first column holds the (italic) scientific name, the second the common name.
                    a_tag = cells[0].find('a', href=True)
                    i_tag = cells[0].find('i')
                    if a_tag and i_tag:
                        species_name = a_tag.text.strip()
                        href = a_tag.get('href')
                        common_name = cells[1].text.strip()
                        if species_name and _is_article_href(href):
                            title_data.append({
                                'species': species_name,
                                'link': f"https://en.wikipedia.org{href}",
                                'title': common_name
                            })

    elif article_type == 7:  # nuts
        list_items = content.find_all('li')
        for item in list_items:
            a_tag = item.find('a', href=True)
            if a_tag:
                href = a_tag['href']
                if _is_article_href(href):
                    common_name_text = a_tag.get_text(strip=True)
                    # The scientific name is the first parenthesised group of the list item.
                    species_match = re.search(r'\((.*?)\)', item.text)
                    if species_match:
                        species_name = species_match.group(1).strip()
                        if "spp" not in species_name.lower():
                            title_data.append({
                                'title': common_name_text,
                                'link': f"https://en.wikipedia.org{href}",
                                'species': species_name
                            })

    elif article_type == 8:  # Flowers
        tables = content.find_all('table', {'class': 'wikitable'})
        for table in tables:
            rows = table.find_all('tr')
            for row in rows[1:]:  # Skip header row
                cells = row.find_all('td')
                if len(cells) >= 4:  # Ensure there are at least 4 columns
                    a_tag = cells[0].find('a', href=True)
                    i_tag = cells[0].find('i')
                    common_name = cells[3].text.strip()
                    if a_tag and i_tag:
                        species_name = a_tag.text.strip()
                        href = a_tag.get('href')
                        if species_name and _is_article_href(href):
                            title_data.append({
                                'species': species_name,
                                'link': f"https://en.wikipedia.org{href}",
                                'title': common_name
                            })

    elif article_type == 9:  # Forageable
        tables = content.find_all('table')
        for table in tables:
            rows = table.find_all('tr')
            if len(rows) > 1:
                # Report each table once rather than once per row.
                print(f"Processing table with {len(rows) - 1} rows (excluding header).")
            for row in rows[1:]:
                cells = row.find_all('td')
                if len(cells) >= 4:
                    common_name = cells[2].text.strip()
                    a_tag = cells[3].find('a', href=True)
                    i_tag = cells[3].find('i')
                    if a_tag and i_tag:
                        species_name = a_tag.text.strip()
                        href = a_tag.get('href')

                        if species_name and _is_article_href(href):
                            title_data.append({
                                'species': species_name,
                                'link': f"https://en.wikipedia.org{href}",
                                'title': common_name
                            })

    elif article_type == 10:  # roots
        list_items = content.find_all('li')
        for item in list_items:
            # Only list items that contain a nested list are inspected.
            nested_ul = item.find('ul')
            if nested_ul:
                nested_lis = nested_ul.find_all('li')
                for nested_item in nested_lis:
                    a_tag = nested_item.find('a', href=True)
                    i_tag = nested_item.find('i')
                    if a_tag and i_tag:
                        species_name = a_tag.text.strip()
                        href = a_tag['href']
                        if _is_article_href(href) and "spp" not in species_name.lower():
                            # The common name is the first parenthesised group of the nested item.
                            common_name_match = re.search(r'\((.*?)\)', nested_item.text)
                            if common_name_match:
                                common_name = common_name_match.group(1).strip()
                                title_data.append({
                                    'title': common_name,
                                    'link': f"https://en.wikipedia.org{href}",
                                    'species': species_name
                                })

    print(title_data[start:end])
    return title_data[start:end]

# Only some articles have a QID, and direct scraping and QID lookup yield different kinds of data. The two are combined using propagate_taxonomic_info()
def get_qid_from_title(article_title, lang="en"):
    """Look up the Wikidata item id linked to a Wikipedia article title.

    Queries the ``pageprops`` of *article_title* on the ``lang`` Wikipedia and
    returns the first ``wikibase_item`` value found, or ``None`` when no
    returned page carries one.
    """
    endpoint = f"https://{lang}.wikipedia.org/w/api.php"
    query = {
        "action": "query",
        "titles": article_title,
        "prop": "pageprops",
        "format": "json",
        "ppprop": "wikibase_item"
    }
    data = requests.get(endpoint, params=query).json()

    # Pages are keyed by page id; only the page objects themselves matter here.
    for page in data.get('query', {}).get('pages', {}).values():
        props = page.get("pageprops", {})
        if "wikibase_item" in props:
            return props["wikibase_item"]
    return None

def search_wikidata(query):
    """Return the id of the best Wikidata item match for *query*, or ``None``.

    Uses the ``wbsearchentities`` API (English, items only, a single result).
    ``None`` is returned both when the search has no hits and when the
    response carries no ``search`` list at all (e.g. an API error payload),
    instead of raising ``KeyError`` in the latter case.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "search": query,
        "language": "en",
        "format": "json",
        "limit": 1,
        "type": "item"
    }
    response = requests.get(url, params=params)
    data = response.json()
    # Error responses have no 'search' key, so treat a missing key like "no hits".
    matches = data.get('search') or []
    if matches:
        return matches[0]['id']
    return None

def get_taxonomy_data(qid):
    """Fetch the ancestor taxa of a Wikidata item as a ``{rank: taxon}`` dict.

    Runs a SPARQL query that follows "parent taxon" (P171) transitively from
    *qid* and reads each ancestor's taxon rank (P105). Only ranks listed in
    ``standard_ranks`` are kept, and taxon labels are normalised through
    ``taxon_synonyms``. A later binding with the same rank overwrites an
    earlier one.
    """
    sparql = """
    SELECT ?parent ?parentLabel ?taxonRank ?taxonRankLabel
    WHERE {
      wd:%s wdt:P171* ?parent .
      OPTIONAL { ?parent wdt:P105 ?taxonRank . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
    """ % qid
    response = requests.get(
        "https://query.wikidata.org/sparql",
        headers={
            'Accept': 'application/sparql-results+json',
            'User-Agent': 'Mozilla/5.0'
        },
        params={'query': sparql},
    )
    bindings = response.json()['results']['bindings']

    taxonomy = {}
    for binding in bindings:
        # The rank is optional in the query, so it may be absent from a binding.
        rank = binding.get('taxonRankLabel', {}).get('value', '').lower()
        label = binding['parentLabel']['value']
        if not rank or rank not in standard_ranks:
            continue
        taxonomy[rank] = taxon_synonyms.get(label.lower(), label)
    return taxonomy

def get_taxonomic_data_from_wikipedia(url):
    """Read the taxobox of a Wikipedia article into a dict.

    Every two-cell row of the ``infobox biota`` table becomes one entry keyed
    by the lower-cased row label (colons removed). Rows whose label contains
    "clade" are stored as ``clade_1``, ``clade_2``, ... in page order. The
    page heading, the URL and a ``None`` QID are added under ``'title'``,
    ``'link'`` and ``'QID'``. An empty dict is returned when the page has no
    taxobox.
    """
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.content, 'html.parser')
    taxobox = soup.find('table', {'class': 'infobox biota'})
    if not taxobox:
        return {}

    taxonomic_data = {}
    clade_number = 1
    for row in taxobox.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            # Only "label | value" rows carry rank information.
            continue
        label_cell, value_cell = cells
        rank = label_cell.get_text(strip=True).replace(":", "").lower().strip()
        value = value_cell.get_text(strip=True)
        if "clade" in rank:
            taxonomic_data[f"clade_{clade_number}"] = value
            clade_number += 1
        elif rank:
            taxonomic_data[rank] = taxon_synonyms.get(value.lower(), value)

    heading = soup.find('h1', {'id': 'firstHeading'})
    taxonomic_data['title'] = heading.get_text(strip=True)
    taxonomic_data['link'] = url
    taxonomic_data['QID'] = None
    return taxonomic_data

def get_image_and_text_from_wikipedia(url):
    """Return ``(image URL, first paragraph)`` for a Wikipedia article.

    The image is the first ``<img>`` inside the ``infobox biota`` table, or,
    failing that, the first ``<img>`` anywhere on the page; ``None`` when
    neither yields a ``src``. The text is the first non-empty top-level
    paragraph of the article body with ``<sup>`` elements removed and spacing
    tidied, or ``"No description available"``.
    """
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.content, 'html.parser')

    def first_image_url(node):
        # Return the src of the first <img> under node, made absolute if protocol-relative.
        img = node.find('img') if node else None
        src = img.get('src') if img else None
        if not src:
            return None
        return 'https:' + src if src.startswith('//') else src

    taxobox = soup.find('table', {'class': 'infobox biota'})
    img_link = first_image_url(taxobox) or first_image_url(soup)

    description = None
    content = soup.find('div', {'class': 'mw-parser-output'})
    if content:
        for paragraph in content.find_all('p', recursive=False):
            # Drop <sup> elements before extracting the text.
            for sup in paragraph.find_all('sup'):
                sup.decompose()
            candidate = paragraph.get_text(separator=' ', strip=True)
            if not candidate:
                continue
            candidate = re.sub(r'\s+([.,!?;:])', r'\1', candidate)
            description = re.sub(r'\s{2,}', ' ', candidate)
            break

    return img_link, description or "No description available"

def get_pageviews_bulk(titles):
    """Return ``{title: total page views}`` for the given article titles.

    Titles are sent to the English Wikipedia API in batches of 50. For each
    returned page the non-empty values of its ``pageviews`` mapping are
    summed; the result is keyed by the title as reported by the API.
    """
    endpoint = "https://en.wikipedia.org/w/api.php"
    batch_size = 50
    pageviews = {}
    for offset in range(0, len(titles), batch_size):
        chunk = titles[offset:offset + batch_size]
        response = requests.get(endpoint, params={
            'action': 'query',
            'titles': '|'.join(chunk),
            'prop': 'pageviews',
            'format': 'json'
        })
        pages = response.json().get('query', {}).get('pages', {})
        for page in pages.values():
            daily_views = page.get('pageviews', {})
            # Falsy values (e.g. None or 0) are left out of the sum.
            pageviews[page['title']] = sum(filter(None, daily_views.values()))
    return pageviews

def get_backlinks_count(title):
    """Count the pages linking to *title* on the English Wikipedia.

    Pages through the ``backlinks`` list of the API, following the
    ``continue`` tokens until the API stops returning one, and returns the
    total number of entries seen.
    """
    endpoint = "https://en.wikipedia.org/w/api.php"
    request_params = {
        'action': 'query',
        'list': 'backlinks',
        'bltitle': title,
        'bllimit': 'max',
        'format': 'json'
    }
    count = 0
    while True:
        data = requests.get(endpoint, params=request_params).json()
        count += len(data.get('query', {}).get('backlinks', []))
        if 'continue' not in data:
            return count
        # Merge the continuation tokens into the next request.
        request_params.update(data['continue'])

def get_backlinks_counts(titles):
    """Map each title in *titles* to its backlink count (one lookup per title)."""
    return {title: get_backlinks_count(title) for title in titles}

# Fill in blanks, e.g. if two species have the same genus, I assume they have the same family and so on (this isn't 100% correct, but it gives me way more data)
def propagate_taxonomic_info(df, taxonomic_ranks):
    """Fill in parent ranks from the most common parent of each child taxon.

    Walks the ranks from the most specific to the most general. For every
    distinct value of a rank, the most frequent non-null value of the next
    more general rank among the matching rows is written to all of those rows.

    Args:
        df: DataFrame with one column per taxonomic rank; modified in place.
        taxonomic_ranks: Rank names ordered from most general to most
            specific. Ranks that are not columns of *df* are skipped, so the
            nearest ranks that are present are paired with each other.

    Returns:
        The same DataFrame, for convenience.
    """
    # Ignore ranks the frame has no column for instead of failing with KeyError.
    present_ranks = [rank for rank in taxonomic_ranks if rank in df.columns]

    for current_rank, parent_rank in zip(present_ranks[::-1], present_ranks[-2::-1]):
        for item in df[current_rank].dropna().unique():
            members = df[current_rank] == item
            parent_mode = df.loc[members, parent_rank].dropna().mode()
            most_common_parent = parent_mode.iloc[0] if not parent_mode.empty else None
            print(f"Most common {parent_rank} for {current_rank} {item}: {most_common_parent}")

            if most_common_parent is None:
                # Nothing known about the parent: leave the (already empty) cells alone.
                continue
            df.loc[members, parent_rank] = most_common_parent

    return df

def extract_species(name):
    """Return the second word (the specific epithet) of a scientific name.

    The name is split on runs of dots and whitespace, so both
    ``"Daucus carota"`` and ``"D. carota"`` give ``"carota"``. Surrounding
    whitespace and empty fragments are ignored; a name with fewer than two
    words is returned stripped.
    """
    stripped = name.strip()
    # Drop empty fragments produced by leading/trailing separators.
    parts = [part for part in re.split(r'[.\s]+', stripped) if part]
    if len(parts) > 1:
        return parts[1]
    return stripped
    
def filter_img_links(link):
    """Return *link* unchanged, or an empty string when it contains ``.svg``."""
    return link if '.svg' not in link else ''

def process_links(urls, ranking_metric=None, start=0, end=10, top_n=100, output_file="taxonomy_data.json"):
    """Scrape list pages, enrich every entry with taxonomy, and save the result.

    Args:
        urls: One list-article URL or a list of them.
        ranking_metric: ``'page_views'``, ``'backlink'`` or ``None``; when set,
            entries are sorted by that metric (descending) before truncation.
        start: Start of the slice of entries taken from each scraped page.
        end: End of the slice of entries taken from each scraped page.
        top_n: Number of entries to keep; a float is read as a fraction of
            the available entries.
        output_file: Path of the JSON-lines file to write.

    Returns:
        The resulting DataFrame (also written to *output_file*), restricted to
        rows that have both a genus and a species.
    """
    if isinstance(urls, str):
        urls = [urls]

    all_title_data = []
    for url in urls:
        all_title_data.extend(scrape_initial_wiki_page(url, start=start, end=end))

    # De-duplicate while keeping scrape order, so that the rows kept by top_n
    # do not depend on the iteration order of a set.
    unique_entries = dict.fromkeys(tuple(entry.items()) for entry in all_title_data)
    all_title_data = [dict(entry) for entry in unique_entries]

    titles = [item['title'] for item in all_title_data]
    if ranking_metric == 'page_views':
        ranking_dict = get_pageviews_bulk(titles)
        sort_key = 'pageviews'
    elif ranking_metric == 'backlink':
        ranking_dict = get_backlinks_counts(titles)
        sort_key = 'backlinks'
    else:
        sort_key = None
    if sort_key:
        for item in all_title_data:
            item[sort_key] = ranking_dict.get(item['title'], 0)
        all_title_data = sorted(all_title_data, key=lambda x: x.get(sort_key, 0), reverse=True)

    if isinstance(top_n, float):
        num_articles = int(top_n * len(all_title_data))
    else:
        num_articles = int(top_n)
    all_title_data = all_title_data[:num_articles]

    results = []
    for item in all_title_data:
        title = item['title']
        link = item['link']
        species = item.get('species', None)

        # Resolve the Wikidata id from the scientific name when there is one,
        # otherwise from the common name; fall back to a free-text search.
        lookup_name = species if species else title
        qid = get_qid_from_title(lookup_name)
        if not qid:
            qid = search_wikidata(lookup_name)

        taxonomy = get_taxonomy_data(qid) if qid else None

        result_dict = {
            'title': title,
            'link': link,
            'QID': qid if qid else None
        }
        if species:
            result_dict['species'] = species
        if sort_key:
            result_dict[sort_key] = item.get(sort_key, 0)

        if not taxonomy:
            # No usable Wikidata taxonomy: fall back to the article's taxobox.
            taxonomy = get_taxonomic_data_from_wikipedia(link)
        # Never overwrite values that came from the list page itself.
        for key, value in (taxonomy or {}).items():
            if key not in result_dict:
                result_dict[key] = value

        img_link, text = get_image_and_text_from_wikipedia(link)
        result_dict['img_link'] = filter_img_links(img_link) if img_link else ''
        result_dict['text'] = text if text else "No description available"
        results.append(result_dict)

    df = pd.DataFrame(results)
    df = df.loc[:, ~df.columns.duplicated()]
    columns_order = ['title', 'link', 'QID', 'img_link', 'text']
    if sort_key:
        columns_order.append(sort_key)
    columns_order += standard_ranks
    columns_order = [col for col in columns_order if col in df.columns]
    # Work on an explicit copy so later column assignments do not hit a view.
    df = df[columns_order].copy()

    # The filtering below needs these columns even if no entry provided them.
    for required in ('genus', 'species'):
        if required not in df.columns:
            df[required] = None

    # Only propagate across ranks that actually ended up as columns.
    present_ranks = [rank for rank in standard_ranks if rank in df.columns]
    df = propagate_taxonomic_info(df, present_ranks)

    df = df.dropna(subset=['genus', 'species']).copy()
    df['species_cleaned'] = df['species'].apply(extract_species)
    df = df.drop_duplicates(subset=['species_cleaned', 'title'])
    df = df.fillna('')
    # Built from plain records so that an empty frame is handled as well.
    df['species_info'] = [
        {
            'species_name': record['species_cleaned'],
            'title': record['title'],
            'link': record['link'],
            'img_link': record['img_link'],
            'text': record['text']
        }
        for record in df.to_dict('records')
    ]
    df.to_json(output_file, orient='records', lines=True)
    return df

# Run the whole pipeline with the settings chosen at the top of the notebook.
final_df = process_links(
    urls_to_scrape,
    ranking_metric_choice,
    start_index,
    end_index,
    top_n_articles,
    output_filename,
)
# Number of rows that survived the filtering.
print(len(final_df.index))

# Preview the first five rows.
final_df.head()


# Scrape the other taxons

In [125]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

# urls_to_scrape = 'https://en.wikipedia.org/wiki/List_of_edible_seeds'
# Dataset name = last path segment of the source URL.
name = urls_to_scrape.split('/')[-1]
# Overrides any earlier value of `tiny`, so the full-size file name is used below.
tiny = False
# Input: species table written by the first scraping step.
original_data_filename = f"{name}.json" if not tiny else f"{name}_tiny.json"
# Output: one record per higher-rank taxon.
scraped_data_filename = f"{name}_scraped.json"

def filter_img_links(link):
    """Blank out SVG image links.

    Returns an empty string when ``'.svg'`` occurs anywhere in ``link``;
    otherwise returns ``link`` unchanged.
    """
    if '.svg' in link:
        return ''
    return link

def get_image_and_text_from_wikipedia(url):
    """Fetch a Wikipedia article and return ``(img_link, text)``.

    Args:
        url: Address of the article to download.

    Returns:
        A tuple ``(img_link, text)``:

        * ``img_link`` is the ``src`` of the first image inside the
          ``infobox biota`` table, falling back to the first image anywhere
          on the page. Protocol-relative sources (``//...``) are prefixed
          with ``https:``. The result is passed through ``filter_img_links``
          and is ``''`` when nothing usable is found.
        * ``text`` is the first non-empty top-level paragraph of the
          ``mw-parser-output`` container, or ``"No description available"``.

    When the request fails or the server answers with an HTTP error status,
    a message is printed and ``('', "No description available")`` is
    returned, so an error page is never scraped as if it were an article.
    """
    default_text = "No description available"
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # requests never times out by default; without a timeout a single
        # stalled connection would hang the whole scraping loop.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Could not fetch {url}: {exc}")
        return '', default_text
    if not response.ok:
        print(f"Could not fetch {url}: HTTP {response.status_code}")
        return '', default_text

    soup = BeautifulSoup(response.content, 'html.parser')

    def first_image_url(scope):
        # Absolute src of the first <img> below `scope`, or '' if there is none.
        if scope is None:
            return ''
        img = scope.find('img')
        src = img.get('src') if img is not None else None
        if not src:
            return ''
        return 'https:' + src if src.startswith('//') else src

    # Prefer the taxobox picture; otherwise take the first image on the page.
    img_link = first_image_url(soup.find('table', {'class': 'infobox biota'}))
    if not img_link:
        img_link = first_image_url(soup)
    img_link = filter_img_links(img_link)

    text = default_text
    content = soup.find('div', {'class': 'mw-parser-output'})
    if content is not None:
        for paragraph in content.find_all('p', recursive=False):
            # Drop <sup> elements before extracting the text.
            for sup in paragraph.find_all('sup'):
                sup.decompose()
            text_content = paragraph.get_text(separator=' ', strip=True)
            if text_content:
                # The ' ' separator leaves spaces in front of punctuation
                # and can produce runs of whitespace; tidy both.
                text_content = re.sub(r'\s+([.,!?;:])', r'\1', text_content)
                text_content = re.sub(r'\s{2,}', ' ', text_content)
                text = text_content
                break
    return img_link, text

def get_taxon_data(taxon_name):
    """Build the scraped record for one taxon.

    The English Wikipedia URL is formed from ``taxon_name`` with spaces
    replaced by underscores, then the page's image link and text are fetched.

    Returns:
        A dict with the keys ``name``, ``link``, ``img_link`` and ``text``.
    """
    page_title = taxon_name.replace(' ', '_')
    link = f"https://en.wikipedia.org/wiki/{page_title}"
    image_url, summary = get_image_and_text_from_wikipedia(link)
    record = {'name': taxon_name, 'link': link}
    record['img_link'] = image_url
    record['text'] = summary
    return record

# Load the species table (one JSON record per line) and blank out missing values.
df_original = pd.read_json(original_data_filename, orient='records', lines=True)
df_original.fillna('', inplace=True)


# Ranks whose taxa each get their own Wikipedia lookup.
taxa_columns = ['kingdom', 'infrakingdom', 'superdivision', 'division', 'subdivision', 'order', 'family', 'genus']
# taxa_columns = ['superkingdom', 'kingdom', 'subkingdom', 'infrakingdom','superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'order', 'family', 'genus', 'species']
unique_taxa = set()
for col in taxa_columns:
    # A rank column may be missing from the input file; skip it rather than
    # aborting the whole run with a KeyError.
    if col not in df_original.columns:
        continue
    # Normalise before de-duplicating so names that differ only in
    # surrounding whitespace are requested once.
    unique_taxa.update(
        str(value).strip() for value in df_original[col].dropna().unique()
    )
# '' is the fillna placeholder for "no taxon at this rank".
unique_taxa.discard('')

# One HTTP request per distinct taxon, in alphabetical order.
scraped_data = [get_taxon_data(taxon) for taxon in sorted(unique_taxa)]

df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_json(scraped_data_filename, orient='records', lines=True)


# Make plot

In [None]:
import re
import pandas as pd
from bokeh.models import ColumnDataSource, HoverTool, TapTool, CustomJS, WheelZoomTool, ResetTool, PanTool
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.io import curdoc

# Title passed to output_file() for the generated HTML page.
tab_title = 'Edible Plants'
# urls_to_scrape = 'https://en.wikipedia.org/wiki/Plants_used_as_herbs_or_spices'
name = urls_to_scrape.split('/')[-1]
# NOTE(review): this immediately overrides the URL-derived name above.
name = 'edible_plants'
# `tiny` is not set in this cell; it comes from an earlier cell.
original_data_filename = f"{name}.json" if not tiny else f"{name}_tiny.json"
scraped_data_filename = f"{name}_scraped.json"
output_plot_filename = f"{name}.html" if not tiny else f"{name}_tiny.html"

# Ranks used as tree levels, from root to leaf. make_tree turns the rows of a
# 'genus' node into its leaf children.
rankings = [
    'kingdom',
    'division',
    'order',
    'family',
    'genus',
    'species'
]

# Root of the drawn tree; root_taxon_rank must be an entry of `rankings`.
root_taxon_name = 'Plantae'
root_taxon_rank = 'kingdom'
# Figure height handed to bokeh's figure().
plot_height = 60000
# Vertical offset of internal-node labels: constant part and per-level part.
y_offset_internal = 0.5
y_offset_multiplier = 0.07
# Padding added to the data extent when setting the axis ranges.
x_range_offset = 2
y_range_offset_max = 12
y_range_offset_min = 1
# Label font size in px = (levels above the deepest level) * text_multiplier + min_text.
text_multiplier = 1
min_text = 15

# Extra hand-drawn line segments, each [(x0, y0), (x1, y1)]; none for this tree.
horizontal_lines_manual = [
]

vertical_lines_manual = [
]

# Hand-placed labels: [x, y, base_size, type_, text, hover_img, hover_text, link].
nodes_manual = [
    [0.1, 2756, 4, "edge", "Go Back", "images/taxotree4.png", "", "https://kochjar.me/trees"],
    [3, 2756, 26, "edge", "Edible Plants", "images/spongebob.png", "ive actually heard ppl sad this... this is sadly not a strawman", "https://www.youtube.com/watch?v=r7iT5GM3MgM"],
    [3, 2752, 14, "edge", "This is tree containing species from all other edible plant trees\nFor the sources check out the other trees.\nClick \"Go Back\" to see them\nIf you encounter lag, try using Chromium.", "images/large_tree.jpg", "I like big trees, they can't runaway from hugs >:3", "https://en.wikipedia.org/wiki/General_Sherman_(tree)"],
]



class TreeNode:
    """One node of the taxonomy tree.

    Attributes are plain and public: ``name``, ``depth`` and ``data`` come
    from the constructor; ``children`` starts as an empty list; ``size``,
    ``x`` and ``y`` start at 0 and are filled in by the layout helpers.
    """

    def __init__(self, name, depth, data=None):
        # Identity and payload supplied by the caller.
        self.name, self.depth, self.data = name, depth, data
        # Child nodes; every instance gets its own list.
        self.children = list()
        # Layout state, overwritten later.
        self.size = 0
        self.x = self.y = 0

def extract_species(name):
    """Return the second word of a scientific name (the species epithet).

    The name is split on runs of dots and whitespace, so ``"D. carota"`` and
    ``"Daucus carota subsp. sativus"`` both give ``"carota"``. Empty pieces
    produced by leading or trailing separators are ignored, so
    ``" Daucus carota"`` also gives ``"carota"`` rather than the genus.

    Args:
        name: Scientific name as a string.

    Returns:
        The second non-empty word, or ``name`` unchanged when it has fewer
        than two words.
    """
    words = [word for word in re.split(r'[.\s]+', name) if word]
    return words[1] if len(words) > 1 else name

def filter_img_links(link):
    """Return ``''`` for links containing ``'.svg'``, else the link itself."""
    if '.svg' in link:
        return ''
    return link

def make_tree(rank_name, rank_level, df_original, df_scraped):
    """Recursively build the subtree for the taxon ``rank_name``.

    Args:
        rank_name: Name of the taxon at this level.
        rank_level: Index of this level in the module-level ``rankings`` list.
        df_original: Species table; read for the rank columns and for the
            ``species_info`` dicts attached to each row.
        df_scraped: Per-taxon table with ``name``, ``link``, ``img_link`` and
            ``text`` columns; the first row whose ``name`` matches supplies
            the node's data.

    Returns:
        A ``TreeNode`` whose leaves are species, or ``None`` when
        ``rank_level`` is past the end of ``rankings`` or the taxon ends up
        with no children.
    """
    if rank_level >= len(rankings):
        return None

    def scraped_metadata():
        # Data of the first scraped row for this taxon; {} when it was never scraped.
        matches = df_scraped[df_scraped['name'] == rank_name]
        if matches.empty:
            return {}
        first = matches.iloc[0]
        return {
            'name': rank_name,
            'link': first['link'],
            'img_link': first['img_link'],
            'text': first['text']
        }

    current_rank = rankings[rank_level]
    if current_rank == 'genus':
        # A genus node's children are the species rows themselves,
        # in reverse alphabetical order of species name.
        members = df_original[df_original["genus"] == rank_name]["species_info"].tolist()
        members.sort(key=lambda info: info['species_name'], reverse=True)
        if not members:
            return None
        node = TreeNode(rank_name, rank_level, scraped_metadata())
        node.children.extend(
            TreeNode(info['species_name'], rank_level + 1, info)
            for info in members
        )
    else:
        node = TreeNode(rank_name, rank_level, scraped_metadata())
        child_rank = rankings[rank_level + 1]
        rows_in_taxon = df_original[df_original[current_rank] == rank_name]
        child_names = rows_in_taxon[child_rank].dropna().unique().tolist()
        child_names.sort(reverse=True)
        for child_name in child_names:
            # Falsy names (e.g. '') mean the row has no taxon at the next rank.
            if not child_name:
                continue
            subtree = make_tree(child_name, rank_level + 1, df_original, df_scraped)
            if subtree is not None:
                node.children.append(subtree)

    # Branches that never reach a species are pruned.
    return node if node.children else None

def assign_coordinates(tree):
    """Assign plot coordinates to every node of ``tree`` in place.

    * ``x`` is the node's level below the root (root = 0).
    * Leaves get consecutive integer ``y`` rows in traversal order; one row
      is skipped after each internal node's last descendant, which leaves a
      gap between neighbouring subtrees.
    * An internal node's ``y`` is the mean of its direct children's ``y``.

    Done in a single post-order pass. The results are the same as the
    previous per-depth implementation, without its unused helpers.

    Args:
        tree: Root node; every node needs a ``children`` list and writable
            ``x`` and ``y`` attributes.

    Returns:
        The same ``tree`` object.
    """
    def place(node, level, next_row):
        # Positions `node` and its subtree; returns the next free y row.
        node.x = level
        if not node.children:
            node.y = next_row
            return next_row + 1
        for child in node.children:
            next_row = place(child, level + 1, next_row)
        child_rows = [child.y for child in node.children]
        node.y = sum(child_rows) / len(child_rows)
        # Skip one row after a finished subtree.
        return next_row + 1

    place(tree, 0, 0)
    return tree

def calculate_subtree_sizes(node):
    """Store in ``node.size`` the number of leaves below ``node``.

    A node without children counts as one leaf. Sizes are written for the
    whole subtree, children first. Returns ``None``.
    """
    if node.children:
        for child in node.children:
            calculate_subtree_sizes(child)
        node.size = sum(child.size for child in node.children)
    else:
        node.size = 1

def collect_nodes_edges(node, nodes_list, edges_list):
    """Append the subtree under ``node`` to the two output lists.

    ``nodes_list`` receives every node in pre-order. ``edges_list`` receives
    one ``(parent, child)`` tuple per link, each edge being added before
    anything from the child's own subtree. Returns ``None``.
    """
    no_parent = object()
    # Explicit stack instead of recursion; children are pushed in reverse so
    # the first child is handled first.
    pending = [(no_parent, node)]
    while pending:
        parent, current = pending.pop()
        if parent is not no_parent:
            edges_list.append((parent, current))
        nodes_list.append(current)
        pending.extend((current, child) for child in reversed(current.children))

def draw_tree_bokeh(nodes_list, edges_list, manual_lines, manual_nodes):
    """Render the laid-out tree as an interactive Bokeh plot, then save and show it.

    Args:
        nodes_list: Tree nodes with ``x``/``y`` already assigned; each is
            read for ``name``, ``data``, ``children``, ``x`` and ``y``.
        edges_list: ``(parent, child)`` node pairs, drawn as elbow connectors.
        manual_lines: Not read anywhere in this function; the extra line
            segments come from the module-level ``horizontal_lines_manual``
            and ``vertical_lines_manual`` instead.
        manual_nodes: Hand-placed labels, each
            ``[x, y, base_size, type_, text, hover_img, hover_text, link]``.

    Also reads the module-level settings ``plot_height``, ``x_range_offset``,
    ``y_range_offset_min``, ``y_range_offset_max``, ``text_multiplier``,
    ``min_text``, ``y_offset_internal``, ``y_offset_multiplier``,
    ``output_plot_filename`` and ``tab_title``.

    Returns:
        None. The plot is written via ``output_file`` and opened with ``show``.
    """
    placeholder_image = "https://via.placeholder.com/150"
    # NOTE(review): TreeNode.__init__ always sets x = 0, so the temporary
    # nodes built from manual_nodes only contribute 0 to this maximum; the
    # manual x coordinate (n[0]) ends up in `depth`, not `x`.
    max_depth = max(node.x for node in nodes_list + [
        TreeNode(n[3], n[0], {'link': n[7], 'img_link': n[5], 'text': n[6]}) for n in manual_nodes
    ])
    
    p = figure(
        title="",
        background_fill_color='black',
        border_fill_color='black',
        outline_line_color='black',
        sizing_mode="stretch_width",
        height=plot_height,
        x_range=(0, max_depth + x_range_offset),
        y_range=(-y_range_offset_min, max(node.y for node in nodes_list) + y_range_offset_max)
    )
    
    p.toolbar_location = "right"
    p.toolbar_sticky = True
    
    # Each edge becomes an elbow: a horizontal run at the parent's y followed
    # by a vertical run at the child's x. Lines get thinner with depth.
    xs_lines = []
    ys_lines = []
    line_widths = []
    for parent, child in edges_list:
        size_multiplier = (max_depth - parent.x)
        line_thickness = 3 * size_multiplier + 2
        # The horizontal run starts slightly left of the parent; the shift
        # grows with the line thickness.
        xs_lines.append([parent.x - 1/450 * (3*(max_depth - parent.x) + 2), child.x])
        ys_lines.append([parent.y, parent.y])
        line_widths.append(line_thickness)
        xs_lines.append([child.x, child.x])
        ys_lines.append([parent.y, child.y])
        line_widths.append(line_thickness)
        # Leaves get a short thin stub leading to their label.
        if not child.children:
            xs_lines.append([child.x, child.x + 0.5])
            ys_lines.append([child.y, child.y])
            line_widths.append(2)
    
    # Column data for the two kinds of tree labels. 'original_text_size'
    # keeps the font size so the zoom callback can restore it after hiding.
    labels_data_internal = {
        'x': [], 'y': [], 'text': [], 'link': [], 'img': [],
        'text_size': [], 'original_text_size': [], 'depth': [], 'wikitext': [], 'text_align': []
    }
    labels_data_leaf = {
        'x': [], 'y': [], 'text': [], 'link': [], 'img': [],
        'text_size': [], 'original_text_size': [], 'depth': [], 'wikitext': [], 'text_align': []
    }
    
    for node in nodes_list:
        size_multiplier = (max_depth - node.x)
        text_size = size_multiplier * text_multiplier + min_text
        if node.children:
            # Internal label: centred over the outgoing horizontal run and
            # lifted above the line, more so for shallower nodes.
            labels_data_internal['x'].append(node.x + 0.5)
            labels_data_internal['y'].append(node.y + y_offset_internal + size_multiplier * y_offset_multiplier)
            labels_data_internal['depth'].append(node.x)
            labels_data_internal['text_size'].append(f"{text_size}px")
            labels_data_internal['original_text_size'].append(f"{text_size}px")
            labels_data_internal['text'].append(node.name)
            labels_data_internal['link'].append(node.data.get('link', ''))
            labels_data_internal['img'].append(node.data.get('img_link', placeholder_image))
            labels_data_internal['wikitext'].append(node.data.get('text', ""))
            labels_data_internal['text_align'].append('center')
        elif node.x == max_depth:
            # Leaf label, placed just past the end of the leaf stub. Only
            # childless nodes at the deepest level are labelled.
            species_info = node.data
            labels_data_leaf['x'].append(node.x + 0.52)
            labels_data_leaf['y'].append(node.y)
            labels_data_leaf['depth'].append(node.x)
            labels_data_leaf['text_size'].append(f"{text_size}px")
            labels_data_leaf['original_text_size'].append(f"{text_size}px")
            labels_data_leaf['text'].append(f"{species_info.get('species_name', '')} ── {species_info.get('title', '')}")
            labels_data_leaf['link'].append(species_info.get('link', ''))
            labels_data_leaf['img'].append(species_info.get('img_link', placeholder_image))
            labels_data_leaf['wikitext'].append(species_info.get('text', ""))
            labels_data_leaf['text_align'].append('left') 
     
    # Manual labels are split by whether they have anything to show on hover
    # (an image or a text); only the first group is attached to a hover tool.
    manual_with_hover = [n for n in manual_nodes if n[5] or n[6]]
    manual_without_hover = [n for n in manual_nodes if not n[5] and not n[6]]
    
    labels_manual_with_hover = {
        'x': [], 'y': [], 'text': [], 'link': [], 'img': [],
        'text_size': [], 'original_text_size': [], 'depth': [], 'wikitext': [], 'text_align': []
    }
    labels_manual_without_hover = {
        'x': [], 'y': [], 'text': [], 'link': [], 'img': [],
        'text_size': [], 'original_text_size': [], 'depth': [], 'wikitext': [], 'text_align': []
    }
    
    for node in manual_with_hover:
        x, y, base_size, type_, text, hover_img, hover_text, link = node
        size_multiplier = (max_depth - x)
        text_size = base_size + size_multiplier * 4
        labels_manual_with_hover['x'].append(x)
        # "node" labels are lifted above their y; other types sit exactly on it.
        if type_ == "node":
            labels_manual_with_hover['y'].append(y + (max_depth - x) * y_offset_internal)
        else:
            labels_manual_with_hover['y'].append(y)
        labels_manual_with_hover['depth'].append(x)
        labels_manual_with_hover['text_size'].append(f"{text_size}px")
        labels_manual_with_hover['original_text_size'].append(f"{text_size}px")
        labels_manual_with_hover['text'].append(text)
        labels_manual_with_hover['link'].append(link)
        labels_manual_with_hover['img'].append(hover_img if hover_img else placeholder_image)
        labels_manual_with_hover['wikitext'].append(hover_text)
        # "edge" labels are left-aligned, everything else is centred.
        if type_ == "edge":
            labels_manual_with_hover['text_align'].append('left')
        else:
            labels_manual_with_hover['text_align'].append('center')
    
    # Same as above, but with empty hover image/text.
    for node in manual_without_hover:
        x, y, base_size, type_, text, hover_img, hover_text, link = node
        size_multiplier = (max_depth - x)
        text_size = base_size + size_multiplier * 4
        labels_manual_without_hover['x'].append(x)
        if type_ == "node":
            labels_manual_without_hover['y'].append(y + (max_depth - x) * y_offset_internal)
        else:
            labels_manual_without_hover['y'].append(y)
        labels_manual_without_hover['depth'].append(x)
        labels_manual_without_hover['text_size'].append(f"{text_size}px")
        labels_manual_without_hover['original_text_size'].append(f"{text_size}px")
        labels_manual_without_hover['text'].append(text)
        labels_manual_without_hover['link'].append(link)
        labels_manual_without_hover['img'].append('')
        labels_manual_without_hover['wikitext'].append('')
        if type_ == "edge":
            labels_manual_without_hover['text_align'].append('left')
        else:
            labels_manual_without_hover['text_align'].append('center')

    labels_source_internal = ColumnDataSource(data=labels_data_internal)
    labels_source_leaf = ColumnDataSource(data=labels_data_leaf)
    labels_manual_with_hover_source = ColumnDataSource(data=labels_manual_with_hover)
    labels_manual_without_hover_source = ColumnDataSource(data=labels_manual_without_hover)

    # 'original_line_widths' is a separate copy so the zoom callback can
    # rescale 'line_widths' without losing the base values.
    lines_source = ColumnDataSource(data=dict(
        xs=xs_lines,
        ys=ys_lines,
        line_widths=line_widths,
        original_line_widths=line_widths.copy()
    ))
    p.multi_line(xs='xs', ys='ys', source=lines_source, line_color="white", line_width='line_widths')

    # One text renderer per label group, so the hover and tap tools can be
    # attached to specific groups.
    internal_text_renderer = p.text(
        x='x',
        y='y',
        text='text',
        source=labels_source_internal,
        text_color='white',
        text_font_size='text_size',
        text_baseline='middle',
        text_align='text_align'
    )
    
    leaf_text_renderer = p.text(
        x='x',
        y='y',
        text='text',
        source=labels_source_leaf,
        text_color='white',
        text_font_size='text_size',
        text_baseline='middle',
        text_align='text_align'
    )
    
    manual_with_hover_renderer = p.text(
        x='x',
        y='y',
        text='text',
        source=labels_manual_with_hover_source,
        text_color='white',
        text_font_size='text_size',
        text_baseline='middle',
        text_align='text_align'
    )
    
    manual_without_hover_renderer = p.text(
        x='x',
        y='y',
        text='text',
        source=labels_manual_without_hover_source,
        text_color='white',
        text_font_size='text_size',
        text_baseline='middle',
        text_align='text_align'
    )
    
    # HTML tooltip showing the 'img' and 'wikitext' columns.
    # NOTE(review): the `@{... ? ... : ...}` fragments look like conditional
    # expressions; confirm that Bokeh's tooltip templating evaluates them.
    TOOLTIPS = """
    <div style="width: 350px; background-color: rgba(221, 221, 221, 1); padding: 10px;">
        <div style="text-align: center;">
            <img src="@img" alt=""
                style="
                    max-height: 300px; 
                    max-width: 300px; 
                    height: auto; 
                    width: auto; 
                    display: block; 
                    margin: 0 auto; 
                    @{img == '' ? 'display:none;' : ''};"
                border="0">
        </div>
        <div style="text-align: center; @{wikitext == '' ? 'display:none;' : ''}">
            <span style="font-size: 16px; color: #000000;">@wikitext</span>
        </div>
    </div>
    """
    
    hover_tool_internal = HoverTool(
        tooltips=TOOLTIPS,
        renderers=[internal_text_renderer, leaf_text_renderer],
        point_policy="follow_mouse"
    )
    
    hover_tool_manual = HoverTool(
        tooltips=TOOLTIPS,
        renderers=[manual_with_hover_renderer],
        point_policy="follow_mouse"
    )
    
    p.add_tools(hover_tool_internal, hover_tool_manual)
    
    # Tap handler: opens the 'link' of the first selected label of each group
    # in a new tab, then clears every selection.
    callback = CustomJS(args=dict(
        source_internal=labels_source_internal, 
        source_leaf=labels_source_leaf,
        source_manual_with_hover=labels_manual_with_hover_source,
        source_manual_without_hover=labels_manual_without_hover_source
    ), code="""
        const selected_internal = source_internal.selected.indices;
        const selected_leaf = source_leaf.selected.indices;
        const selected_manual_with = source_manual_with_hover.selected.indices;
        const selected_manual_without = source_manual_without_hover.selected.indices;

        if (selected_internal.length > 0) {
            const index = selected_internal[0];
            const url = source_internal.data['link'][index];
            if (url) {
                window.open(url, "_blank");
            }
        }
        if (selected_leaf.length > 0) {
            const index = selected_leaf[0];
            const url = source_leaf.data['link'][index];
            if (url) {
                window.open(url, "_blank");
            }
        }
        if (selected_manual_with.length > 0) {
            const index = selected_manual_with[0];
            const url = source_manual_with_hover.data['link'][index];
            if (url) {
                window.open(url, "_blank");
            }
        }
        if (selected_manual_without.length > 0) {
            const index = selected_manual_without[0];
            const url = source_manual_without_hover.data['link'][index];
            if (url) {
                window.open(url, "_blank");
            }
        }

        // Clear selection
        source_internal.selected.indices = [];
        source_leaf.selected.indices = [];
        source_manual_with_hover.selected.indices = [];
        source_manual_without_hover.selected.indices = [];
    """)
    
    tap_tool = TapTool(callback=callback, renderers=[
        internal_text_renderer, 
        leaf_text_renderer, 
        manual_with_hover_renderer, 
        manual_without_hover_renderer
    ])
    p.add_tools(tap_tool)
    

    # Width of the initial view; the zoom callback scales line widths
    # relative to it.
    initial_x_range = p.x_range.end - p.x_range.start

    # Range-change handler. Tree labels are hidden (font size '0px') while the
    # visible x span exceeds 35 - 3 * depth, manual labels while it exceeds 35.
    # Line widths are multiplied by initial_x_range / x_range, clamped to
    # [0.1, 1].
    zoom_callback = CustomJS(args=dict(
    labels_source_internal=labels_source_internal,
    labels_source_leaf=labels_source_leaf,
    labels_manual_with_hover_source=labels_manual_with_hover_source,
    labels_manual_without_hover_source=labels_manual_without_hover_source,
    lines_source=lines_source,
    plot=p,
    initial_x_range=initial_x_range
), code="""
    const x_range = plot.x_range.end - plot.x_range.start;

    // Scale Text Sizes for Internal Nodes
    const labels_data_internal = labels_source_internal.data;
    const depths_internal = labels_data_internal['depth'];
    const n_labels_internal = depths_internal.length;
    for (let i = 0; i < n_labels_internal; i++) {
        const depth = depths_internal[i];
        const threshold = 35 - depth * 3;
        labels_data_internal['text_size'][i] = (x_range > threshold) ? '0px' : labels_data_internal['original_text_size'][i];
    }
    labels_source_internal.change.emit();

    // Scale Text Sizes for Leaf Nodes
    const labels_data_leaf = labels_source_leaf.data;
    const depths_leaf = labels_data_leaf['depth'];
    const n_labels_leaf = depths_leaf.length;
    for (let i = 0; i < n_labels_leaf; i++) {
        const depth = depths_leaf[i];
        const threshold = 35 - depth * 3;
        labels_data_leaf['text_size'][i] = (x_range > threshold) ? '0px' : labels_data_leaf['original_text_size'][i];
    }
    labels_source_leaf.change.emit();

    // Scale Text Sizes for Manual Nodes with Hover
    const labels_manual_with = labels_manual_with_hover_source.data;
    const depths_manual_with = labels_manual_with['depth'];
    const n_manual_with = depths_manual_with.length;
    for (let i = 0; i < n_manual_with; i++) {
        const depth = depths_manual_with[i];
        const threshold = 35;
        labels_manual_with['text_size'][i] = (x_range > threshold) ? '0px' : labels_manual_with['original_text_size'][i];
    }
    labels_manual_with_hover_source.change.emit();

    // Scale Text Sizes for Manual Nodes without Hover
    const labels_manual_without = labels_manual_without_hover_source.data;
    const depths_manual_without = labels_manual_without['depth'];
    const n_manual_without = depths_manual_without.length;
    for (let i = 0; i < n_manual_without; i++) {
        const depth = depths_manual_without[i];
        const threshold = 35;
        labels_manual_without['text_size'][i] = (x_range > threshold) ? '0px' : labels_manual_without['original_text_size'][i];
    }
    labels_manual_without_hover_source.change.emit();

    // Scale Line Widths
    const lines_data = lines_source.data;
    const line_widths = lines_data['line_widths'];
    const original_line_widths = lines_data['original_line_widths'];
    const n_lines = line_widths.length;

    // Calculate scaling factor
    const scaling_factor = initial_x_range / x_range;
    const min_scaling_factor = 0.1;
    const max_scaling_factor = 1;

    // Apply scaling factor to line widths
    const line_scaling_factor = Math.max(min_scaling_factor, Math.min(max_scaling_factor, scaling_factor));
    for (let i = 0; i < n_lines; i++) {
        line_widths[i] = original_line_widths[i] * line_scaling_factor;
    }
    lines_source.change.emit();
""")

    # The same callback is registered on both ends of both axis ranges.
    p.x_range.js_on_change('start', zoom_callback)
    p.x_range.js_on_change('end', zoom_callback)
    p.y_range.js_on_change('start', zoom_callback)
    p.y_range.js_on_change('end', zoom_callback)

    p.grid.visible = False
    p.axis.visible = False
    
    wheel_zoom_tool = WheelZoomTool()
    reset_tool = ResetTool()
    pan_tool = PanTool()
    
    # This assignment replaces the toolbar's tool list, so the tools added
    # earlier with add_tools() are listed again here.
    p.toolbar.tools = [
        pan_tool, 
        wheel_zoom_tool, 
        tap_tool, 
        hover_tool_internal, 
        hover_tool_manual, 
        reset_tool
    ]
    # p.toolbar.active_scroll = wheel_zoom_tool
    p.toolbar.active_drag = pan_tool
    p.toolbar.logo = None
    
    # Extra hand-drawn segments, each [(x0, y0), (x1, y1)].
    # NOTE(review): these come from the module-level lists; the
    # `manual_lines` argument is ignored.
    manual_lines_source = ColumnDataSource(data=dict(
        xs=[[line[0][0], line[1][0]] for line in horizontal_lines_manual + vertical_lines_manual],
        ys=[[line[0][1], line[1][1]] for line in horizontal_lines_manual + vertical_lines_manual],
        line_widths=[3 * (max_depth - line[0][0]) + 2 for line in horizontal_lines_manual + vertical_lines_manual],
        original_line_widths=[3 * (max_depth - line[0][0]) + 2 for line in horizontal_lines_manual + vertical_lines_manual]
    ))
    p.multi_line(xs='xs', ys='ys', source=manual_lines_source, line_color="white", line_width='line_widths')
    
    output_file(output_plot_filename, title=tab_title)
    show(p)

def draw_tree():
    """Load the scraped data, build the taxonomy tree and plot it.

    Reads the species records from ``original_data_filename`` and the
    per-taxon records from ``scraped_data_filename`` (both JSON lines),
    builds the tree rooted at ``root_taxon_name``, lays it out and passes it
    to ``draw_tree_bokeh``. Prints a message and returns early when no tree
    can be built. Returns ``None``.
    """
    output_notebook()

    def species_record(row):
        # Payload attached to each species leaf.
        return {
            'species_name': extract_species(row['species']),
            'title': row['title'],
            'link': row['link'],
            'img_link': row['img_link'],
            'text': row['text']
        }

    species_df = pd.read_json(original_data_filename, orient='records', lines=True)
    # Drop rows without a species and repeats of the same species/title pair.
    species_df.dropna(subset=['species'], inplace=True)
    species_df.drop_duplicates(subset=['species', 'title'], inplace=True)
    species_df.fillna('', inplace=True)
    species_df['img_link'] = species_df['img_link'].apply(filter_img_links)
    species_df['species_info'] = species_df.apply(species_record, axis=1)

    taxa_df = pd.read_json(scraped_data_filename, orient='records', lines=True)
    taxa_df.fillna('', inplace=True)

    start_level = rankings.index(root_taxon_rank)
    root_node = make_tree(root_taxon_name, start_level, species_df, taxa_df)
    if not root_node:
        print("No valid branches found leading to species.")
        return

    # Layout: leaf counts first, then x/y coordinates.
    calculate_subtree_sizes(root_node)
    assign_coordinates(root_node)

    all_nodes, all_edges = [], []
    collect_nodes_edges(root_node, all_nodes, all_edges)

    draw_tree_bokeh(all_nodes, all_edges, horizontal_lines_manual, nodes_manual)

# Build and display the plot when the cell runs.
draw_tree()


# Drawing home page

Yes, I drew this manually.


In [None]:
import pandas as pd
from bokeh.models import ColumnDataSource, HoverTool, TapTool, CustomJS, WheelZoomTool, ResetTool, PanTool
from bokeh.plotting import figure, show, output_file

# Settings and hand-written geometry for the home page plot.
placeholder_image = "https://via.placeholder.com/150"
# Vertical lift of "node" labels: constant part and per-level part.
y_offset_internal = 0.3
y_offset_multiplier = 0.125
# Extra font size (px) per level above the deepest x.
text_multiplier = 4
default_min_text_size = 14
# NOTE(review): presumably the figure size in pixels; confirm where these are used.
plot_width = 1920
plot_height = 900

# Line segments, each [(x0, y0), (x1, y1)].
horizontal_lines = [
    [(0, 0), (1, 0)],
    [(1, -2.75), (2, -2.75)],
    [(1, 2.75), (2, 2.75)],
    [(2, 2.75), (3, 2.75)],
    [(2, -2.75), (3, -2.75)],
    [(3, 6.75), (3.5, 6.75)],
    [(3, 5.75), (3.5, 5.75)],
    [(3, 4.75), (3.5, 4.75)],
    [(3, 3.75), (3.5, 3.75)],
    [(3, 2.75), (3.5, 2.75)],
    [(3, 1.75), (3.5, 1.75)],
    [(3, 0.75), (3.5, 0.75)],
    [(3, -0.25), (3.5, -0.25)],
    [(3, -1.25), (3.5, -1.25)],
    [(3, -2.25), (3.5, -2.25)],
    [(3, -3.25), (3.5, -3.25)],
]

vertical_lines = [
    [(1, -2.75), (1, 2.75)],
    [(3, -1.25), (3, 6.75)],
    [(3, -2.25), (3, -3.25)],
]

# kochjar.me/spacer actually useful???
# Raw string: the art contains backslashes followed by spaces, which are
# invalid escape sequences in a normal literal (a warning on current
# Python). The string value is unchanged.
squirrel = r""" 

                                                             _ 
                                                     . - ' `   ` } 
                                     _ . / )       /               } 
                                 . ' o       \ \   |               } 
                                 ' . _ _ _ . ' ` . \ \         { ` 
                                 / ` \ \ _ /     ,   ` .         } 
                                 \ \ = '   . - '       _ ` \ \     { 
                                   ` ' ` ;               ` ,     } 
                                         _ \ \               ;     } 
                                       / _ _ ` ; . . . - ' - - ' 
 """


# Labels: [x, y, base_text_size, type, text, hover_img, hover_text, link].
# NOTE(review): the "forageable" link is the only site-internal link without
# a leading "/"; confirm whether that is intentional.
nodes = [
    [0.05, 12.5, 6, "edge", "Source Code", "", "", "https://github.com/jan-mate/taxo_trees"],
    [1.5, 12.5, 36, "edge", "Wikipedia Taxonomic Trees", "", "", ""],
    [1.5, 9.5, 18, "edge", "12 interactive wiki taxonomic trees. You can\n\
hover and click on all* words to explore! You can\nalso zoom and move the trees around, for more\noptions click the buttons in top-right corner.\n\
All the data is from Wikipedia, and it certaintly\ncontains errors!\nChromium is recommended for this.", "", "", ""],

    [0.5, 0, 15, "node", "taxonomic trees", "images/taxotree4.png", "Interactive Taxonomic trees generated from Wikipedia", "https://kochjar.me/trees"],
    [1.5, 2.75, 15, "node", "plantae", "images/forsthaven.jpg", "Forst Botanisk Have in Charlottenlund is my fav collection of plants, go there :)", "https://en.wikipedia.org/wiki/Plant"],
    [1.5, -2.75, 15, "node", "animalia", "images/redpanda.jpeg", "red pandas so much cooler than fake pandas. if u prefer fake pandas ur not invited to my birthday party", "https://en.wikipedia.org/wiki/Animal"],
    [2.5, 2.75, 15, "node", "edible", "images/plants.jpg", "btw you can click here and see a huge tree of all edible plants :)", "/edible_plants"],
    [2.5, -2.75, 15, "node", "danish", "images/dk.png", "The 3rd best Nordic country (🇩🇰🇸🇪🇳🇴🇮🇸🇫🇮🇪🇪🇬🇱🇫🇴🇷🇺🇰🇵🇲🇰)", "https://en.wikipedia.org/wiki/Denmark"],
    [3.52, 6.75, 15, "edge", "fruit", "images/squirrel_mango.jpg", "Unlike much foods, fruit evolved to taste good for us, and we evolved to like it. Therefore it makes sense the tastiest food is fruit.", "/fruit"],
    [3.52, 5.75, 15, "edge", "veggies", "images/squirrel_pumpkin.png", "My favorite thing about being adult is you can eat veggies every day! And no one will stop you!", "/veggies"],
    [3.52, 4.75, 15, "edge", "nuts", "images/squirrel_fishing.png", "Top 10 nuts:\n10. Ginkgo nuts\n9.Chestnuts \n8. Pecan \n7. Hazelnut \n6. Cashew \n5. Brazil Nut\n4\n.Coconut \n3. Peanut \n2. Almond\n1. Walnut\n Honerable mention. cum", "/nuts"],
    [3.52, 3.75, 15, "edge", "(pseudo)cereals", "images/squirrel_cereal.jpg", "Just add milk and you have soup", "/cereals"],
    [3.52, 2.75, 15, "edge", "herbs ⋀ spices", "images/squirrel_chili.webp", "food must have been really bland back then since commited genocide to get spices", "/herbs_spices"],
    [3.52, 1.75, 15, "edge", "flowers", "images/flower_squirrel.jpg", "don't forget to eat the flowers", "/flowers"],
    [3.52, 0.75, 15, "edge", "forageable", "images/skeleton.jpg", "when you use this as ur foraging guide", "forageable"],
    [3.52, -0.25, 15, "edge", "roots", "images/squareroot.jpg", "√", "/roots"],
    [3.52, -1.25, 15, "edge", "leaf", "images/squirrel_eat_leaf.jpg", "i love spinach but like its much expensif and then u cook it, and it always leafs you wondering where 99% of it went :(", "/leaf"],
    [3.52, -2.25, 15, "edge", "dinosaurs", "images/goose.jpg", "The birds of Denmark", "/dkbirds"],
    [3.52, -3.25, 15, "edge", "titty animals", "images/squirrelske.jpg", "The mammals of Denmark", "/dkmammals"],

    [-15, 15, -60, "node", squirrel, "images/bobo.jpg", "ascii art by Joan G. Stark", "https://www.youtube.com/watch?v=1CIMGTO6aFc"],
]


def _build_label_source(label_rows, max_depth):
    """Build the ColumnDataSource backing one group of text labels.

    Args:
        label_rows: DataFrame slice with the columns ``x``, ``y``,
            ``base_text_size``, ``type``, ``text``, ``hover_img``,
            ``hover_text`` and ``link``.
        max_depth: Largest x coordinate of any line; font size and vertical
            offset both scale with ``max_depth - x``.

    Returns:
        A ColumnDataSource holding the label columns plus ``original_*``
        columns that the zoom callback reads to restore values.

    Reads the module-level ``text_multiplier``, ``y_offset_multiplier`` and
    ``y_offset_internal`` settings.
    """
    text_sizes = [
        f"{base_size + (max_depth - x) * text_multiplier}px"
        for x, base_size in zip(label_rows['x'], label_rows['base_text_size'])
    ]
    # Only "node" labels are shifted vertically; every other type sits
    # exactly on its y coordinate.
    y_offsets = [
        (max_depth - x) * y_offset_multiplier + y_offset_internal if kind == "node" else 0
        for x, kind in zip(label_rows['x'], label_rows['type'])
    ]
    text_align = label_rows['type'].apply(lambda kind: 'left' if kind == "edge" else 'center')

    return ColumnDataSource(data=dict(
        x=label_rows['x'],
        y=label_rows['y'] + y_offsets,
        base_y=label_rows['y'],
        text=label_rows['text'],
        hover_img=label_rows['hover_img'],
        hover_text=label_rows['hover_text'],
        link=label_rows['link'],
        depth=label_rows['x'],
        text_align=text_align,
        text_size=text_sizes,
        # Separate copy so the "original" column never aliases the column the
        # zoom callback overwrites (same treatment as original_line_widths).
        original_text_size=list(text_sizes),
        original_y_offset=y_offsets,
    ))


def draw_manual_tree(horizontal_lines, vertical_lines, nodes, output_plot_filename="homepage.html"):
    """Render a hand-laid-out tree as an interactive Bokeh HTML page.

    Args:
        horizontal_lines: Line segments; each item is indexed as
            ``line[0]`` / ``line[1]`` for its two ``(x, y)`` end points.
        vertical_lines: Further segments in the same format.
        nodes: Rows of ``[x, y, base_text_size, type, text, hover_img,
            hover_text, link]``. ``type`` of ``"node"`` gets a vertical
            offset and centred text; ``"edge"`` is left-aligned.
        output_plot_filename: File name handed to ``output_file``.

    Raises:
        ValueError: If both line lists are empty, because the plot depth is
            derived from the line coordinates.

    Labels with a hover image or hover text get a tooltip and are clickable
    (their ``link`` opens in a new tab); the remaining labels are plain text.
    Calls ``output_file`` and ``show``, and returns nothing.
    """
    nodes_df = pd.DataFrame(nodes, columns=['x', 'y', 'base_text_size', 'type', 'text', 'hover_img', 'hover_text', 'link'])

    all_lines = horizontal_lines + vertical_lines
    if not all_lines:
        raise ValueError("draw_manual_tree needs at least one line to derive the plot depth")
    max_depth = max(max(line[0][0], line[1][0]) for line in all_lines)

    xs_lines = [[line[0][0], line[1][0]] for line in all_lines]
    ys_lines = [[line[0][1], line[1][1]] for line in all_lines]
    # Lines that start at a smaller x are drawn thicker.
    line_widths = [3 * (max_depth - line[0][0]) + 2 for line in all_lines]

    lines_source = ColumnDataSource(data=dict(
        xs=xs_lines,
        ys=ys_lines,
        line_widths=line_widths,
        original_line_widths=line_widths.copy()
    ))

    # A label counts as "with hover" when it has an image or a text to show.
    has_hover = (nodes_df['hover_img'] != "") | (nodes_df['hover_text'] != "")
    labels_with_hover_source = _build_label_source(nodes_df[has_hover], max_depth)
    labels_without_hover_source = _build_label_source(nodes_df[~has_hover], max_depth)

    p = figure(title="",
        background_fill_color='black',
        border_fill_color='black',
        outline_line_color='black',
        sizing_mode="stretch_both",
        x_range=(0, max_depth + 0.35), y_range=(-5, 13))

    p.multi_line(xs='xs', ys='ys', source=lines_source, line_color="white", line_width='line_widths')

    text_renderer_with_hover = p.text(
        x='x', y='y', text='text', source=labels_with_hover_source,
        text_color='white', text_font_size='text_size', text_baseline='middle', text_align='text_align'
    )

    text_renderer_without_hover = p.text(
        x='x', y='y', text='text', source=labels_without_hover_source,
        text_color='white', text_font_size='text_size', text_baseline='middle', text_align='text_align'
    )

    # Tap handler: open the tapped label's link in a new tab. Links that are
    # not absolute http(s) URLs are resolved against the current page's
    # directory. The selection is cleared afterwards.
    callback = CustomJS(args=dict(source=labels_with_hover_source), code="""
        const selected_index = source.selected.indices[0];
        if (selected_index != null) {
            const url = source.data['link'][selected_index];
            if (url) {
                let newUrl;
                if (url.startsWith('http://') || url.startsWith('https://')) {
                    newUrl = url;
                } else {
                    const baseUrl = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
                    newUrl = new URL(url, baseUrl).href;
                }
                window.open(newUrl, "_blank");
            }
        }
        source.selected.indices = [];
    """)

    tap_tool = TapTool(callback=callback, renderers=[text_renderer_with_hover])
    p.add_tools(tap_tool)

    # Tooltip template: the hover image above the hover text.
    TOOLTIPS = """
    <div style="width: 350px; background-color: rgba(221, 221, 221, 1); padding: 10px;">
        <div style="text-align: center;">
            <img src="@hover_img" alt=""
                style="display: block; margin: 0 auto; max-height: 300px; max-width: 300px; height: auto; width: auto;" 
                border="0">
        </div>
        <div style="text-align: center; margin-top: 15px;">
            <span style="font-size: 16px; color: #000000;">@hover_text</span>
        </div>
    </div>
    """

    hover_tool = HoverTool(tooltips=TOOLTIPS, renderers=[text_renderer_with_hover], point_policy="follow_mouse")
    p.add_tools(hover_tool)

    # Range-change handler: hides labels (font size '0px') once the visible
    # x span exceeds a threshold (35 - depth * 3 for hover labels, 5 for the
    # others), restores them otherwise, and rescales the line widths by
    # 800 / x_span clamped to [0.1, 1].
    zoom_callback = CustomJS(args=dict(
        labels_with_hover_source=labels_with_hover_source,
        labels_without_hover_source=labels_without_hover_source,
        lines_source=lines_source,
        plot=p
    ), code="""
        const x_range = plot.x_range.end - plot.x_range.start;
        

        const labels_with = labels_with_hover_source.data;
        const depths_with = labels_with['depth'];
        const n_labels_with = depths_with.length;
        for (let i = 0; i < n_labels_with; i++) {
            const depth = depths_with[i];
            const threshold = 35 - depth * 3;
            let text_size;
            if (x_range > threshold) {
                text_size = '0px';
            } else {
                text_size = labels_with['original_text_size'][i];
            }
            labels_with['text_size'][i] = text_size;
            labels_with['y'][i] = labels_with['base_y'][i] + labels_with['original_y_offset'][i];
        }
        labels_with_hover_source.change.emit();
        
        
        const labels_without = labels_without_hover_source.data;
        const depths_without = labels_without['depth'];
        const n_labels_without = depths_without.length;
        for (let i = 0; i < n_labels_without; i++) {
            const depth = depths_without[i];
            const threshold = 5;
            let text_size;
            if (x_range > threshold) {
                text_size = '0px';
            } else {
                text_size = labels_without['original_text_size'][i];
            }
            labels_without['text_size'][i] = text_size;
            labels_without['y'][i] = labels_without['base_y'][i] + labels_without['original_y_offset'][i];
        }
        labels_without_hover_source.change.emit();
    
        
        const lines_data = lines_source.data;
        const line_widths = lines_data['line_widths'];
        const original_line_widths = lines_data['original_line_widths'];
        const n_lines = line_widths.length;
        const scaling_factor = Math.max(0.1, Math.min(1, 800 / x_range));
        for (let i = 0; i < n_lines; i++) {
            const original_width = original_line_widths[i];
            let new_width = original_width * scaling_factor;
            line_widths[i] = new_width;
        }
        lines_source.change.emit();
    """)

    # Re-run the zoom logic whenever either axis range moves.
    p.x_range.js_on_change('start', zoom_callback)
    p.x_range.js_on_change('end', zoom_callback)
    p.y_range.js_on_change('start', zoom_callback)
    p.y_range.js_on_change('end', zoom_callback)

    p.grid.visible = False
    p.axis.visible = False

    wheel_zoom_tool = WheelZoomTool()
    reset_tool = ResetTool()
    pan_tool = PanTool()

    # Replace the toolbar contents with exactly this tool set.
    p.toolbar.tools = [pan_tool, wheel_zoom_tool, tap_tool, hover_tool, reset_tool]

    p.toolbar.logo = None
    p.toolbar.active_scroll = wheel_zoom_tool
    p.toolbar.active_drag = pan_tool

    output_file(output_plot_filename)
    show(p)

# Render the tree from the module-level `horizontal_lines`, `vertical_lines`
# and `nodes`, using the default output file name ("homepage.html").
draw_manual_tree(horizontal_lines, vertical_lines, nodes)


# Combining JSON files

In [None]:
import pandas as pd
import os

def _scraped_path(path):
    """Return *path* with ``_scraped`` inserted before its final extension."""
    # splitext only touches the last extension, so a '.json' that appears in
    # a directory name or earlier in the file name is left alone.
    root, ext = os.path.splitext(path)
    return f"{root}_scraped{ext}"


def _load_json_lines(paths, found_message, missing_message):
    """Read every existing JSON-lines file in *paths* into one DataFrame.

    Prints ``found_message`` or ``missing_message`` followed by the path for
    each entry. Returns an empty DataFrame when no file exists.
    """
    frames = []
    for path in paths:
        if os.path.isfile(path):
            print(f"{found_message}: {path}")
            frames.append(pd.read_json(path, orient='records', lines=True))
        else:
            print(f"{missing_message}: {path}")
    # One concat at the end avoids re-copying the accumulated frame on every
    # iteration and avoids concatenating onto an empty seed frame.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


def _stringify_nested_columns(df):
    """Replace dict/list cells in *df* with their ``str()`` form, in place."""
    for column in df.columns:
        is_nested = df[column].apply(lambda value: isinstance(value, (dict, list)))
        if is_nested.any():
            # NOTE: str() yields the Python literal form (e.g. single quotes),
            # not JSON; kept as-is so the output format does not change.
            df[column] = df[column].apply(
                lambda value: str(value) if isinstance(value, (dict, list)) else value
            )


def combine_jsons(json_files, output_main_file, output_scraped_file):
    """Merge several JSON-lines files, and their ``_scraped`` companions.

    Args:
        json_files: Paths of the main JSON-lines files. For each one, a
            companion with ``_scraped`` before the extension is looked up
            (``foo.json`` -> ``foo_scraped.json``).
        output_main_file: Destination for the combined main records.
        output_scraped_file: Destination for the combined scraped records.

    Missing files are reported with ``print`` and skipped. Dict and list
    cells are converted to strings so that rows can be de-duplicated, then
    exact duplicate rows are dropped and both results are written as
    JSON lines. Returns nothing.
    """
    combined_main_df = _load_json_lines(
        json_files, "Loading main file", "File not found"
    )
    combined_scraped_df = _load_json_lines(
        [_scraped_path(path) for path in json_files],
        "Loading scraped file",
        "Scraped file not found",
    )

    for df in (combined_main_df, combined_scraped_df):
        # drop_duplicates needs hashable cells; dicts and lists are not.
        _stringify_nested_columns(df)
        df.drop_duplicates(inplace=True)

    combined_main_df.to_json(output_main_file, orient='records', lines=True)
    combined_scraped_df.to_json(output_scraped_file, orient='records', lines=True)

    print(f"Combined main JSON saved to: {output_main_file}")
    print(f"Combined scraped JSON saved to: {output_scraped_file}")

# Inputs for the combined "edible plants" dataset, listed by file stem.
# The two Denmark lists stay commented out, so they are not part of it.
json_files_to_combine = [
    f"{stem}.json"
    for stem in (
        'List_of_culinary_fruits',
        # 'List_of_birds_of_Denmark',
        # 'List_of_mammals_of_Denmark',
        'List_of_vegetables',
        'List_of_leaf_vegetables',
        'List_of_culinary_nuts',
        'Plants_used_as_herbs_or_spices',
        'List_of_edible_seeds',
        'List_of_edible_flowers',
        'List_of_forageable_plants',
    )
]

# Destinations for the merged main records and the merged scraped records.
output_main_filename, output_scraped_filename = (
    'edible_plants.json',
    'edible_plants_scraped.json',
)

combine_jsons(
    json_files=json_files_to_combine,
    output_main_file=output_main_filename,
    output_scraped_file=output_scraped_filename,
)
