To enhance efficiency, you can make some optimizations and improvements to your code. Here are some suggestions:

1. **Batch Requests**: Instead of sending individual HTTP requests for each language page, you can utilize batch processing or asynchronous requests to scrape multiple pages simultaneously, which can significantly reduce the overall execution time.

2. **Caching Mechanism**: Implement a caching mechanism to store previously scraped data, reducing the need to re-scrape pages that haven't changed. This can be especially useful if you anticipate running the script multiple times.

3. **Optimized XPath Queries**: Review the XPath queries used in your code and ensure they are as efficient as possible. Avoid overly broad queries that may unnecessarily traverse large portions of the HTML tree.

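4. **Minimize External Requests**: Avoid downloading the same page more than once — for example, the browse page is currently fetched separately for the language list and for the links to other pages, so one response could serve both. Also extract only the data you need from each page: if you only need the language status, query just that element instead of walking the entire document.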

5. **Error Handling**: Implement robust error handling to gracefully handle exceptions, such as network errors or unexpected HTML structures. This ensures your script continues to run smoothly even in adverse conditions.

6. **Parallel Processing**: Utilize parallel processing techniques, such as multi-threading or multiprocessing, to perform tasks concurrently and make better use of available system resources.

7. **Code Refactoring**: Refactor your code to eliminate redundancy and improve readability. Look for opportunities to modularize repetitive tasks into functions or classes.

Applying these optimizations should help improve the efficiency and performance of your code. If you'd like more specific guidance on implementing any of these suggestions, feel free to ask!

In [None]:
import requests
from lxml import html
from urllib.parse import urljoin

# gets the name of every language and their links
def parse_languages(url, timeout=30):
    """Fetch a browse page and return its language names with their links.

    Each name and its link are read from the same ``<a>`` element, so the
    two returned lists always have the same length and stay index-aligned.
    (Querying text nodes and ``href`` attributes separately, and filtering
    only the names, lets the lists drift apart whenever an anchor has empty
    or nested text.)

    Args:
        url: Address of the browse page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        A ``(languages, full_language_links)`` tuple of two lists, where
        ``full_language_links[i]`` is the absolute URL for ``languages[i]``.
    """
    response = requests.get(url, timeout=timeout)
    tree = html.fromstring(response.content)

    base_url = "https://www.ethnologue.com"  # Base URL of the website
    languages = []
    full_language_links = []

    # Anchors nested directly in <p> tags whose class contains `lang--`
    for anchor in tree.xpath('//p[contains(@class, "lang--")]/a'):
        name = anchor.text_content().strip()
        link = anchor.get('href')
        # Skip anchors that cannot produce a complete (name, link) pair
        if not name or link is None:
            continue
        languages.append(name)
        full_language_links.append(urljoin(base_url, link))

    return languages, full_language_links

# ensures every page is parsed so no language is missed
def find_links_to_other_pages(url, timeout=30):
    """Collect the URLs of the other browse pages linked from *url*.

    The page targets are taken from the ``onclick`` attribute of every
    ``<button class="tab__link">``: the first double-quoted value inside
    the attribute is appended to ``/browse/names/`` and resolved against
    *url*.

    Args:
        url: Address of the browse page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        A list of absolute URLs, one per usable button.
    """
    response = requests.get(url, timeout=timeout)
    tree = html.fromstring(response.content)

    # onclick handlers of the alphabet-letter buttons
    onclick_values = tree.xpath('//button[@class="tab__link"]/@onclick')

    full_links = []
    for onclick in onclick_values:
        if 'browse' not in onclick:
            continue
        pieces = onclick.split('"')
        # A handler without a double-quoted argument has nothing to extract;
        # skip it instead of failing with IndexError.
        if len(pieces) < 2:
            continue
        full_links.append(urljoin(url, f'/browse/names/{pieces[1]}'))

    return full_links

def main(start_url):
    """Crawl every browse page reachable from *start_url*.

    Pages are taken from a stack of pending URLs; each page is processed at
    most once. For every page the language names and links are collected
    and the links to further pages are pushed onto the stack.

    Returns:
        A ``(all_languages, all_full_language_links)`` tuple of lists.
    """
    seen = set()
    pending = [start_url]
    names = []
    urls = []  # Full language links gathered across all pages

    while pending:
        page_url = pending.pop()
        if page_url not in seen:
            seen.add(page_url)

            page_names, page_urls = parse_languages(page_url)
            names += page_names
            urls += page_urls

            # Queue the other pages linked from this one
            pending += find_links_to_other_pages(page_url)

    return names, urls

# Crawl the whole name index, starting from its first page
start_url = 'https://www.ethnologue.com/browse/names/'
language_names, language_urls = main(start_url)

# The same two lists, also available under their original names
languages, full_language_links = language_names, language_urls

KeyboardInterrupt: 

In [None]:
from bs4 import BeautifulSoup
import time

# Function to scrape information from individual language pages
def scrape_language_info(language_url):
    """Download one language page and gather its details into a dict.

    Possible keys are ``name``, ``iso_code``, ``summary``, ``population``
    and ``language_status``; a key is left out when its element is not on
    the page, except ``language_status`` which falls back to ``'Unknown'``.

    Returns:
        The dict of scraped values, or an empty dict when the request
        fails (the failure is printed).
    """
    try:
        page = requests.get(language_url)
        # Turn 4xx/5xx answers into an HTTPError handled below
        page.raise_for_status()

        soup = BeautifulSoup(page.content, 'html.parser')
        info = {}

        # Language name
        heading = soup.find('h1', class_='title__home')
        if heading is not None:
            info['name'] = heading.text.strip()

        # ISO 639 code
        iso_chip = soup.find('a', class_='chip chip--big')
        if iso_chip is not None:
            info['iso_code'] = iso_chip.text.strip()

        # Summary: first paragraph of the summary section
        summary_section = soup.find('section', id='summary')
        first_paragraph = None
        if summary_section is not None:
            first_paragraph = summary_section.find('p')
        if first_paragraph is not None:
            info['summary'] = first_paragraph.text.strip()

        # Population info
        sizes_item = soup.find('li', class_='population__sizes')
        population_graph = None
        if sizes_item is not None:
            population_graph = sizes_item.find('div', class_='graph__langpop')
        if population_graph is not None:
            info['population'] = population_graph.text.strip()

        # Language status (vitality): label of the first histogram entry
        # with a positive count, 'Unknown' otherwise
        info['language_status'] = 'Unknown'
        vitality_item = soup.find('li', class_='population__vitality')
        bars = []
        if vitality_item is not None:
            bars = vitality_item.find_all('li', class_='histogram__datum')
        for bar in bars:
            count = bar.attrs.get('data-count')
            if count is None or not count.isdigit() or int(count) <= 0:
                continue
            caption = bar.find('label')
            if caption is not None:
                info['language_status'] = caption.text.strip()
                break

        return info

    except requests.RequestException as e:
        print(f"Request failed for URL {language_url}: {e}")
        return {}

# Seconds to pause after each request
delay_between_requests = 3

# Pair every language name with the URL of its page
language_info_pairs = zip(language_names, language_urls)

# Scraped information, keyed by language name
language_info_dict = dict()

# Scrape the pages one at a time, pausing after each one
for language_name, language_url in language_info_pairs:
    language_info = scrape_language_info(language_url)
    language_info_dict.update({language_name: language_info})
    time.sleep(delay_between_requests)

### turning into JSON file for javascript to read

import json

# Name of the output file
file_name = "language_info.json"

# Serialize the scraped dictionary to a JSON string
json_data = json.dumps(language_info_dict)

# Store the JSON string in the output file
with open(file_name, "w") as json_file:
    print(json_data, end="", file=json_file)

# Report which file was created
print(f"JSON data has been saved to '{file_name}'.")


# **the whole thing with improvements, but this actually sucks lol**

In [None]:
import aiohttp
import asyncio
from aiohttp import ClientSession
from lxml import html
from urllib.parse import urljoin
import json
from cachetools import TTLCache, cached

# Module-level cache shared by the scraping code in this cell: configured
# for at most 1000 entries (maxsize) with a time-to-live of 3600 seconds
# (1 hour) per entry.
cache = TTLCache(maxsize=1000, ttl=3600)

# Optimized function to parse languages
async def parse_languages(session, url):
    """Fetch a browse page and return its language names with their links.

    Each name and its link are read from the same ``<a>`` element, so the
    two returned lists always have the same length and stay index-aligned.
    (Querying text nodes and ``href`` attributes separately, and filtering
    only the names, lets the lists drift apart whenever an anchor has empty
    or nested text.)

    Args:
        session: Open aiohttp session used for the request.
        url: Address of the browse page to download.

    Returns:
        A ``(languages, full_language_links)`` tuple of two lists, where
        ``full_language_links[i]`` is the absolute URL for ``languages[i]``.
    """
    async with session.get(url) as response:
        content = await response.text()

    # The body is fully read, so parsing can happen after the connection
    # has been handed back to the session.
    tree = html.fromstring(content)

    base_url = "https://www.ethnologue.com"
    languages = []
    full_language_links = []

    for anchor in tree.xpath('//p[contains(@class, "lang--")]/a'):
        name = anchor.text_content().strip()
        link = anchor.get('href')
        # Skip anchors that cannot produce a complete (name, link) pair
        if not name or link is None:
            continue
        languages.append(name)
        full_language_links.append(urljoin(base_url, link))

    return languages, full_language_links

# Optimized function to find links to other pages
async def find_links_to_other_pages(session, url):
    """Collect the URLs of the other browse pages linked from *url*.

    The page targets are taken from the ``onclick`` attribute of every
    ``<button class="tab__link">``: the first double-quoted value inside
    the attribute is appended to ``/browse/names/`` and resolved against
    *url*.

    Args:
        session: Open aiohttp session used for the request.
        url: Address of the browse page to download.

    Returns:
        A list of absolute URLs, one per usable button.
    """
    async with session.get(url) as response:
        content = await response.text()

    tree = html.fromstring(content)
    onclick_values = tree.xpath('//button[@class="tab__link"]/@onclick')

    full_links = []
    for onclick in onclick_values:
        if 'browse' not in onclick:
            continue
        pieces = onclick.split('"')
        # A handler without a double-quoted argument has nothing to extract;
        # skip it instead of failing with IndexError.
        if len(pieces) < 2:
            continue
        full_links.append(urljoin(url, f'/browse/names/{pieces[1]}'))

    return full_links

# Async function to scrape language information
async def scrape_language_info(session, language_url):
    """Download one language page and gather its details into a dict.

    Results are memoised by hand in the module-level ``cache``, keyed on the
    URL alone. Decorating a coroutine function with ``cachetools.cached``
    would store the coroutine object itself (which can only be awaited
    once) and would also make the session part of the cache key.

    Args:
        session: Open aiohttp session used for the request.
        language_url: Address of the language page.

    Returns:
        A dict that may contain ``summary``, ``population``, ``region``,
        ``dialects``, ``family`` and ``iso_code`` and always contains
        ``language_status`` (``'Unknown'`` when no histogram entry has a
        positive count). An empty dict is returned, and the error printed,
        when anything goes wrong; failures are not cached. A cached dict is
        returned as-is, so callers share the same object.
    """
    cached_data = cache.get(language_url)
    if cached_data is not None:
        return cached_data

    try:
        async with session.get(language_url) as response:
            content = await response.text()

        tree = html.fromstring(content)
        data = {}

        # Extracting summary
        summary_element = tree.xpath('//section[contains(@class, "summary")]//p')
        if summary_element:
            data['summary'] = summary_element[0].text_content().strip()

        # Extracting population
        population_element = tree.xpath(
            '//li[contains(@class, "population__sizes")]'
            '//div[contains(@class, "graph__langpop")]'
        )
        if population_element:
            data['population'] = population_element[0].text_content().strip()

        # Extracting language status: label of the first histogram entry
        # with a positive count
        data['language_status'] = 'Unknown'
        vitality_element = tree.xpath(
            '//li[contains(@class, "population__vitality")]'
            '//li[contains(@class, "histogram__datum") and @data-count]'
        )
        for status in vitality_element:
            count = status.get('data-count', '').strip()
            # A non-numeric count is skipped; calling int() on it would
            # raise and throw away every field scraped so far.
            if not count.isdigit() or int(count) <= 0:
                continue
            label = status.xpath('.//label')
            if label:
                data['language_status'] = label[0].text_content().strip()
                break

        # Example: Extracting region
        region_element = tree.xpath('//section[contains(@class, "region")]//p')
        if region_element:
            data['region'] = region_element[0].text_content().strip()

        # Example: Extracting dialects
        dialects_element = tree.xpath('//section[contains(@class, "dialects")]//li')
        if dialects_element:
            data['dialects'] = [dialect.text_content().strip() for dialect in dialects_element]

        # Example: Extracting language family
        family_element = tree.xpath('//section[contains(@class, "family")]//p')
        if family_element:
            data['family'] = family_element[0].text_content().strip()

        # Example: Extracting ISO code
        iso_element = tree.xpath('//section[contains(@class, "iso")]//p')
        if iso_element:
            data['iso_code'] = iso_element[0].text_content().strip()
    except Exception as e:
        print(f"Error scraping {language_url}: {e}")
        return {}

    cache[language_url] = data
    return data

# Main function to manage the crawling process
async def main(start_url, max_concurrency=10):
    """Crawl the browse pages, scrape every language page, and save as JSON.

    The browse pages are walked one after another, starting at *start_url*.
    The language pages found are then scraped concurrently, but never more
    than *max_concurrency* at a time, so thousands of requests are not
    launched at the same moment. The result is written to
    ``language_data.json``.

    Args:
        start_url: First browse page to visit.
        max_concurrency: Upper bound on language pages scraped in parallel.

    Returns:
        The JSON text that was written to the file.
    """
    visited = set()
    to_visit = [start_url]
    all_languages = []
    all_full_language_links = []

    async with ClientSession() as session:
        while to_visit:
            current_url = to_visit.pop()
            if current_url in visited:
                continue

            visited.add(current_url)
            languages, full_language_links = await parse_languages(session, current_url)
            all_languages.extend(languages)
            all_full_language_links.extend(full_language_links)

            links = await find_links_to_other_pages(session, current_url)
            to_visit.extend(links)

        limiter = asyncio.Semaphore(max_concurrency)

        async def scrape_with_limit(url):
            # Hold a semaphore slot for the duration of one page scrape
            async with limiter:
                return await scrape_language_info(session, url)

        # gather() keeps the results in the order of the input URLs
        language_info_list = await asyncio.gather(
            *(scrape_with_limit(url) for url in all_full_language_links)
        )

    # Combine language names with their respective information
    language_info_dict = dict(zip(all_languages, language_info_list))

    # Convert the dictionary to JSON
    json_data = json.dumps(language_info_dict, indent=2)

    # Save JSON data to file
    with open('language_data.json', 'w') as json_file:
        json_file.write(json_data)

    return json_data

# Helper function to run the asyncio event loop in a Jupyter notebook
def run_asyncio_coroutine(coroutine):
    """Run *coroutine* whether or not an event loop is already running.

    Args:
        coroutine: The coroutine object to execute.

    Returns:
        When called while an event loop is running in this thread (as in a
        Jupyter notebook), the coroutine is scheduled on that loop and the
        resulting ``asyncio.Task`` is returned. Otherwise the coroutine is
        run to completion and its result is returned.
    """
    try:
        # Raises RuntimeError when no loop is running in this thread. Unlike
        # get_event_loop(), it never creates a loop implicitly.
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running loop: safe to start a fresh one and block on it
        return asyncio.run(coroutine)
    return loop.create_task(coroutine)

# Entry point for the script
if __name__ == '__main__':
    start_url = 'https://www.ethnologue.com/browse/names'
    task = run_asyncio_coroutine(main(start_url))
    if not isinstance(task, asyncio.Task):
        # The coroutine already ran to completion; `task` holds its result
        print(task)
    else:
        # Still pending on the running loop: print the result once it is done
        task.add_done_callback(lambda finished: print(finished.result()))

RuntimeError: asyncio.run() cannot be called from a running event loop