In [18]:
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


from bs4 import BeautifulSoup
from tqdm import tqdm

# In[ ]:

import time
import requests
import pandas as pd
# import required module
import unidecode
# In[ ]:


# Scrape the ten listing pages and collect one [author, text, tags] row per quote.
result = []
for i in range(1, 11):
    # Without a timeout, requests waits indefinitely on a stalled connection.
    html = requests.get(f"http://quotes.toscrape.com/page/{i}", timeout=10)
    # Surface HTTP errors here instead of silently collecting zero quotes for the page.
    html.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html_tree = BeautifulSoup(html.text, "html.parser")
    authors_blocks = html_tree.find_all("div", class_="quote")
    for author_block in authors_blocks:
        text = author_block.find("span", attrs={"class": "text"}).text
        author = author_block.find("small", attrs={"class": "author"}).text
        # Tags are stored as a single space-separated string.
        tags = " ".join(tag.text for tag in author_block.find_all("a", attrs={"class": "tag"}))
        result.append([author, text, tags])

In [20]:
# Collect biography details for every distinct author found in `result`.
# dict.fromkeys de-duplicates while keeping first-seen order, so the frame has
# the same row order on every run (iterating a set of strings does not).
authors_names = list(dict.fromkeys(item[0] for item in result))
result2 = []

# One Session reuses the underlying connection across the per-author requests.
with requests.Session() as session:
    for name in tqdm(authors_names):
        # Build the URL slug: dots become word breaks, unidecode replaces
        # non-ASCII characters with ASCII approximations, words are hyphen-joined.
        new_name = name.replace(".", " ")
        new_name = unidecode.unidecode(new_name)
        html = session.get(
            f"http://quotes.toscrape.com/author/{'-'.join(new_name.split())}",
            timeout=10,  # avoid hanging forever on a stalled connection
        )
        # Report a bad URL as an HTTP error rather than an AttributeError
        # on a missing tag a few lines below.
        html.raise_for_status()
        html_tree = BeautifulSoup(html.text, "html.parser")
        birthday = html_tree.find("span", attrs={"class": "author-born-date"}).text
        location = html_tree.find("span", attrs={"class": "author-born-location"}).text
        description = html_tree.find("div", attrs={"class": "author-description"}).text
        result2.append([name, birthday, location, description])
df = pd.DataFrame(result2, columns=["name", "birthday", "location", "description"])

100%|██████████| 50/50 [00:17<00:00,  2.86it/s]


Below you find the asyncio implementation, which is much faster than the standard synchronous implementation.

In [21]:
! pip install aiohttp # install aiohttp if you haven't already

Defaulting to user installation because normal site-packages is not writeable


In [16]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from tqdm import tqdm
import unidecode
# Distinct author names taken from the quote rows in `result`.
# dict.fromkeys de-duplicates while keeping first-seen order, so the order is
# the same on every run (iterating a set of strings is not).
authors_names = list(dict.fromkeys(item[0] for item in result))
# Filled with one [name, birthday, location, description] row per author.
result2 = []

async def fetch_author_info(session, name):
    """Fetch one author's biography page and extract its key fields.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the GET request.
        name: Author name; it is converted to the URL slug of the author page.

    Returns:
        ``[name, birthday, location, description]``, where the last three are
        the text of the page's ``author-born-date``, ``author-born-location``
        and ``author-description`` elements.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error status.
    """
    # Slug rule: dots become word breaks, unidecode replaces non-ASCII
    # characters with ASCII approximations, and words are hyphen-joined.
    slug = "-".join(unidecode.unidecode(name.replace(".", " ")).split())
    async with session.get(f"http://quotes.toscrape.com/author/{slug}") as response:
        # Fail with the HTTP status instead of an AttributeError on a missing tag below.
        response.raise_for_status()
        html = await response.text()

    # Parsing only needs the downloaded text, so it happens after the response is closed.
    html_tree = BeautifulSoup(html, "html.parser")
    birthday = html_tree.find("span", attrs={"class": "author-born-date"}).text
    location = html_tree.find("span", attrs={"class": "author-born-location"}).text
    description = html_tree.find("div", attrs={"class": "author-description"}).text
    return [name, birthday, location, description]


async with aiohttp.ClientSession() as session:
    tasks = [fetch_author_info(session, name) for name in tqdm(authors_names)]
    result2.extend(await asyncio.gather(*tasks))
df = pd.DataFrame(result2, columns=["name", "birthday", "location", "description"])    

100%|██████████| 50/50 [00:00<00:00, 537731.28it/s]
