In [35]:
from threading import Thread
import time
import requests
import concurrent.futures
import json
import sys

In [None]:

def get_wiki_page_existence(wiki_page_url, timeout=10):
    """Report whether a wiki page exists, judged by the HTTP status of a GET.

    Args:
        wiki_page_url: Full URL of the page to probe.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (default 10).

    Returns:
        The URL followed by ``" - "`` and one of ``"exists"`` (HTTP 200),
        ``"does not exist"`` (HTTP 404) or ``"unknown"`` (any other status).
    """
    # Only these two status codes are interpreted; anything else is "unknown".
    status_labels = {200: "exists", 404: "does not exist"}

    reply = requests.get(url=wiki_page_url, timeout=timeout)
    verdict = status_labels.get(reply.status_code, "unknown")
    return wiki_page_url + " - " + verdict

In [None]:
# Smoke test: probe one known article before running the batch comparisons.
url = "https://en.wikipedia.org/wiki/Ocean"
print(get_wiki_page_existence(url))

In [None]:
# Sequential baseline: probe the 50 numbered article URLs one after another.
wiki_page_urls = ["https://en.wikipedia.org/wiki/" + str(i) for i in range(50)]

print("Running without threads:")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
without_threads_start = time.perf_counter()
for url in wiki_page_urls:
    print(get_wiki_page_existence(wiki_page_url=url))
print("Without threads time:", time.perf_counter() - without_threads_start)

In [None]:
# Threaded run: submit every probe to a thread pool and print results as they finish.
wiki_page_urls = ["https://en.wikipedia.org/wiki/" + str(i) for i in range(50)]

print("Running threaded:")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
threaded_start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(get_wiki_page_existence, wiki_page_url=url)
        for url in wiki_page_urls
    ]
    # as_completed yields futures in completion order, so the printed lines
    # are not guaranteed to follow the order of wiki_page_urls.
    for future in concurrent.futures.as_completed(futures):
        print(future.result())
print("Threaded time:", time.perf_counter() - threaded_start)

In [39]:
def getBooksData(bookId, timeout=10):
    """Fetch one book record from the Gutendex API.

    Args:
        bookId: Identifier interpolated into ``https://gutendex.com/books/{}``.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (default 10).

    Returns:
        The decoded JSON body, or the string ``'NOT_FOUND'`` when the body
        carries a non-null ``detail`` entry (the id is printed in that case).

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    response_API = requests.get('https://gutendex.com/books/{}'.format(bookId), timeout=timeout)
    parse_json = json.loads(response_API.text)
    # NOTE(review): a "detail" key is presumably the API's error payload for a
    # missing book -- confirm against the Gutendex documentation.
    if parse_json.get('detail') is not None:
        print(bookId)
        return 'NOT_FOUND'
    return parse_json

In [42]:
# Book ids 1..1000 inclusive, requested from the API further down.
listBooks = list(range(1, 1001))
def getBooksThreadPool(listBooks, max_workers=None):
    """Fetch many books concurrently with a thread pool.

    Args:
        listBooks: Iterable of book ids; each one is passed to ``getBooksData``.
        max_workers: Optional pool size forwarded to ``ThreadPoolExecutor``;
            ``None`` keeps the executor's default.

    Returns:
        List of the results that were not ``'NOT_FOUND'``, in the order the
        requests completed (not necessarily the order of ``listBooks``).
    """
    print("Running threaded:")
    # perf_counter() is a monotonic clock intended for measuring intervals.
    threaded_start = time.perf_counter()
    booksData = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(getBooksData, bookId) for bookId in listBooks]
        for future in concurrent.futures.as_completed(futures):
            # Read the result once; result() re-raises any worker exception.
            book = future.result()
            if book != 'NOT_FOUND':
                booksData.append(book)
    print("Threaded time:", time.perf_counter() - threaded_start)
    return booksData

# Fetch every id in listBooks through the thread pool and report how many records came back.
data = getBooksThreadPool(listBooks=listBooks)
print(len(data))

Running threaded:
40
182
183
184
186
185
187
189
188
190
193
191
195
192
194
197
196
198
199
576
900
Threaded time: 36.49359393119812
979
