In [1]:
import json, requests, time, re
import concurrent.futures
from threading import Lock

In [2]:
# request to get books

def getBooksData(listBooks):
    def getBooksThread(bookId, timeout=10):
        response_API = requests.get('https://gutendex.com/books/{}'.format(bookId), timeout=timeout)
        data = response_API.text
        parse_json = json.loads(data)
        if parse_json.get('detail') != None:
            # print(bookId)
            return 'NOT_FOUND'
        return parse_json 

    print("Running get books threads:")
    threaded_start = time.time()
    booksData = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for bookId in listBooks:
            futures.append(executor.submit(getBooksThread, bookId))
        for future in concurrent.futures.as_completed(futures):
            if future.result() != 'NOT_FOUND':
                booksData.append(future.result())
            # print(future.result())
    print("Threaded get books", time.time() - threaded_start)
    return booksData



def getListBooks(listBooks):
    def transformData(d):
        res = []
        if d.get('formats')!=None:
            for t in d['formats'].keys():
                checkEnd = d['formats'][t].split('.').pop()
                if checkEnd == 'txt':
                    res.append({
                        'id': d['id'],
                        'text_url': d['formats'][t]
                    })
        return res
    
    data = getBooksData(listBooks)
    result = []
    
    print("Running get list books:")
    threaded_start = time.time()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for d in data:
            futures.append(executor.submit(transformData, d))
        for future in concurrent.futures.as_completed(futures):
            result += future.result()
    print("Threaded get list books time:", time.time() - threaded_start)
    return result

def getTableIndex(listBooks):
    tableIndex = dict()
    booksInfo = []
    listBooksData = getListBooks(listBooks)

    lock = Lock()

    def readBook(book):
        response_API = requests.get(book['text_url'])
        data = response_API.text
        #### Option 1: Prendre seulement des mots avec carateres de 4 à 10
        words = re.findall(r"[A-Za-z]{4,10}\w+", data)
        occurentCounts = dict()

        for word in words:
            w = word.lower()
            lock.acquire()
            # Count for table index all books
            if w in tableIndex:
                if book['id'] in tableIndex[w]:
                    tableIndex[w][book['id']] += 1
                else:
                    tableIndex[w][book['id']] = 1
            else:
                tableIndex[w] = dict({book['id']: 1})

            # Count for table index for each book
            if w in occurentCounts:
                occurentCounts[w] += 1
            else:
                occurentCounts[w] = 1
            lock.release()

        return {
            "bookId": book['id'],
            "words": occurentCounts,
            "totalWords": len(words),
            "totalWordsWithOccur": len(occurentCounts)
        }
    print("Running table index:")
    threaded_start = time.time()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for book in listBooksData:
            futures.append(executor.submit(readBook, book))
        for future in concurrent.futures.as_completed(futures):
            booksInfo.append(future.result())
    print("Threaded table index:", time.time() - threaded_start)
    return tableIndex, booksInfo



In [3]:

listBooks = [49345,56667,1,2,3,4,5,6,7]
listBooks = [l for l in range(1,100)] + [49345,56667]

tableIndexData, booksInfo = getTableIndex(listBooks)
historyWords =dict({
  "carver": 1, 
  "carvet": 1, 
  "saigon": 2, 
  "sargon": 3
})

print(tableIndexData['sargon'])
# [print(b) for b in booksInfo]


Running get books threads:


ConnectionError: HTTPSConnectionPool(host='gutendex.com', port=443): Max retries exceeded with url: /books/5 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x10cfb0700>: Failed to establish a new connection: [Errno 61] Connection refused'))