In [9]:
import numpy as np
import pandas as pd
import time, concurrent.futures, json, requests, re
import numpy as np
import networkx as nx
from threading import Lock


In [4]:
# request to get books
def getBooksThread(bookId, timeout=10):
    response_API = requests.get('https://gutendex.com/books/{}'.format(bookId), timeout=timeout)
    data = response_API.text
    parse_json = json.loads(data)
    if parse_json.get('detail') != None:
        # print(bookId)
        return 'NOT_FOUND'
    return parse_json 

def getBooksData(listBooks):
    print("Running get books threads:")
    threaded_start = time.time()
    booksData = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for bookId in listBooks:
            futures.append(executor.submit(getBooksThread, bookId))
        for future in concurrent.futures.as_completed(futures):
            if future.result() != 'NOT_FOUND':
                booksData.append(future.result())
            # print(future.result())
    print("Threaded get books", time.time() - threaded_start)
    return booksData



def getListBooks(listBooks):
    def transformData(d):
        res = []
        if d.get('formats')!=None:
            for t in d['formats'].keys():
                checkEnd = d['formats'][t].split('.').pop()
                if checkEnd == 'txt':
                    res.append({
                        'id': d['id'],
                        'text_url': d['formats'][t]
                    })
        return res
    
    data = getBooksData(listBooks)
    result = []
    
    print("Running get list books:")
    threaded_start = time.time()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for d in data:
            futures.append(executor.submit(transformData, d))
        for future in concurrent.futures.as_completed(futures):
            result += future.result()
    print("Threaded get list books time:", time.time() - threaded_start)
    return result

def getTableIndex(listBooks):
    tableIndex = dict()
    booksInfo = []
    listBooksData = getListBooks(listBooks)

    lock = Lock()

    def readBook(book):
        response_API = requests.get(book['text_url'])
        data = response_API.text
        #### Option 1: Prendre seulement des mots avec carateres de 4 à 10
        words = re.findall(r"[A-Za-z]{4,10}\w+", data)
        occurentCounts = dict()

        for word in words:
            w = word.lower()
            lock.acquire()
            # Count for table index all books
            if w in tableIndex:
                if book['id'] in tableIndex[w]:
                    tableIndex[w][book['id']] += 1
                else:
                    tableIndex[w][book['id']] = 1
            else:
                tableIndex[w] = dict({book['id']: 1})

            # Count for table index for each book
            if w in occurentCounts:
                occurentCounts[w] += 1
            else:
                occurentCounts[w] = 1
            lock.release()

        return {
            "bookId": book['id'],
            "words": occurentCounts,
            "totalWords": len(words),
            "totalWordsWithOccur": len(occurentCounts)
        }
    print("Running table index:")
    threaded_start = time.time()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for book in listBooksData:
            futures.append(executor.submit(readBook, book))
        for future in concurrent.futures.as_completed(futures):
            booksInfo.append(future.result())
    print("Threaded table index:", time.time() - threaded_start)
    return tableIndex, booksInfo



In [5]:
listBooks = [49345,56667,1,2,3,4,5,6,7]
tableIndexData, booksInfo = getTableIndex(listBooks)
historyWords =dict({
  "carver": 1, 
  "carvet": 1, 
  "saigon": 2, 
  "sargon": 3
})

print(tableIndexData['sargon'])


Running get books threads:
Threaded get books 1.1724321842193604
Running get list books:
Threaded get list books time: 0.0023260116577148438
Running table index:
Threaded table index: 2.3285748958587646
{49345: 98, 56667: 27}


In [8]:


def jaccard_similarity(x,y):
    """A function for finding the similarity between two binary vectors"""
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    similarity = intersection.sum() / float(union.sum())
    return similarity

def jaccardCloseness(tableIndexData):
    # Init variable
    booksData = dict()
    for word in tableIndexData:
        for b in tableIndexData[word].keys(): 
            if b in booksData:
                booksData[b].update(dict({word:1}))
            else:
                booksData.update(dict({b: dict({word:1})}))
    bookDF = pd.DataFrame(booksData.values(),
        index=booksData.keys()).fillna(0)
    # print (bookDF)
    matrixCloseness = []
    for b1 in list(booksData.keys()):
        for b2 in list(booksData.keys()):
            if b1 != b2:
                res = jaccard_similarity(bookDF.loc[b1:b1],bookDF.loc[b2:b2])[0][0]
                if res*100 > 50: # > 50% -> add edge
                    matrixCloseness.append((b1,b2))
    
    # print(matrixCloseness)
    # Create the graph representing the reading app
    G = nx.Graph()
    G.add_edges_from(matrixCloseness)
    closenessData = []

    # Compute the closeness centrality of each node in the graph
    closeness_centrality = nx.closeness_centrality(G)

    # Print the closeness centrality of each node
    for node, closeness in closeness_centrality.items():
        closenessData.append({"bookId": node, "closeness":closeness })

    sortedClosenessData = sorted(closenessData, key=lambda d: d['closeness'], reverse=True) 
    return sortedClosenessData

jaccardCloseness(tableIndexData)


  intersection = np.logical_and(x, y)
  union = np.logical_or(x, y)


TypeError: cannot convert the series to <class 'float'>