In [1]:
import time, concurrent.futures, json, requests, re
from threading import Lock
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import networkx as nx

In [2]:
#############################################
# ----- CODE FROM FILE back/thread.py ----- #
#############################################

def baseThreadPool(loopList, callback, returnStatus=False, type = 1):
    """Run ``callback(item)`` for every item of ``loopList`` on a thread pool.

    Parameters
    ----------
    loopList : iterable
        Items handed to ``callback`` one at a time.
    callback : callable
        Function taking a single item.
    returnStatus : bool, default False
        When True the callback results are collected and returned. When False
        the results are discarded and an empty list is returned.
    type : int, default 1
        Only used when ``returnStatus`` is True. With 1 each result is
        appended to the output list; with any other value the output list is
        extended with each result, so every callback must return an iterable
        and the output is the flattened concatenation.

    Returns
    -------
    list
        The collected results, in the same order as ``loopList``; ``[]`` when
        ``returnStatus`` is False.

    Raises
    ------
    Exception
        With ``returnStatus`` True, the exception of the first failing
        callback (in input order) is re-raised. With ``returnStatus`` False
        callback failures are not reported, because the futures are never
        inspected.
    """
    data = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(callback, item) for item in loopList]
        if returnStatus:
            # Read the futures in submission order rather than completion
            # order, so a caller that passes a ranked list gets its results
            # back in that same ranking.
            for future in futures:
                if type == 1:
                    data.append(future.result())
                else:
                    data += future.result()
    return data

In [4]:
##################################################
# ----- CODE FROM FILE back/getBooksApi.py ----- #
##################################################

def getBooksThread(bookId, timeout=10):
    """Fetch the Gutendex record of a single book.

    Sends a GET request to ``https://gutendex.com/books/<bookId>`` and decodes
    the response body as JSON.

    Parameters
    ----------
    bookId :
        Identifier inserted into the request URL.
    timeout : default 10
        Forwarded to ``requests.get``.

    Returns
    -------
    The decoded JSON payload, or the string ``'NOT_FOUND'`` when the payload
    carries a non-null ``'detail'`` entry.
    """
    url_template = 'https://gutendex.com/books/{}'
    reply = requests.get(url_template.format(bookId), timeout=timeout)
    payload = json.loads(reply.text)
    # A 'detail' entry in the payload is treated as the "no such book" marker.
    if payload.get('detail') is None:
        return payload
    return 'NOT_FOUND'

def getBooksData(listBooks):
    """Fetch the records of several books through the shared thread pool.

    Calls ``getBooksThread`` once per entry of ``listBooks`` and returns the
    list of results; each element is whatever ``getBooksThread`` returned
    (a decoded record or ``'NOT_FOUND'``). Prints start/end markers with the
    elapsed time in seconds.
    """
    print("RUNNING function getBooksData")
    started_at = time.time()
    fetched = baseThreadPool(listBooks, getBooksThread, returnStatus=True)
    end_message = "END function getBooksData -----> {}"
    print(end_message.format(time.time() - started_at))
    return fetched

def getListBooks(listBooks):
    """Fetch book records and list the plain-text download URLs they expose.

    Parameters
    ----------
    listBooks : iterable
        Book ids passed to ``getBooksData``.

    Returns
    -------
    tuple(list, list)
        ``(textLinks, allBooks)`` where ``textLinks`` holds one
        ``{'id': ..., 'text_url': ...}`` dict per format URL whose last
        dot-separated segment is ``txt``, and ``allBooks`` is the unmodified
        result of ``getBooksData`` (it may contain ``'NOT_FOUND'`` entries).

    NOTE(review): URLs that carry a suffix after ``.txt`` (for example
    ``.txt.utf-8``) are not selected; confirm this matches the URLs the API
    currently serves.
    """
    print("RUNNING function getListBooks")
    threaded_start = time.time()

    def transformData(d):
        # getBooksThread returns the string 'NOT_FOUND' for unknown ids; such
        # entries have no formats and must not abort the whole load.
        if not isinstance(d, dict) or d.get('formats') is None:
            return []
        return [
            {'id': d['id'], 'text_url': url}
            for url in d['formats'].values()
            if url.split('.').pop() == 'txt'
        ]

    allBooks = getBooksData(listBooks)
    # Pure in-memory filtering: a plain loop is cheaper than a thread pool and
    # keeps the links in the same order as allBooks.
    result = []
    for bookRecord in allBooks:
        result += transformData(bookRecord)
    print("END function getListBooks -----> {}".format(time.time() - threaded_start))
    return result, allBooks

In [11]:
#################################################
# ----- CODE FROM FILE back/tableIndex.py ----- #
#################################################

def getTableIndex(listBooks, timeout=30):
    """Build the word index of all books and the per-book word statistics.

    The text of every link returned by ``getListBooks`` is downloaded (one
    thread per book through ``baseThreadPool``), tokenised and counted.

    Parameters
    ----------
    listBooks : iterable
        Book ids passed to ``getListBooks``.
    timeout : default 30
        Forwarded to ``requests.get`` for each text download so a stalled
        connection cannot block the load forever.

    Returns
    -------
    tuple(dict, list, list)
        ``(tableIndex, booksInfo, allBooks)``:

        * ``tableIndex`` maps a lower-cased word to ``{bookId: occurrences}``.
        * ``booksInfo`` holds one dict per downloaded text with the keys
          ``bookId``, ``words`` (word -> occurrences), ``totalWords`` (number
          of tokens) and ``totalWordsWithOccur`` (number of distinct words).
        * ``allBooks`` is the second value returned by ``getListBooks``.
    """
    print('RUNNING function getTableIndex')
    threaded_start = time.time()
    listBooksData, allBooks = getListBooks(listBooks)

    def readBook(book):
        response_API = requests.get(book['text_url'], timeout=timeout)
        data = response_API.text

        # Option 1: keep tokens made of 4 to 10 ASCII letters immediately
        # followed by at least one more word character, i.e. fragments of five
        # characters or more.
        # NOTE(review): the pattern does not cap the length at 10 and skips
        # 4-letter words, although the original comment said "4 to 10
        # characters" - confirm the intended rule before changing it.
        words = re.findall(r"[A-Za-z]{4,10}\w+", data)

        # Count locally: nothing shared is touched inside the worker thread,
        # so no lock is required.
        occurentCounts = dict()
        for word in words:
            w = word.lower()
            occurentCounts[w] = occurentCounts.get(w, 0) + 1

        return {
            "bookId": book['id'],
            "words": occurentCounts,
            "totalWords": len(words),
            "totalWordsWithOccur": len(occurentCounts)
        }

    booksInfo = baseThreadPool(listBooksData, readBook, True)

    # Merge the per-book counts on the calling thread only. Counts are added,
    # so a book that exposes several text links accumulates across them.
    tableIndex = dict()
    for info in booksInfo:
        bookId = info['bookId']
        for w, count in info['words'].items():
            perBook = tableIndex.setdefault(w, dict())
            perBook[bookId] = perBook.get(bookId, 0) + count

    print('END function getTableIndex -----> {}'.format(time.time() - threaded_start))
    return tableIndex, booksInfo, allBooks



In [6]:
#############################################
# ----- CODE FROM FILE back/cosine.py ----- #
#############################################

def cosineSearchWord(historyWords, tableIndexData):
    """Rank books by cosine similarity with the search history.

    Parameters
    ----------
    historyWords : dict
        word -> weight; used as the query vector.
    tableIndexData : dict
        word -> {bookId: occurrences}.

    Returns
    -------
    dict
        bookId -> similarity, ordered from most to least similar. Only books
        containing at least one history word are included.
    """
    print("RUNNING function cosineSearchWord")
    started_at = time.time()

    # The first row is the query itself; every further row is one candidate
    # book restricted to the history words it contains.
    vectors = {'history': historyWords}
    for term in historyWords:
        if term not in tableIndexData:
            continue
        for bookId, count in tableIndexData[term].items():
            vectors.setdefault(bookId, {})[term] = count

    # Words missing from a row become 0 so that all rows share the same columns.
    frame = pd.DataFrame(list(vectors.values()), index=list(vectors.keys())).fillna(0)

    scores = {
        bookId: cosine_similarity(frame.loc["history":"history"], frame.loc[bookId:bookId])[0][0]
        for bookId in list(vectors)[1:]
    }

    # The sort is stable, so equal scores keep their insertion order.
    rankedIds = sorted(scores, key=scores.get, reverse=True)
    sortedBooks = {bookId: scores[bookId] for bookId in rankedIds}

    print("END function cosineSearchWord -----> {}".format(time.time() - started_at))
    return sortedBooks

def getMatrixCloseness(tableIndexData, threshold=0.5):
    """Rank books by closeness centrality in a book-similarity graph.

    Every book becomes a vector of word occurrences. Two books are linked
    when the cosine similarity of their vectors is strictly greater than
    ``threshold``. The closeness centrality of each linked book is then
    computed with networkx.

    Parameters
    ----------
    tableIndexData : dict
        word -> {bookId: occurrences}.
    threshold : float, default 0.5
        Minimum (exclusive) cosine similarity for an edge.

    Returns
    -------
    list of dict
        ``{"bookId": ..., "closeness": ...}`` entries sorted by decreasing
        closeness. Books without any edge are not part of the graph and are
        therefore absent from the result.
    """
    print("RUNNING function getMatrixCloseness")
    threaded_start = time.time()

    # Pivot word -> {book: count} into book -> {word: count}. This is done on
    # one thread: the check-then-insert on the shared dict is not safe to run
    # from several threads without a lock.
    booksData = dict()
    for word, perBook in tableIndexData.items():
        for b, count in perBook.items():
            booksData.setdefault(b, dict())[word] = count

    bookIds = list(booksData.keys())
    matrixCloseness = []
    if len(bookIds) > 1:
        bookDF = pd.DataFrame(list(booksData.values()), index=bookIds).fillna(0)
        # One call yields the full pairwise matrix; only the strict upper
        # triangle is inspected because the graph is undirected.
        similarity = cosine_similarity(bookDF.to_numpy())
        rows, cols = np.triu_indices(len(bookIds), k=1)
        keep = similarity[rows, cols] > threshold
        matrixCloseness = [(bookIds[i], bookIds[j]) for i, j in zip(rows[keep], cols[keep])]

    # Create the graph whose nodes are books and whose edges are similar pairs
    G = nx.Graph()
    G.add_edges_from(matrixCloseness)

    # Compute the closeness centrality of each node in the graph
    closeness_centrality = nx.closeness_centrality(G)
    closenessData = [
        {"bookId": node, "closeness": closeness}
        for node, closeness in closeness_centrality.items()
    ]

    sortedClosenessData = sorted(closenessData, key=lambda d: d['closeness'], reverse=True)
    print("END function getMatrixCloseness -----> {}".format(time.time() - threaded_start))
    return sortedClosenessData


In [8]:
# Ids of the books loaded by index(). For a larger run use e.g. list(range(1, 20)).
listBooks = [49345, 56667, 1, 2, 3, 4, 5, 6, 7]

# Search history (word -> weight) used as the query of cosineSearchWord.
historyWords = {"sargon": 3, "saigon": 1}
# Book id -> click count, read by most_read().
clickedBooks = {}
# Last keyword searched, read by last_search() and suggestion().
lastSearchWord = {"word": "saigon"}

# Result holders. "status" starts as True and is switched to False by the code
# that fills "data"; the route functions only compute while it is still True.
suggestionObject = {"data": [], "status": True}
lastSearchObject = {"data": [], "status": True}
rankingObject = {"data": [], "status": True}
mostReadObject = {"data": [], "status": True}
booksInfoObject = {"data": [], "status": True}
allBooksoObject = {"data": [], "status": True}
closenessDataObject = {"data": [], "status": True}
tableIndexDataObject = {"data": {}, "status": True}

# Switched to False at the end of index().
loadingBack = {"status": True}
# Book most recently fetched by read_book_content().
lastReadingBook = {"bookId": None, "data": "", "link": ""}

In [13]:
#####################################################
# ----- TEST FOR ROUTE http://127.0.0.1:5000/ ----- #
#####################################################

def index():
    """Load the data every other route reads.

    Builds the table index, the per-book statistics and the raw book records
    with ``getTableIndex(listBooks)``, then the closeness ranking with
    ``getMatrixCloseness``. Each holder's ``status`` is set to False once its
    ``data`` is filled, and ``loadingBack['status']`` is set to False last.
    Prints start/end markers with the elapsed time in seconds.
    """
    print('START LOADING DATA')
    began_at = time.time()

    tableIndex, booksInfo, allBooks = getTableIndex(listBooks)
    tableIndexDataObject['data'] = tableIndex
    booksInfoObject['data'] = booksInfo
    allBooksoObject['data'] = allBooks
    for holder in (tableIndexDataObject, booksInfoObject, allBooksoObject):
        holder['status'] = False

    # The closeness ranking is derived from the table index built just above.
    closenessDataObject['data'] = getMatrixCloseness(tableIndex)
    closenessDataObject['status'] = False

    loadingBack['status'] = False
    print('END LOADING DATA -> {}'.format(time.time() - began_at))

index()

START LOADING DATA
RUNNING function getTableIndex
RUNNING function getListBooks
RUNNING function getBooksData
END function getBooksData - 0.4752471446990967
END function getListBooks 0.47687506675720215
END function getTableIndex -> 6.985242128372192
RUNNING function getMatrixCloseness
END function getMatrixCloseness -> 0.7934279441833496
END LOADING DATA -> 7.785411834716797


In [None]:
###########################################################
# ------------------------------------------------------- #
# ----- TEST FOR ROUTE http://127.0.0.1:5000/cosine ----- #
# ------------------------------------------------------- #
# -- This route for get ranking with cosine similarity -- #
# ------------------------------------------------------- #
###########################################################

def cosine():
    """Fill ``rankingObject`` with the books closest to the search history.

    Runs only while ``rankingObject['status']`` is True and the table index
    has been loaded (``tableIndexDataObject['status']`` is False). The books
    are ranked with ``cosineSearchWord`` and the record of each top entry is
    fetched one by one with ``getBooksThread``, in ranking order.
    """
    print("RUN ROUTE /cosine")
    time_start = time.time()
    indexLoaded = not tableIndexDataObject['status']
    if rankingObject["status"] and indexLoaded:
        scoredBooks = cosineSearchWord(historyWords, tableIndexDataObject['data'])
        # NOTE(review): positions 0..10 are kept, i.e. up to 11 books -
        # confirm whether a top 10 was intended.
        topIds = list(scoredBooks)[:11]
        rankingObject["data"] = [getBooksThread(bookId) for bookId in topIds]
        rankingObject["status"] = False
    print('END ROUTE /cosine ----------> {}'.format(time.time() - time_start))
    # return jsonify(rankingObject["data"])

cosine()

In [None]:
#############################################################
# --------------------------------------------------------- #
# ----- TEST FOR ROUTE http://127.0.0.1:5000/mostread ----- #
# --------------------------------------------------------- #
# -- This route for get ranking with most clicked books --- #
# --------------------------------------------------------- #
#############################################################

def most_read():
    """Fill ``mostReadObject`` with the most clicked books.

    Runs only while ``mostReadObject['status']`` is True. The ids of
    ``clickedBooks`` are ordered by decreasing click count and the record of
    each top entry is fetched one by one with ``getBooksThread``.
    """
    print("RUN ROUTE /mostread")
    time_start = time.time()
    if mostReadObject['status']:
        # The sort is stable: ids with equal counts keep their insertion order.
        idsByClicks = sorted(clickedBooks, key=clickedBooks.get, reverse=True)
        # NOTE(review): positions 0..10 are kept, i.e. up to 11 books -
        # confirm whether a top 10 was intended.
        mostReadObject["data"] = [getBooksThread(bookId) for bookId in idsByClicks[:11]]
        mostReadObject["status"] = False
    print('END ROUTE /mostread ----------> {}'.format(time.time() - time_start))
    # return jsonify(mostReadObject['data'])

most_read()

In [None]:
###################################################################
# --------------------------------------------------------------- #
# ------- TEST FOR ROUTE http://127.0.0.1:5000/lastsearch ------- #
# --------------------------------------------------------------- #
# --- This route for get data of the last search by keyword  ---- #
# --------------------------------------------------------------- #
###################################################################

def last_search():
    """Fill ``lastSearchObject`` with the books containing the last keyword.

    Runs only while ``lastSearchObject['status']`` is True and the table
    index has been loaded (``tableIndexDataObject['status']`` is False), so
    a call made before loading does not cache an empty result. The ids are
    ordered by decreasing number of occurrences of the keyword before being
    passed to ``getBooksData``. An empty or unknown keyword yields an empty
    list.
    """
    print('RUN ROUTE /lastsearch')
    time_start = time.time()
    if lastSearchObject["status"] and not tableIndexDataObject['status']:
        lastSearch = lastSearchWord["word"]
        sortedBooks = dict()
        occurrences = tableIndexDataObject['data'].get(lastSearch)
        if lastSearch != "" and occurrences is not None:
            sortedBooks = dict(sorted(occurrences.items(), key=lambda x: x[1], reverse=True))
        lastSearchObject["data"] = getBooksData(list(sortedBooks.keys()))
        lastSearchObject["status"] = False
    # Printed on every call, like the other routes.
    print('END ROUTE /lastsearch ----------> {}'.format(time.time() - time_start))
    # return jsonify(lastSearchObject["data"])

last_search()

In [None]:
######################################################################
# ------------------------------------------------------------------ #
# --------- TEST FOR ROUTE http://127.0.0.1:5000/suggestion -------- #
# ------------------------------------------------------------------ #
# --- This route for suggest data from the last search keyword  ---- #
# ------------------------------------------------------------------ #
###################################################################### 

def suggestion():
    """Fill ``suggestionObject`` with books close to those matching the last keyword.

    Runs only while ``suggestionObject['status']`` is True and both the table
    index and the closeness ranking are loaded (their ``status`` is False).

    For every entry of the closeness ranking whose book contains the last
    searched keyword, the neighbouring entries of the ranking (next one first,
    then previous one) are suggested, unless the neighbour also contains the
    keyword or has already been suggested. The records of the suggested ids
    are then fetched with ``getBooksData``.
    """
    print('RUN ROUTE /suggestion')
    threaded_start = time.time()
    if suggestionObject["status"] and not tableIndexDataObject['status'] and not closenessDataObject['status']:
        lastSearch = lastSearchWord["word"]

        # Books containing the keyword; only membership matters here.
        matchedBooks = dict()
        if lastSearch != "":
            matchedBooks = tableIndexDataObject['data'].get(lastSearch) or dict()

        ranking = closenessDataObject['data']
        suggestionBooks = []
        # A plain loop keeps the suggestions in a deterministic order and
        # needs no lock around the shared list.
        for position, closeData in enumerate(ranking):
            if closeData['bookId'] not in matchedBooks:
                continue
            for neighbour in (position + 1, position - 1):
                # The first and last entries have only one neighbour, and a
                # single-entry ranking has none.
                if not 0 <= neighbour < len(ranking):
                    continue
                candidate = ranking[neighbour]['bookId']
                if candidate not in matchedBooks and candidate not in suggestionBooks:
                    suggestionBooks.append(candidate)

        suggestionObject["data"] = getBooksData(suggestionBooks)
        suggestionObject["status"] = False
    print('END ROUTE /suggestion ----------> {}'.format(time.time() - threaded_start))
    # return jsonify(suggestionObject["data"])

# Called at cell level: inside the function body this call would recurse
# without end and the cell itself would never run the route.
suggestion()

In [None]:
######################################################################
# ------------------------------------------------------------------ #
# ----- TEST FOR ROUTE http://127.0.0.1:5000/searchbook/<word> ----- #
# ------------------------------------------------------------------ #
# ------ This route for find books with <word> in table index ------ #
# ------------------------------------------------------------------ #
###################################################################### 

def search_books(word):
    """Return the records of the books containing ``word``.

    Parameters
    ----------
    word : str
        Key looked up in the table index.

    Returns
    -------
    list or str
        * the result of ``getBooksData`` for the ids of the books containing
          ``word``, passed in order of decreasing occurrences;
        * ``"NOT_FOUND"`` when the word is not in the table index;
        * ``"NO TABLE INDEX AVAILABLE"`` when the table index is not loaded
          (``tableIndexDataObject['status']`` is still True).
    """
    print("RUN ROUTE /searchbook/<word>")
    time_start = time.time()
    if tableIndexDataObject['status']:
        result = "NO TABLE INDEX AVAILABLE"
    else:
        occurrences = tableIndexDataObject['data'].get(word)
        if occurrences is None:
            result = "NOT_FOUND"
        else:
            sortedBooks = dict(sorted(occurrences.items(), key=lambda x: x[1], reverse=True))
            result = getBooksData(list(sortedBooks.keys()))
    # Printed on every path so the timing is always reported.
    print("END ROUTE /searchbook/<word> ----------> {}".format(time.time() - time_start))
    return result

search_books('saigon')

In [None]:
#############################################################################
# ------------------------------------------------------------------------- #
# ----- TEST FOR ROUTE http://127.0.0.1:5000/readbookcontent/<bookId> ----- #
# ------------------------------------------------------------------------- #
# ----------- This route for reading book from book id <bookId> ----------- #
# ------------------------------------------------------------------------- #
#############################################################################

def read_book_content(bookId, timeout=30):
    """Load the HTML content of a book into ``lastReadingBook``.

    When ``bookId`` differs from the cached ``lastReadingBook['bookId']`` the
    book record is fetched with ``getBooksThread`` and the HTML format it
    exposes (a format URL containing ``.htm``) is downloaded. The cached
    ``link`` and ``data`` are reset first, so a book that is unknown or has no
    HTML format ends up with empty content rather than the previous book's.

    The content between the ``<body ...>`` and ``</body>`` tags is then kept,
    and every ``images/`` occurrence is prefixed with the Gutenberg cache URL
    of the book.

    Parameters
    ----------
    bookId :
        Identifier of the book to read.
    timeout : default 30
        Forwarded to ``requests.get`` for the HTML download.
    """
    print("RUN ROUTE /readbookcontent/<bookId>")
    time_start = time.time()
    if lastReadingBook['bookId'] != bookId:
        book_data = getBooksThread(bookId)
        lastReadingBook['link'] = ""
        lastReadingBook['data'] = ""

        # getBooksThread returns the string 'NOT_FOUND' for unknown ids.
        formats = book_data.get('formats') if isinstance(book_data, dict) else None
        # '.htm' also covers the '.html' and '.html.images' URLs.
        htmlLinks = [url for url in (formats or {}).values() if '.htm' in url]
        if htmlLinks:
            # When several formats match, the last one listed is used; only
            # that one is downloaded.
            lastReadingBook['link'] = htmlLinks[-1]
            response_API = requests.get(htmlLinks[-1], timeout=timeout)
            lastReadingBook['data'] = response_API.text
        lastReadingBook['bookId'] = bookId

    # Accept a body tag with attributes and any letter case.
    pattern = re.compile(r'<body\b[^>]*>(.*?)</body>', re.DOTALL | re.IGNORECASE)
    result = pattern.search(lastReadingBook['data'])

    if result:
        body_content = result.group(1)
        replace_image = body_content.replace('images/', 'https://www.gutenberg.org/cache/epub/{}/images/'.format(bookId))
        # Once stored, the content no longer has body tags, so a repeated call
        # for the same book does not rewrite the image paths a second time.
        lastReadingBook['data'] = replace_image
    print("END ROUTE /readbookcontent/<bookId> ----------> {}".format(time.time() - time_start))
    # return jsonify({ "link": lastReadingBook['link'],'textHtml' : lastReadingBook['data']})

read_book_content(56667)
        