# Bible Explore

Exploring [Kaggle Bible Corpus](https://www.kaggle.com/oswinrh/bible) for similarity between passages.

Some extra information at [OpenBible](http://www.openbible.info/) and [Bible Databases](https://github.com/scrollmapper/bible_databases)

The current work is started in English and might be extended to other languages later on

In [1]:
import scipy as sp
import numpy as np
import pandas as pd
import csv
import os
import sys
import re
from queue import Queue
from pathlib import Path
import seaborn as sns
import bokeh
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import networkx as nx
from pyvis.network import Network
import pickle

%matplotlib inline

In [2]:
# https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists
# from scipy.spatial import distance
# scipy.spatial.distance.cosine

# or with numpy:
# from numpy import dot
# from numpy.linalg import norm

# cos_sim = dot(a, b)/(norm(a)*norm(b))

In [3]:
# this is for Tensorflow to use the good GPU instead of the first it sees
# os.environ["CUDA_VISIBLE_DEVICES"]="1"
# and this is to set tf to use the GPU

In [4]:
print(tf.__version__)
print(tf.config.list_physical_devices())


2.3.2
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:1', device_type='XLA_GPU')]


In [5]:
# loading data
# load chapters mappings
BASE_PATH_DB = Path("/home/leo/projects/AI/Datasets/text/religion/bible/kaggle-bible-corpus")
KEY_PATH_DB = BASE_PATH_DB / "key_english.csv"

corpus_path = BASE_PATH_DB / "t_asv.csv"
with open(corpus_path, newline='') as f:
    corpus = csv.reader(f)
    corpus_db = [r for r in corpus]
    verses = [r[-1] for r in corpus_db[1:]]  # omit the first line; It's the header

In [6]:
verses[0]

'In the beginning God created the heavens and the earth.'

In [7]:
key_verse_map = {}

with open(KEY_PATH_DB, newline='') as f:
    rows = csv.reader(f)
    kvs = [r for r in rows]
    key_verse_map = { i[0] : i[1] for i in kvs[1:]}

In [8]:
key_verse_map

{'1': 'Genesis',
 '2': 'Exodus',
 '3': 'Leviticus',
 '4': 'Numbers',
 '5': 'Deuteronomy',
 '6': 'Joshua',
 '7': 'Judges',
 '8': 'Ruth',
 '9': '1 Samuel',
 '10': '2 Samuel',
 '11': '1 Kings',
 '12': '2 Kings',
 '13': '1 Chronicles',
 '14': '2 Chronicles',
 '15': 'Ezra',
 '16': 'Nehemiah',
 '17': 'Esther',
 '18': 'Job',
 '19': 'Psalms',
 '20': 'Proverbs',
 '21': 'Ecclesiastes',
 '22': 'Song of Solomon',
 '23': 'Isaiah',
 '24': 'Jeremiah',
 '25': 'Lamentations',
 '26': 'Ezekiel',
 '27': 'Daniel',
 '28': 'Hosea',
 '29': 'Joel',
 '30': 'Amos',
 '31': 'Obadiah',
 '32': 'Jonah',
 '33': 'Micah',
 '34': 'Nahum',
 '35': 'Habakkuk',
 '36': 'Zephaniah',
 '37': 'Haggai',
 '38': 'Zechariah',
 '39': 'Malachi',
 '40': 'Matthew',
 '41': 'Mark',
 '42': 'Luke',
 '43': 'John',
 '44': 'Acts',
 '45': 'Romans',
 '46': '1 Corinthians',
 '47': '2 Corinthians',
 '48': 'Galatians',
 '49': 'Ephesians',
 '50': 'Philippians',
 '51': 'Colossians',
 '52': '1 Thessalonians',
 '53': '2 Thessalonians',
 '54': '1 Timothy

In [9]:
# TF Universal Sentence Encoder models
#@title Load the Universal Sentence Encoder's TF Hub module
MODELS_BASE_PATH = "/home/leo/projects/AI/Datasets/Tensorflow/tf-hub/"
# module_url = os.path.join(BASE_PATH, "universal-sentence-encoder-lite_2")
module_url = os.path.join(MODELS_BASE_PATH, "universal-sentence-encoder-multilingual_3")
# module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# module_url = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'

In [10]:
model = hub.load(module_url)
print ("module %s loaded" % module_url)

def embed(input):
  return model(input)

module /home/leo/projects/AI/Datasets/Tensorflow/tf-hub/universal-sentence-encoder-multilingual_3 loaded


In [11]:
# I don't get wht this consumes so much CPU and memory and is even slower than the full cpu version
# %%time

# with tf.device('/GPU:1'):
#     model = hub.load(module_url)
#     print ("module %s loaded" % module_url)

#     def embed(input):
#       return model(input)

#     bible_embeddings = embed(verses)

In [12]:
%%time

bible_embeddings = embed(verses)

CPU times: user 2min 29s, sys: 33.8 s, total: 3min 3s
Wall time: 31.1 s


In [69]:
bible_embeddings.shape

TensorShape([31103, 512])

In [14]:
%%time
# similarity by inner product
similarity_matrix_inner = np.inner(bible_embeddings, bible_embeddings)

CPU times: user 10.9 s, sys: 1.15 s, total: 12.1 s
Wall time: 3.44 s


In [15]:
similarity_matrix_inner.shape

(31103, 31103)

In [16]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

the matrix must be symmetric

In [17]:
check_symmetric(similarity_matrix_inner)

True

In [18]:
# labels = 

In [19]:
# too big for my PC 
# sns.set(font_scale=1.2)
# g = sns.heatmap(
#   similarity_matrix_inner,
# #       xticklabels=labels,
# #       yticklabels=labels,
#   vmin=0,
#   vmax=1,
#   cmap="YlOrRd")
# g.set_xticklabels(labels, rotation=90)
# g.set_title("Semantic Textual Similarity")


In [20]:
%%time
# get the closest and farthest ~ N for each
# https://www.kite.com/python/answers/how-to-find-the-n-maximum-indices-of-a-numpy-array-in-python
# https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html
# https://numpy.org/doc/stable/reference/generated/numpy.partition.html

n = 21  # such as n>1 , when n==1 it shows only self-similarity
partitions = np.argpartition(similarity_matrix_inner, -n, axis=0)
closests = partitions[-n:]
farthest = partitions[:n]

CPU times: user 14 s, sys: 579 ms, total: 14.6 s
Wall time: 14.6 s


In [21]:
len(closests)
closests[0].shape

(31103,)

In [22]:
%%time
# now there are 2 arrays, each with verses number of elements, each array contains 
n_close = np.array(closests).transpose()
n_far = np.array(farthest).transpose()

CPU times: user 1.99 ms, sys: 396 µs, total: 2.39 ms
Wall time: 2.22 ms


In [23]:
similarity_matrix_inner.shape, n_close.shape, n_far.shape

((31103, 31103), (31103, 21), (31103, 21))

In [24]:
n_close[:2]

array([[15337, 31055, 26045, 19212, 24736, 16628, 13772, 26046, 15845,
        18579, 16474, 29973,   148,  5200, 28593,    31, 16625,    16,
           34, 15380,     0],
       [12522,  1910,   182,   190,    27, 14133, 19050,   194, 17829,
            8, 30527,  1928,   149,   176,   177, 25278,   178,     1,
           36,    37, 17530]])

In [25]:
# clean up memory ..
del(closests)
del(farthest)


Show the similarity values for a few points

In [26]:
similarity_matrix_inner[0][n_close[0][:20]]

array([0.5309429 , 0.53220767, 0.535607  , 0.54301083, 0.55235803,
       0.57052946, 0.55402035, 0.5800376 , 0.5822105 , 0.5618566 ,
       0.56626326, 0.57259107, 0.60005933, 0.5615088 , 0.5582243 ,
       0.6272632 , 0.65638304, 0.6330644 , 0.70625603, 0.64020944],
      dtype=float32)

In [27]:
similarity_matrix_inner[0][n_far[0]]

array([ 0.09364344,  0.08305697,  0.06654037,  0.12651277,  0.07592306,
        0.14878714,  0.13647093,  0.13357072,  0.05880683, -0.02358215,
        0.06586871,  0.11249685,  0.14682116,  0.0791003 ,  0.10768902,
        0.11913791,  0.13447869,  0.15217146,  0.11318401,  0.13786386,
        0.05275048], dtype=float32)

In [28]:
# n_close_distance = np.take_along_axis(similarity_matrix_inner, n_close, axis=0)  #this does not work

In [29]:
# n_close_distance = similarity_matrix_inner[tuple(n_close)]  # this does not work either
# n_close_distance = similarity_matrix_inner[n_close]  # this does not work either, it overloads the memory

In [30]:
# tnclose = tuple(n_close)  

In [31]:
# n_far_distance =  similarity_matrix_inner[n_far]  # Nope, this overloads the memory

extract similarity values for the close elements (this will be useful for displaying later the 

In [32]:
%%time
# somehow this is not sorted as it should
close_matrix = np.array([similarity_matrix_inner[i][n_close[i]] for i in range(similarity_matrix_inner.shape[0])])
far_matrix = np.array([similarity_matrix_inner[i][n_far[i]] for i in range(similarity_matrix_inner.shape[0])])

CPU times: user 70.8 ms, sys: 0 ns, total: 70.8 ms
Wall time: 71.3 ms


In [33]:
close_matrix.shape, far_matrix.shape

((31103, 21), (31103, 21))

In [34]:
close_matrix[0]

array([0.5309429 , 0.53220767, 0.535607  , 0.54301083, 0.55235803,
       0.57052946, 0.55402035, 0.5800376 , 0.5822105 , 0.5618566 ,
       0.56626326, 0.57259107, 0.60005933, 0.5615088 , 0.5582243 ,
       0.6272632 , 0.65638304, 0.6330644 , 0.70625603, 0.64020944,
       1.0000001 ], dtype=float32)

In [35]:
# now I need to understand many MANY things, 
# compute the labels and IDs for each row
# give the text for each row
# create a networkx connection graph with it


Compute Human Readable Labels

In [36]:
corpus_db[0]

['id', 'b', 'c', 'v', 't']

In [37]:
corpus_db[11000:11003]

[['13023016', '13', '23', '16', 'The sons of Gershom: Shebuel the chief.'],
 ['13023017',
  '13',
  '23',
  '17',
  'And the sons of Eliezer were: Rehabiah the chief; and Eliezer had no other sons; but the sons of Rehabiah were very many.'],
 ['13023018', '13', '23', '18', 'The sons of Izhar: Shelomith the chief.']]

In [38]:
%%time
# db contains all the information AND the embeddings, this also contains the graph information
bible_db = {}

for i in range(1, len(corpus_db)-1):
    verse = corpus_db[i]
#     k_id = int(verse[0])
    val = {
        'index':i-1,
        'id': int(verse[0]),
        'name': f"{key_verse_map[verse[1]]} {verse[2]}:{verse[3]}",
        'book_id': int(verse[1]),
        'chapter_id': int(verse[2]),
        'verse_id': int(verse[3]),
        'text': verse[4],
        'embedding': bible_embeddings[i],
        'close_to': n_close[i],  # ids
        'close_to_distance': close_matrix[i], 
        'far_from': n_far[i],  # ids
        'far_from_distance': far_matrix[i], 
    }
    bible_db[i-1] = val

CPU times: user 1.81 s, sys: 33.6 ms, total: 1.84 s
Wall time: 1.84 s


In [39]:
bible_db[10]

{'index': 10,
 'id': 1001011,
 'name': 'Genesis 1:11',
 'book_id': 1,
 'chapter_id': 1,
 'verse_id': 11,
 'text': "And God said, Let the earth put forth grass, herbs yielding seed, `and' fruit-trees bearing fruit after their kind, wherein is the seed thereof, upon the earth: and it was so.",
 'embedding': <tf.Tensor: shape=(512,), dtype=float32, numpy=
 array([ 0.09106142,  0.03639635,  0.07232352,  0.0180516 ,  0.00536718,
         0.03608396,  0.05259072, -0.02857068,  0.04545586,  0.06467625,
         0.06814095, -0.03513066,  0.02862555,  0.02917071, -0.00231802,
         0.00881511, -0.05753801,  0.04993311, -0.01497626,  0.01349286,
         0.01813623, -0.02739129,  0.03048592, -0.03268808, -0.07027489,
         0.06431985, -0.05512333, -0.06109827, -0.0350656 ,  0.0462325 ,
        -0.05182962, -0.06845409, -0.0513493 ,  0.02085048,  0.00632241,
         0.00783822,  0.04397808, -0.00993006, -0.05865693,  0.02339393,
         0.00523461, -0.06789356, -0.07403339, -0.0083824 , -

In [40]:
# save the DB
BIBLE_DB_PATH = "../db/bible-db.pkl"
with open(BIBLE_DB_PATH, 'wb') as f:
    pickle.dump(bible_db, f, pickle.HIGHEST_PROTOCOL)
    
## kind of big, 86 mb

In [41]:
BIBLE_EMBEDDINGS_PATH = "../db/bible-embeddings.pkl"
with open(BIBLE_EMBEDDINGS_PATH, 'wb') as f:
    pickle.dump(bible_embeddings, f, pickle.HIGHEST_PROTOCOL)
    

In [44]:
# function to get the subgraph 

def _get_edges(n_index, node, close_points):
    """
    n_index: node index in the encoding matrix
    node: the node in the db
    close_points: the number of close points to get
    """
    nd_weight = zip(node['close_to'], node['close_to_distance'])
    nd_edges = []
    for n,w in nd_weight:
        if n != n_index:
            # distance is farthest the smaller it is, so changing it to make the reverse relation
            nd_edges.append((n_index, n, 1/w))
        if len(nd_edges) >= close_points:
            break
    return nd_edges


def get_subgraph(bible_db, node_id, close_points=5, levels=2):
    """
    Get the subgraph from a node id
    bible_db: the entire bible database in a dict
    node_id: the node id
    close_points: the number of close points from each, => each point will have at most close_points outgoing edges
    levels: number of levels to go in depth for connections
    returns a networkx graph of the subgraph from the complete db centering the subgraph in the given node_id
    """
    g = nx.Graph()
    if node_id not in bible_db:
        # there is no graph to build
        return g
    # Recursive is SO intuitive, but will explode the stack and memory for big graphs
    nid = node_id
    nodes_to_add = []  # (node_id, group, size, label, title, txt)
    edges_to_add = []  # (node_id, node_id, weight)
    discovered = set()
    q = Queue()
    q.put((node_id, 0))  # keep (node, depth from center in levels)
    cnt = 0
    # this tree/ graph transversal is not WOW HOW EFFICIENT, but it works well enough
    while not q.empty() and cnt < levels+1:
        node_id, lvl = q.get()
        discovered.add(node_id)
        node = bible_db[node_id]
        nodes_to_add.append((node, lvl))
        # only add the edges if the level is not the max
        if lvl < levels:
            nd_edges = _get_edges(node_id, node, close_points)
            for edg in nd_edges:
                edges_to_add.append((edg, lvl))
                sn, en, w = edg
                if en not in discovered:
                    q.put((en, lvl+1))
        cnt = lvl
    # TODO
    for node, lvl in nodes_to_add:
#         if lvl < levels:
# #             g.add_node(nid)
        g.add_node(node['index'], size=20, group=node['book_id'], title=node['name'], data=node['text'])
#         g.add_node(int(nid), size=20, group=node['book_id'], title=node['name'], data=node['text'])
    
    for edg, lvl in edges_to_add:
        (sn, en, w) = edg
#         print(edg)
        # pyvis complains that this are not int fields!! (but they are)
        g.add_edge(int(sn), int(en), weight=w)
#         g.add_edge(sn, en)

    return nodes_to_add, edges_to_add, g

In [45]:
nodes, edges, sg = get_subgraph(bible_db, 127, close_points=5, levels=3)

In [46]:
# [(n[0]['index'], n[1]) for n in nodes]

In [47]:
# edges

In [48]:
type(sg)

networkx.classes.graph.Graph

In [49]:
sg.nodes[127]

{'size': 20,
 'group': 1,
 'title': 'Genesis 5:22',
 'data': 'and Enoch walked with God after he begat Methuselah three hundred years, and begat sons and daughters:'}

In [52]:
# options = {
#     'node_color': 'yellow',
#     'node_size': 600,
# #     'width': 3,
# #     'arrowstyle': '-|>',
# #     'arrowsize': 12,
# }
# nx.draw_networkx(sg, **options)
# plt.show()

In [53]:
# gg = nx.from_edgelist(list(sg.edges))

In [63]:
nt = Network('800px', '800px', notebook=True)
nt.from_nx(sg)

In [64]:
nt.show('nx.html')

In [100]:
def get_closest_points(txt, n=21, algorithm='inner'):
    """
    txt: the text to look for similarities
    n: the number of closest matches that will be searched
    algorithm: inner|cosine  # the algorithm to determine how the proximity is computed
    returns the closest n points to the input text based on the proximity algorithm
    """
    # TODO
    # compute input embedding 
    embd = embed([txt])
    # compute proximity with all the existing points
    similarity = np.inner(bible_embeddings, embd)
#     print(similarity.shape)
    # get the closest n points ids
    # such as n>1 , when n==1 it shows only self-similarity
    partitions = np.argpartition(similarity, -n, axis=0)
#     print(partitions.shape)
    n_close = partitions[-n:]
    n_far = partitions[:n]
    # needs a complete matrix
    return n_close, n_far


In [101]:
search_results = get_closest_points("and god is good")

In [102]:
search_results[0].shape

(21, 1)

In [110]:
closest = search_results[0][0][0]

In [111]:
closest

15106

In [112]:
nodes, edges, sg = get_subgraph(bible_db, closest, close_points=5, levels=3)

In [113]:
nt = Network('800px', '800px', notebook=True)
nt.from_nx(sg)
nt.show('search-results.html')