# Bible Explore

Exploring [Kaggle Bible Corpus](https://www.kaggle.com/oswinrh/bible) for similarity between passages.

Some extra information at [OpenBible](http://www.openbible.info/) and [Bible Databases](https://github.com/scrollmapper/bible_databases)

The current work is started in English and might be extended to other languages later on

In [1]:
import scipy as sp
import numpy as np
import pandas as pd
import csv
import os
import sys
import re
import seaborn as sns
import bokeh
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import networkx as nx

%matplotlib inline

In [2]:
# https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists
# from scipy.spatial import distance
# scipy.spatial.distance.cosine

# or with numpy:
# from numpy import dot
# from numpy.linalg import norm

# cos_sim = dot(a, b)/(norm(a)*norm(b))

In [3]:
# this is for Tensorflow to use the good GPU instead of the first it sees
# os.environ["CUDA_VISIBLE_DEVICES"]="1"
# and this is to set tf to use the GPU

In [4]:
print(tf.__version__)
print(tf.config.list_physical_devices())


2.3.2
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:1', device_type='XLA_GPU')]


In [5]:
# loading data
corpus_path ="/home/leo/projects/AI/Datasets/text/religion/bible/kaggle-bible-corpus/t_asv.csv"
with open(corpus_path, newline='') as f:
    corpus = csv.reader(f)
    corpus_db = [r for r in corpus]
    verses = [r[-1] for r in corpus_db[1:]]  # omit the first line; It's the header


In [6]:
verses[0]

'In the beginning God created the heavens and the earth.'

In [7]:
# TF Universal Sentence Encoder models
#@title Load the Universal Sentence Encoder's TF Hub module
BASE_PATH = "/home/leo/projects/AI/Datasets/text/religion/bible/tf-hub"
# module_url = os.path.join(BASE_PATH, "universal-sentence-encoder-lite_2")
module_url = os.path.join(BASE_PATH, "universal-sentence-encoder-multilingual_3")
# module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# module_url = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'

In [8]:
model = hub.load(module_url)
print ("module %s loaded" % module_url)

def embed(input):
  return model(input)

module /home/leo/projects/AI/Datasets/text/religion/bible/tf-hub/universal-sentence-encoder-multilingual_3 loaded


In [9]:
# I don't get wht this consumes so much CPU and memory and is even slower than the full cpu version
# %%time

# with tf.device('/GPU:1'):
#     model = hub.load(module_url)
#     print ("module %s loaded" % module_url)

#     def embed(input):
#       return model(input)

#     bible_embeddings = embed(verses)

In [10]:
%%time

bible_embeddings = embed(verses)

CPU times: user 2min 30s, sys: 33.7 s, total: 3min 3s
Wall time: 30.9 s


In [11]:
# bible_embeddings[0]

In [12]:
%%time
# similarity by inner product
similarity_matrix_inner = np.inner(bible_embeddings, bible_embeddings)

CPU times: user 11.7 s, sys: 1.1 s, total: 12.8 s
Wall time: 3.18 s


In [13]:
similarity_matrix_inner.shape

(31103, 31103)

In [14]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

the matrix must be symmetric

In [15]:
check_symmetric(similarity_matrix_inner)

True

In [16]:
# labels = 

In [17]:
# too big for my PC 
# sns.set(font_scale=1.2)
# g = sns.heatmap(
#   similarity_matrix_inner,
# #       xticklabels=labels,
# #       yticklabels=labels,
#   vmin=0,
#   vmax=1,
#   cmap="YlOrRd")
# g.set_xticklabels(labels, rotation=90)
# g.set_title("Semantic Textual Similarity")


In [18]:
%%time
# get the closest and farthest ~ N for each
# https://www.kite.com/python/answers/how-to-find-the-n-maximum-indices-of-a-numpy-array-in-python
# https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html
# https://numpy.org/doc/stable/reference/generated/numpy.partition.html

n = 21  # such as n>1 , when n==1 it shows only self-similarity
partitions = np.argpartition(similarity_matrix_inner, -n, axis=0)
closests = partitions[-n:]
farthest = partitions[:n]

CPU times: user 13.9 s, sys: 600 ms, total: 14.5 s
Wall time: 14.6 s


In [19]:
len(closests)
closests[0].shape

(31103,)

In [20]:
%%time
# now there are 2 arrays, each with verses number of elements, each array contains 
n_close = np.array(closests).transpose()
n_far = np.array(farthest).transpose()

CPU times: user 0 ns, sys: 3.11 ms, total: 3.11 ms
Wall time: 2.91 ms


In [21]:
similarity_matrix_inner.shape, n_close.shape, n_far.shape

((31103, 31103), (31103, 20), (31103, 20))

In [22]:
# clean up memory ..
del(closests)
del(farthest)


Show the similarity values for a few points

In [23]:
similarity_matrix_inner[0][n_close[0][:20]]

array([0.53220767, 0.535607  , 0.54301083, 0.55235803, 0.57052946,
       0.55402035, 0.5800376 , 0.5822105 , 0.5618566 , 0.56626326,
       0.57259107, 0.60005933, 0.5615088 , 0.5582243 , 0.6272632 ,
       0.65638304, 0.6330644 , 0.70625603, 0.64020944, 1.0000001 ],
      dtype=float32)

In [24]:
similarity_matrix_inner[0][n_far[0]]

array([ 0.09364344,  0.08305697,  0.06654037,  0.12651277,  0.07592306,
        0.14878714,  0.13647093,  0.13357072,  0.05880683, -0.02358215,
        0.06586871,  0.11249685,  0.14682116,  0.0791003 ,  0.10768902,
        0.11913791,  0.13447869,  0.15217146,  0.11318401,  0.13786386],
      dtype=float32)

In [25]:
# n_close_distance = np.take_along_axis(similarity_matrix_inner, n_close, axis=0)  #this does not work

In [26]:
# n_close_distance = similarity_matrix_inner[tuple(n_close)]  # this does not work either
# n_close_distance = similarity_matrix_inner[n_close]  # this does not work either, it overloads the memory

In [27]:
# tnclose = tuple(n_close)  

In [28]:
# n_far_distance =  similarity_matrix_inner[n_far]  # Nope, this overloads the memory

extract similarity values for the close elements (this will be useful for displaying later the 

In [33]:
%%time
close_matrix = np.array([similarity_matrix_inner[i][n_close[i]] for i in range(similarity_matrix_inner.shape[0])])
far_matrix = np.array([similarity_matrix_inner[i][n_far[i]] for i in range(similarity_matrix_inner.shape[0])])

CPU times: user 77.8 ms, sys: 0 ns, total: 77.8 ms
Wall time: 77.9 ms


In [34]:
close_matrix.shape, far_matrix.shape

((31103, 20), (31103, 20))

In [35]:
close_matrix[0]

array([0.53220767, 0.535607  , 0.54301083, 0.55235803, 0.57052946,
       0.55402035, 0.5800376 , 0.5822105 , 0.5618566 , 0.56626326,
       0.57259107, 0.60005933, 0.5615088 , 0.5582243 , 0.6272632 ,
       0.65638304, 0.6330644 , 0.70625603, 0.64020944, 1.0000001 ],
      dtype=float32)

In [30]:
# now I need to understand many MANY things, 
# compute the labels and IDs for each row
# give the text for each row
# create a networkx connection graph with it


Compute Human Readable Labels

In [37]:
corpus_db[:3]

[['id', 'b', 'c', 'v', 't'],
 ['1001001',
  '1',
  '1',
  '1',
  'In the beginning God created the heavens and the earth.'],
 ['1001002',
  '1',
  '1',
  '2',
  'And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.']]