# Bible Explore

Exploring [Kaggle Bible Corpus](https://www.kaggle.com/oswinrh/bible) for similarity between passages.

Some extra information at [OpenBible](http://www.openbible.info/) and [Bible Databases](https://github.com/scrollmapper/bible_databases)

The current work is started in English and might be extended to other languages later on

In [81]:
import scipy as sp
import numpy as np
import pandas as pd
import csv
import os
import sys
import re
from queue import Queue
from pathlib import Path
import seaborn as sns
import bokeh
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import networkx as nx
from pyvis.network import Network
import pickle
# for the dict upgrade
from collections import abc



%matplotlib inline

In [82]:
# https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists
# from scipy.spatial import distance
# scipy.spatial.distance.cosine

# or with numpy:
# from numpy import dot
# from numpy.linalg import norm

# cos_sim = dot(a, b)/(norm(a)*norm(b))

In [83]:
# this is for Tensorflow to use the good GPU instead of the first it sees
# os.environ["CUDA_VISIBLE_DEVICES"]="1"
# and this is to set tf to use the GPU

In [84]:
print(tf.__version__)
print(tf.config.list_physical_devices())


2.3.2
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:1', device_type='XLA_GPU')]


In [85]:
%%time
# loading data
# load chapters mappings
BASE_PATH_DB = Path("/home/leo/projects/AI/Datasets/text/religion/bible/kaggle-bible-corpus")
KEY_PATH_DB = BASE_PATH_DB / "key_english.csv"

corpus_path = BASE_PATH_DB / "t_asv.csv"
with open(corpus_path, newline='') as f:
    corpus = csv.reader(f)
#     corpus_db = [r for r in corpus]
    corpus_db = list(corpus)
    verses = [r[-1] for r in corpus_db[1:]]  # omit the first line; It's the header

CPU times: user 99.6 ms, sys: 3.91 ms, total: 104 ms
Wall time: 121 ms


In [86]:
verses[0]

'In the beginning God created the heavens and the earth.'

In [87]:
%%time
key_verse_map = {}

with open(KEY_PATH_DB, newline='') as f:
    rows = csv.reader(f)
    kvs = list(rows)
#     kvs = [r for r in rows]
    key_verse_map = { i[0] : i[1] for i in kvs[1:]}

CPU times: user 341 µs, sys: 62 µs, total: 403 µs
Wall time: 401 µs


In [88]:
key_verse_map

{'1': 'Genesis',
 '2': 'Exodus',
 '3': 'Leviticus',
 '4': 'Numbers',
 '5': 'Deuteronomy',
 '6': 'Joshua',
 '7': 'Judges',
 '8': 'Ruth',
 '9': '1 Samuel',
 '10': '2 Samuel',
 '11': '1 Kings',
 '12': '2 Kings',
 '13': '1 Chronicles',
 '14': '2 Chronicles',
 '15': 'Ezra',
 '16': 'Nehemiah',
 '17': 'Esther',
 '18': 'Job',
 '19': 'Psalms',
 '20': 'Proverbs',
 '21': 'Ecclesiastes',
 '22': 'Song of Solomon',
 '23': 'Isaiah',
 '24': 'Jeremiah',
 '25': 'Lamentations',
 '26': 'Ezekiel',
 '27': 'Daniel',
 '28': 'Hosea',
 '29': 'Joel',
 '30': 'Amos',
 '31': 'Obadiah',
 '32': 'Jonah',
 '33': 'Micah',
 '34': 'Nahum',
 '35': 'Habakkuk',
 '36': 'Zephaniah',
 '37': 'Haggai',
 '38': 'Zechariah',
 '39': 'Malachi',
 '40': 'Matthew',
 '41': 'Mark',
 '42': 'Luke',
 '43': 'John',
 '44': 'Acts',
 '45': 'Romans',
 '46': '1 Corinthians',
 '47': '2 Corinthians',
 '48': 'Galatians',
 '49': 'Ephesians',
 '50': 'Philippians',
 '51': 'Colossians',
 '52': '1 Thessalonians',
 '53': '2 Thessalonians',
 '54': '1 Timothy

In [89]:
# TF Universal Sentence Encoder models
#@title Load the Universal Sentence Encoder's TF Hub module
MODELS_BASE_PATH = "/home/leo/projects/AI/Datasets/Tensorflow/tf-hub/"
# module_url = os.path.join(BASE_PATH, "universal-sentence-encoder-lite_2")
module_url = os.path.join(MODELS_BASE_PATH, "universal-sentence-encoder-multilingual_3")
# module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# module_url = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'

In [90]:
%%time
model = hub.load(module_url)
print ("module %s loaded" % module_url)

def embed(input):
  return model(input)

module /home/leo/projects/AI/Datasets/Tensorflow/tf-hub/universal-sentence-encoder-multilingual_3 loaded
CPU times: user 2.53 s, sys: 364 ms, total: 2.89 s
Wall time: 4.15 s


In [91]:
# I don't get wht this consumes so much CPU and memory and is even slower than the full cpu version
# %%time

# with tf.device('/GPU:1'):
#     model = hub.load(module_url)
#     print ("module %s loaded" % module_url)

#     def embed(input):
#       return model(input)

#     bible_embeddings = embed(verses)

In [92]:
%%time
bible_embeddings = embed(verses)

CPU times: user 2min 24s, sys: 32.8 s, total: 2min 57s
Wall time: 40.8 s


In [93]:
bible_embeddings.shape

TensorShape([31103, 512])

In [94]:
%%time
# similarity by inner product
similarity_matrix_inner = np.inner(bible_embeddings, bible_embeddings)

CPU times: user 10.7 s, sys: 942 ms, total: 11.6 s
Wall time: 4.36 s


In [95]:
similarity_matrix_inner.shape

(31103, 31103)

In [96]:
def check_symmetry(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

the matrix must be symmetric

In [97]:
%%time
check_symmetry(similarity_matrix_inner)

CPU times: user 12.2 s, sys: 1.6 s, total: 13.8 s
Wall time: 13.8 s


True

In [98]:
# labels = 

In [99]:
# too big for my PC 
# sns.set(font_scale=1.2)
# g = sns.heatmap(
#   similarity_matrix_inner,
# #       xticklabels=labels,
# #       yticklabels=labels,
#   vmin=0,
#   vmax=1,
#   cmap="YlOrRd")
# g.set_xticklabels(labels, rotation=90)
# g.set_title("Semantic Textual Similarity")


In [100]:
%%time
# get the closest and farthest ~ N for each
# https://www.kite.com/python/answers/how-to-find-the-n-maximum-indices-of-a-numpy-array-in-python
# https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html
# https://numpy.org/doc/stable/reference/generated/numpy.partition.html

n = 21  # such as n>1 , when n==1 it shows only self-similarity
partitions = np.argpartition(similarity_matrix_inner, -n, axis=0)
closests = partitions[-n:]
farthest = partitions[:n]

CPU times: user 19.5 s, sys: 724 ms, total: 20.2 s
Wall time: 20.2 s


In [101]:
len(closests)
closests[0].shape

(31103,)

In [102]:
%%time
# now there are 2 arrays, each with verses number of elements, each array contains 
n_close = np.array(closests).transpose()
n_far = np.array(farthest).transpose()

CPU times: user 1.61 ms, sys: 32 µs, total: 1.65 ms
Wall time: 1.46 ms


In [103]:
similarity_matrix_inner.shape, n_close.shape, n_far.shape

((31103, 31103), (31103, 21), (31103, 21))

In [104]:
n_close[:2]

array([[15337, 31055, 26045, 19212, 24736, 16628, 13772, 26046, 15845,
        18579, 16474, 29973,   148,  5200, 28593,    31, 16625,    16,
           34, 15380,     0],
       [12522,  1910,   182,   190,    27, 14133, 19050,   194, 17829,
            8, 30527,  1928,   149,   176,   177, 25278,   178,     1,
           36,    37, 17530]])

In [105]:
# clean up memory ..
del(closests)
del(farthest)


Show the similarity values for a few points

In [106]:
similarity_matrix_inner[0][n_close[0][:20]]

array([0.5309429 , 0.53220767, 0.535607  , 0.54301083, 0.55235803,
       0.57052946, 0.55402035, 0.5800376 , 0.5822105 , 0.5618566 ,
       0.56626326, 0.57259107, 0.60005933, 0.5615088 , 0.5582243 ,
       0.6272632 , 0.65638304, 0.6330644 , 0.70625603, 0.64020944],
      dtype=float32)

In [107]:
similarity_matrix_inner[0][n_far[0]]

array([ 0.09364344,  0.08305697,  0.06654037,  0.12651277,  0.07592306,
        0.14878714,  0.13647093,  0.13357072,  0.05880683, -0.02358215,
        0.06586871,  0.11249685,  0.14682116,  0.0791003 ,  0.10768902,
        0.11913791,  0.13447869,  0.15217146,  0.11318401,  0.13786386,
        0.05275048], dtype=float32)

In [108]:
# n_close_distance = np.take_along_axis(similarity_matrix_inner, n_close, axis=0)  #this does not work

In [109]:
# n_close_distance = similarity_matrix_inner[tuple(n_close)]  # this does not work either
# n_close_distance = similarity_matrix_inner[n_close]  # this does not work either, it overloads the memory

In [110]:
# tnclose = tuple(n_close)  

In [111]:
# n_far_distance =  similarity_matrix_inner[n_far]  # Nope, this overloads the memory

extract similarity values for the close elements (this will be useful for displaying later the 

In [112]:
%%time
# somehow this is not sorted as it should
close_matrix = np.array([similarity_matrix_inner[i][n_close[i]] for i in range(similarity_matrix_inner.shape[0])])
far_matrix = np.array([similarity_matrix_inner[i][n_far[i]] for i in range(similarity_matrix_inner.shape[0])])

CPU times: user 85 ms, sys: 1.01 ms, total: 86 ms
Wall time: 90.3 ms


In [113]:
close_matrix.shape, far_matrix.shape

((31103, 21), (31103, 21))

In [114]:
close_matrix[0]

array([0.5309429 , 0.53220767, 0.535607  , 0.54301083, 0.55235803,
       0.57052946, 0.55402035, 0.5800376 , 0.5822105 , 0.5618566 ,
       0.56626326, 0.57259107, 0.60005933, 0.5615088 , 0.5582243 ,
       0.6272632 , 0.65638304, 0.6330644 , 0.70625603, 0.64020944,
       1.0000001 ], dtype=float32)

In [115]:
# now I need to understand many MANY things, 
# compute the labels and IDs for each row
# give the text for each row
# create a networkx connection graph with it


Compute Human Readable Labels

In [116]:
corpus_db[0]

['id', 'b', 'c', 'v', 't']

In [117]:
corpus_db[11000:11003]

[['13023016', '13', '23', '16', 'The sons of Gershom: Shebuel the chief.'],
 ['13023017',
  '13',
  '23',
  '17',
  'And the sons of Eliezer were: Rehabiah the chief; and Eliezer had no other sons; but the sons of Rehabiah were very many.'],
 ['13023018', '13', '23', '18', 'The sons of Izhar: Shelomith the chief.']]

In [118]:
# https://stackoverflow.com/a/3233356/4099701

# def map_update(d, u):
#     for k, v in u.items():
#         if isinstance(v, abc.Mapping):
#             d[k] = update(d.get(k, {}), v)
#         else:
#             d[k] = v
#     return d


def deep_update(d, u, depth=-1):
    """
    Recursively merge or update dict-like objects. 
    >>> update({'k1': {'k2': 2}}, {'k1': {'k2': {'k3': 3}}, 'k4': 4})
    {'k1': {'k2': {'k3': 3}}, 'k4': 4}
    """

    for k, v in u.items():
        if isinstance(v, abc.Mapping) and not depth == 0:
            r = deep_update(d.get(k, {}), v, depth=max(depth - 1, -1))
            d[k] = r
        elif isinstance(d, abc.Mapping):
            d[k] = u[k]
        else:
            d = {k: u[k]}
    return d

In [119]:
%%time
# db contains all the information AND the embeddings, this also contains the graph information
bible_db = {}
book_idx = {}

graph_dict = {}

for i in range(1, len(corpus_db)-1):
    verse = corpus_db[i]
#     k_id = int(verse[0])
    v_idx = int(i-1)
    val = {
        'index':v_idx,
        'id': int(verse[0]),
        'name': f"{key_verse_map[verse[1]]} {verse[2]}:{verse[3]}",
        'book_id': int(verse[1]),
        'chapter_id': int(verse[2]),
        'verse_id': int(verse[3]),
        'text': verse[4],
#         'embedding': bible_embeddings[i],
        'close_to': n_close[i],  # ids
        'close_to_distance': close_matrix[i], 
#         'far_from': n_far[i],  # ids
#         'far_from_distance': far_matrix[i], 
    }
    b_idx = { key_verse_map[verse[1]]: {int(verse[2]): {int(verse[3]): {
        'index':v_idx,
        'id': int(verse[0]),
        'name': f"{key_verse_map[verse[1]]} {verse[2]}:{verse[3]}",
    } }} }
    book_idx = deep_update(book_idx, b_idx)
    
    bible_db[v_idx] = val
    # now compute the graph for networkx -> this is missing the types, verses, titles, annotations, and other nice things
    graph_dict[v_idx] = {int(k):1/v  for k,v in zip(n_close[i], close_matrix[i]) if k != v_idx}
    
    
    
    

CPU times: user 2.23 s, sys: 33.4 ms, total: 2.27 s
Wall time: 2.27 s


In [120]:
book_keys = {v.lower():int(k) for k,v in key_verse_map.items()}

bible_db_all = {
    'db': bible_db,
    'book': book_idx,
    'book2key': book_keys,
}

In [121]:
book_keys

{'genesis': 1,
 'exodus': 2,
 'leviticus': 3,
 'numbers': 4,
 'deuteronomy': 5,
 'joshua': 6,
 'judges': 7,
 'ruth': 8,
 '1 samuel': 9,
 '2 samuel': 10,
 '1 kings': 11,
 '2 kings': 12,
 '1 chronicles': 13,
 '2 chronicles': 14,
 'ezra': 15,
 'nehemiah': 16,
 'esther': 17,
 'job': 18,
 'psalms': 19,
 'proverbs': 20,
 'ecclesiastes': 21,
 'song of solomon': 22,
 'isaiah': 23,
 'jeremiah': 24,
 'lamentations': 25,
 'ezekiel': 26,
 'daniel': 27,
 'hosea': 28,
 'joel': 29,
 'amos': 30,
 'obadiah': 31,
 'jonah': 32,
 'micah': 33,
 'nahum': 34,
 'habakkuk': 35,
 'zephaniah': 36,
 'haggai': 37,
 'zechariah': 38,
 'malachi': 39,
 'matthew': 40,
 'mark': 41,
 'luke': 42,
 'john': 43,
 'acts': 44,
 'romans': 45,
 '1 corinthians': 46,
 '2 corinthians': 47,
 'galatians': 48,
 'ephesians': 49,
 'philippians': 50,
 'colossians': 51,
 '1 thessalonians': 52,
 '2 thessalonians': 53,
 '1 timothy': 54,
 '2 timothy': 55,
 'titus': 56,
 'philemon': 57,
 'hebrews': 58,
 'james': 59,
 '1 peter': 60,
 '2 peter'

In [122]:
book_idx

{'Genesis': {1: {1: {'index': 0, 'id': 1001001, 'name': 'Genesis 1:1'},
   2: {'index': 1, 'id': 1001002, 'name': 'Genesis 1:2'},
   3: {'index': 2, 'id': 1001003, 'name': 'Genesis 1:3'},
   4: {'index': 3, 'id': 1001004, 'name': 'Genesis 1:4'},
   5: {'index': 4, 'id': 1001005, 'name': 'Genesis 1:5'},
   6: {'index': 5, 'id': 1001006, 'name': 'Genesis 1:6'},
   7: {'index': 6, 'id': 1001007, 'name': 'Genesis 1:7'},
   8: {'index': 7, 'id': 1001008, 'name': 'Genesis 1:8'},
   9: {'index': 8, 'id': 1001009, 'name': 'Genesis 1:9'},
   10: {'index': 9, 'id': 1001010, 'name': 'Genesis 1:10'},
   11: {'index': 10, 'id': 1001011, 'name': 'Genesis 1:11'},
   12: {'index': 11, 'id': 1001012, 'name': 'Genesis 1:12'},
   13: {'index': 12, 'id': 1001013, 'name': 'Genesis 1:13'},
   14: {'index': 13, 'id': 1001014, 'name': 'Genesis 1:14'},
   15: {'index': 14, 'id': 1001015, 'name': 'Genesis 1:15'},
   16: {'index': 15, 'id': 1001016, 'name': 'Genesis 1:16'},
   17: {'index': 16, 'id': 1001017, 'n

In [123]:
bible_db

{0: {'index': 0,
  'id': 1001001,
  'name': 'Genesis 1:1',
  'book_id': 1,
  'chapter_id': 1,
  'verse_id': 1,
  'text': 'In the beginning God created the heavens and the earth.',
  'close_to': array([12522,  1910,   182,   190,    27, 14133, 19050,   194, 17829,
             8, 30527,  1928,   149,   176,   177, 25278,   178,     1,
            36,    37, 17530]),
  'close_to_distance': array([0.5699584 , 0.57395244, 0.57443845, 0.58023584, 0.581571  ,
         0.5923978 , 0.58679384, 0.59218955, 0.60516644, 0.6061335 ,
         0.6592343 , 0.61467254, 0.6173086 , 0.61120015, 0.6820502 ,
         0.6069061 , 0.6307169 , 0.99999994, 0.60630476, 0.60825175,
         0.6166296 ], dtype=float32)},
 1: {'index': 1,
  'id': 1001002,
  'name': 'Genesis 1:2',
  'book_id': 1,
  'chapter_id': 1,
  'verse_id': 2,
  'text': 'And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.',
  'close_to': array([31086, 26052,   

In [124]:
graph_dict

{0: {12522: 1.7545140481149177,
  1910: 1.742304651915337,
  182: 1.7408305367856611,
  190: 1.7234371491304032,
  27: 1.7194805610085413,
  14133: 1.6880548590139681,
  19050: 1.7041760358574256,
  194: 1.6886485067575265,
  17829: 1.6524379770014954,
  8: 1.6498015140374893,
  30527: 1.5169113958660108,
  1928: 1.6268824980649685,
  149: 1.6199352690813307,
  176: 1.636125242667982,
  177: 1.4661676791136236,
  25278: 1.6477013060543428,
  178: 1.5854973419139542,
  1: 1.0000000596046483,
  36: 1.6493355456579002,
  37: 1.6440560988580732,
  17530: 1.6217190987081318},
 1: {31086: 1.8963215881288202,
  26052: 1.8911938930678158,
  15: 1.8576479978971212,
  14: 1.7016540022179942,
  16: 1.8513784961464963,
  17: 1.622104346075669,
  15807: 1.8346428200799219,
  400: 1.8323899728263244,
  26616: 1.7006004582044072,
  8631: 1.8154572557136608,
  29312: 1.772800807466188,
  14447: 1.8543874206131348,
  26053: 1.8087359408895842,
  22772: 1.8903004397649554,
  18822: 1.7438276654380238,
 

In [125]:
bible_db[1000]

{'index': 1000,
 'id': 1034020,
 'name': 'Genesis 34:20',
 'book_id': 1,
 'chapter_id': 34,
 'verse_id': 20,
 'text': 'And Hamor and Shechem his son came unto the gate of their city, and communed with the men of their city, saying,',
 'close_to': array([19699, 19101,  5109, 17930,   996,  5185,  3514, 11800, 19309,
        12387, 12534, 19753,  3310, 22481, 12535, 12239, 12249, 19339,
        12579,  6574,  1001]),
 'close_to_distance': array([0.5899422 , 0.5920398 , 0.5913451 , 0.59335846, 0.6044415 ,
        0.5945562 , 0.5993583 , 0.62606585, 0.61219525, 0.6091775 ,
        0.5973339 , 0.61750627, 0.5975029 , 0.628181  , 0.6604283 ,
        0.68746686, 0.66101223, 0.68151104, 0.71506846, 0.6595216 ,
        0.99999976], dtype=float32)}

In [126]:
# save the DB

BIBLE_DB_PATH = "../db/bible-db.pkl"
with open(BIBLE_DB_PATH, 'wb') as f:
    pickle.dump(bible_db_all, f, pickle.HIGHEST_PROTOCOL)
    
## kind of big, 86 mb

In [127]:
BIBLE_EMBEDDINGS_PATH = "../db/bible-embeddings.pkl"
with open(BIBLE_EMBEDDINGS_PATH, 'wb') as f:
    pickle.dump(bible_embeddings, f, pickle.HIGHEST_PROTOCOL)
    

In [128]:
# function to get the subgraph 

def _get_edges(n_index, node, close_points):
    """
    n_index: node index in the encoding matrix
    node: the node in the db
    close_points: the number of close points to get
    """
    nd_weight = zip(node['close_to'], node['close_to_distance'])
    nd_edges = []
    for n,w in nd_weight:
        if n != n_index:
            # distance is farthest the smaller it is, so changing it to make the reverse relation
            nd_edges.append((n_index, n, 1/w))
        if len(nd_edges) >= close_points:
            break
    return nd_edges


def get_subgraph(bible_db, node_id, close_points=5, levels=2):
    """
    Get the subgraph from a node id
    bible_db: the entire bible database in a dict
    node_id: the node id
    close_points: the number of close points from each, => each point will have at most close_points outgoing edges
    levels: number of levels to go in depth for connections
    returns a networkx graph of the subgraph from the complete db centering the subgraph in the given node_id
    """
    g = nx.Graph()
    if node_id not in bible_db:
        # there is no graph to build
        return g
    # Recursive is SO intuitive, but will explode the stack and memory for big graphs
    nid = node_id
    nodes_to_add = []  # (node_id, group, size, label, title, txt)
    edges_to_add = []  # (node_id, node_id, weight)
    discovered = set()
    q = Queue()
    q.put((node_id, 0))  # keep (node, depth from center in levels)
    cnt = 0
    # this tree/ graph transversal is not WOW HOW EFFICIENT, but it works well enough
    while not q.empty() and cnt < levels+1:
        node_id, lvl = q.get()
        discovered.add(node_id)
        node = bible_db[node_id]
        nodes_to_add.append((node, lvl))
        # only add the edges if the level is not the max
        if lvl < levels:
            nd_edges = _get_edges(node_id, node, close_points)
            for edg in nd_edges:
                edges_to_add.append((edg, lvl))
                sn, en, w = edg
                if en not in discovered:
                    q.put((en, lvl+1))
        cnt = lvl
    # TODO
    for node, lvl in nodes_to_add:
#         if lvl < levels:
# #             g.add_node(nid)
#         g.add_node(node['index'], size=20, group=node['book_id'], title=node['name'], data=node['text'])
        g.add_node(node['index'], size=60/(lvl+1), group=node['book_id'])
#         g.add_node(int(nid), size=20, group=node['book_id'], title=node['name'], data=node['text'])
    
    for edg, lvl in edges_to_add:
        (sn, en, w) = edg
#         print(edg)
        # pyvis complains that this are not int fields!! (but they are)
        g.add_edge(int(sn), int(en), weight=w)
#         g.add_edge(sn, en)

    return nodes_to_add, edges_to_add, g

In [129]:
%%time
nodes, edges, sg = get_subgraph(bible_db, 31007, close_points=2, levels=3)

CPU times: user 422 µs, sys: 78 µs, total: 500 µs
Wall time: 509 µs


In [130]:
nodes2, edges2, sg2 = get_subgraph(bible_db, 1024, close_points=3, levels=2)

In [131]:
nodes3, edges3, sg3 = get_subgraph(bible_db, 11000, close_points=2, levels=3)

In [132]:
# cg3 = nx.compose(sg,sg2)
cg3=nx.Graph()
cg3.add_edges_from(sg.edges(data=True))
cg3.add_edges_from(sg2.edges(data=True))
cg3.add_edges_from(sg3.edges(data=True))
cg3.add_nodes_from(sg.nodes(data=True)) #deals with isolated nodes
cg3.add_nodes_from(sg2.nodes(data=True))
cg3.add_nodes_from(sg3.nodes(data=True))
cg3.add_edge(6674,3904)
# cg3.add_edge(11237,9468)

In [133]:
# [(n[0]['index'], n[1]) for n in nodes]

In [134]:
# edges

In [135]:
type(sg)

networkx.classes.graph.Graph

In [136]:
# sg.nodes[127]

In [137]:
# options = {
#     'node_color': 'yellow',
#     'node_size': 600,
# #     'width': 3,
# #     'arrowstyle': '-|>',
# #     'arrowsize': 12,
# }
# nx.draw_networkx(sg, **options)
# plt.show()

In [138]:
sg.graph

{}

In [139]:
options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -8892,
      "springLength": 405,
      "springConstant": 0.06,
      "damping": 0.04,
      "avoidOverlap": 1
    },
    "minVelocity": 0.75,
    "timestep": 0.12
  }
}

In [140]:
%%time
nt = Network(height='400px', width='50%', bgcolor='#2222', font_color='white', notebook=True)
nt.from_nx(sg3)
# nt.barnes_hut()
# nt.force_atlas_2based()
nt.repulsion()
# nt.repulsion
# nt.enable_physics(True)


CPU times: user 6.83 ms, sys: 525 µs, total: 7.36 ms
Wall time: 7.21 ms


In [141]:
%%time
nt.show('background-right.html')

CPU times: user 576 µs, sys: 106 µs, total: 682 µs
Wall time: 621 µs


In [142]:
def get_closest_points(txt, n=5, algorithm='inner'):
    """
    txt: the text to look for similarities
    n: the number of closest matches that will be searched
    algorithm: inner|cosine  # the algorithm to determine how the proximity is computed
    returns the closest n points to the input text based on the proximity algorithm
    """
    # TODO
    # compute input embedding 
    embd = embed([txt])
    # compute proximity with all the existing points
    similarity = np.inner(bible_embeddings, embd)
#     print(similarity.shape)
    # get the closest n points ids
    # such as n>1 , when n==1 it shows only self-similarity
    partitions = np.argpartition(similarity, -n, axis=0)
#     print(partitions.shape)
    n_close = partitions[-n:]
    n_far = partitions[:n]
    # needs a complete matrix
    return n_close, n_far


In [143]:
%%time
search_results = get_closest_points("and god is good")

CPU times: user 28 ms, sys: 41.3 ms, total: 69.3 ms
Wall time: 19.3 ms


In [144]:
search_results[0].shape

(5, 1)

In [145]:
closest = search_results[0][0][0]

In [146]:
closest

15021

In [147]:
%%time
nodes, edges, sg2 = get_subgraph(bible_db, closest, close_points=5, levels=3)

CPU times: user 4.69 ms, sys: 7.36 ms, total: 12 ms
Wall time: 2.56 ms


In [148]:
len(nodes)

156

In [149]:
%%time
nt = Network('800px', '800px', notebook=True)
nt.from_nx(sg2)
nt.show('search-results.html')

CPU times: user 12.2 ms, sys: 7.28 ms, total: 19.5 ms
Wall time: 15.6 ms


try to build an entire graph directly in networkX instead fo recomputing it each time.

And later use this graph to do the search, might be easier and faster than redoing it all each time

In [150]:
%%time
graph_db = nx.Graph(graph_dict)

CPU times: user 1.13 s, sys: 35.9 ms, total: 1.16 s
Wall time: 1.2 s


In [151]:
# %%time

# gnt = Network('800px', '1200px', notebook=True)
# gnt.from_nx(graph_db)
# gnt.show('complete_bible_graph.html')

Use the NetworkX complete graph to do search and transversals, they should be much more optimized than my ad-hoc code

In [152]:
%%time
#this should work but does not restrict to a max n_closests from each node
cg = nx.ego_graph(graph_db, int(closest), radius=2)  #try if this works, if not go for the next two lines
# center_node = G[closest]
# nx.ego_graph(graph_db, center_node, radius=3)
# todo now trim this subgraph to the closest only

CPU times: user 29.3 ms, sys: 0 ns, total: 29.3 ms
Wall time: 29.5 ms


In [153]:
len(cg.nodes)

763

In [154]:
# %%time

# gnt = Network('650px', '1000px', notebook=True)
# gnt.from_nx(cg)
# gnt.show('restricted_graph_from_nx_search.html')

### NetworkX subgraph computation VS manual algorithm

advantage of networkx search:
- one line of code, 
- already implemented
- someone else maintains it

problems of networkx search
- coloring still not done I have to find how this works
- loops are computed, this might be the main problem -> take out the loops! -> even taking teh loops out this is slower

Advantage of manual approach: amazingly it seems to be much faster!! (3->5 times!)

### Some random graphs

for the website

In [155]:
g1 = nx.fast_gnp_random_graph(15, 0.2)

In [156]:
%%time
nt = Network('400px', '400px', notebook=True)
nt.from_nx(g1)
nt.show('rand.html')

CPU times: user 8.25 ms, sys: 0 ns, total: 8.25 ms
Wall time: 8.03 ms


In [157]:
# ga = nx.graph_atlas_g()

In [158]:
# options = {
#     'node_color': 'yellow',
#     'node_size': 600,
#     'width': 3,
#     'arrowstyle': '-|>',
#     'arrowsize': 12,
# }
# for g in ga:
#     nx.draw_networkx(g, **options)
#     plt.show()

In [159]:
got_net = Network(height='450px', width='100%', bgcolor='#222222', font_color='white', notebook=True)

# set the physics layout of the network
got_net.barnes_hut()
got_data = pd.read_csv('https://www.macalester.edu/~abeverid/data/stormofswords.csv')

sources = got_data['Source']
targets = got_data['Target']
weights = got_data['Weight']

edge_data = zip(sources, targets, weights)

for e in edge_data:
    src = e[0]
    dst = e[1]
    w = e[2]

    got_net.add_node(src, src, title=src)
    got_net.add_node(dst, dst, title=dst)
    got_net.add_edge(src, dst, value=w)

neighbor_map = got_net.get_adj_list()

# add neighbor data to node hover data
for node in got_net.nodes:
    node['title'] += ' Neighbors:<br>' + '<br>'.join(neighbor_map[node['id']])
    node['value'] = len(neighbor_map[node['id']])
got_net.show_buttons(filter_=['physics'])
got_net.show('gameofthrones.html')

In [160]:
got_net.show_buttons(filter_=['physics'])
