# Playing with the DataSet

### Creating Sample File

In [4]:
import pandas as pd
import random
import csv

In [5]:
# Draw a reproducible random sample of verses and write it out as a smaller CSV.
# The embeddings saved later in this notebook are matched to the sample rows purely
# by position, so the sample must not change between runs.
SAMPLE_SIZE = 100  # number of verses to draw
SAMPLE_SEED = 42   # fixed seed so every run selects the same verses

bible_data = pd.read_csv('datasets/bible_data_set.csv')
size = len(bible_data)

# A dedicated seeded generator leaves the global `random` state untouched.
# Capping at `size` avoids the ValueError random.sample raises when asked for
# more items than the population contains.
random_indices = random.Random(SAMPLE_SEED).sample(range(size), min(SAMPLE_SIZE, size))

sample_data = bible_data.iloc[random_indices]
# index=False: the original row numbers are not written to the sample file
sample_data.to_csv('datasets/sample_bible_data.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

In [6]:
# Reload the sample from disk: the file was written with index=False, so the
# reloaded frame is indexed 0..n-1 (see the printed preview), which is what the
# integer lookups such as sample_data['citation'][i] further down rely on.
sample_data = pd.read_csv('datasets/sample_bible_data.csv')
print(sample_data.head(n=5))

         citation      book  chapter  verse  \
0    Hebrews 3:14   Hebrews        3     14   
1   Matthew 13:11   Matthew       13     11   
2  Proverbs 18:16  Proverbs       18     16   
3      James 2:11     James        2     11   
4   Matthew 15:31   Matthew       15     31   

                                                text  
0  For we are made partakers of Christ, if we hol...  
1  He answered and said unto them, Because it is ...  
2  A man's gift maketh room for him, and bringeth...  
3  For he that said, Do not commit adultery, said...  
4  Insomuch that the multitude wondered, when the...  


### Creating sentence embeddings

We are not going to worry about efficiency here; the goal is simply to get something working.
Because the SentenceTransformer library already exists, we will not bother using DeBERTa directly. If we need more customization, we can always add it later.

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model = SentenceTransformer('all-MiniLM-L12-v2')

# encode() is documented to take a string or a list of strings, so hand it a
# plain list rather than a pandas Series. The list order is the row order of
# sample_data, so embeddings[i] belongs to the i-th row of the sample.
verse_texts = sample_data['text'].tolist()
embeddings = np.array(model.encode(verse_texts))

# saving the numbers so it can be loaded up easily later
np.save('datasets/sample_bible_embeddings.npy', embeddings)

  return forward_call(*args, **kwargs)


In [16]:
# Quick sanity check on the first two embeddings. Note that the score is printed
# NEGATED, and that only the citation of row 1 (not row 0) is shown.
print(
    -float(model.similarity(embeddings[0], embeddings[1])),
    sample_data['citation'][1],
    sep='\n',
)

-0.31170448660850525
Matthew 13:11


### Using sentence embeddings to compare verses with each other

In [35]:
import heapq
from collections import defaultdict

In [None]:
# Containers filled by the comparison loop:
#   verses_top  - one heap per citation with the verses most similar to it;
#                 defaultdict(list) gives every verse its own separate list
#   overall_top - one heap with the most similar verse pairs overall
# The comparison loop trims each of these heaps to 10 entries.
verses_top, overall_top = defaultdict(list), list()

print(verses_top)

# bring back the embeddings that were written to disk after encoding
embeddings = np.load('datasets/sample_bible_embeddings.npy')

defaultdict(<class 'list'>, {})


In [37]:
TOP_K = 10  # number of best matches kept in every heap


def _push_bounded(heap, item, limit=TOP_K):
    """Add `item` to a min-heap while keeping only the `limit` largest items.

    Args:
        heap: list maintained as a heapq min-heap; modified in place.
        item: tuple whose first element is the similarity score.
        limit: maximum number of items the heap may hold.
    """
    if len(heap) < limit:
        heapq.heappush(heap, item)
    else:
        # Push the new item and drop the smallest one in a single step.
        heapq.heappushpop(heap, item)


# Start from empty containers so that re-running this cell does not push every
# pair into the heaps a second time.
verses_top = defaultdict(list)
overall_top = []
greatest_similarity = -1.0

# Work positionally: row i of `embeddings` belongs to the i-th row of
# sample_data, whatever the DataFrame's index labels happen to be.
citations = sample_data['citation'].tolist()
assert len(citations) == len(embeddings), "embeddings do not match sample_data"

# One call yields the full pairwise score matrix instead of one call per pair.
similarity_matrix = model.similarity(embeddings, embeddings)

# comparing each verse with every other verse (each unordered pair once)
for i in range(len(citations)):

    verse1 = citations[i]

    for j in range(i + 1, len(citations)):
        similarity = float(similarity_matrix[i][j])

        verse2 = citations[j]

        greatest_similarity = max(greatest_similarity, similarity)

        # best pairs overall
        _push_bounded(overall_top, (similarity, verse1, verse2))

        # the pair counts towards the best matches of both verses
        _push_bounded(verses_top[verse1], (similarity, verse2))
        _push_bounded(verses_top[verse2], (similarity, verse1))

        

In [13]:
import pandas as pd

In [43]:
# Print the best score found, then every pair kept in overall_top from most to
# least similar, each citation followed by its verse text.
print(greatest_similarity)


def _text_of(citation):
    """Return the text of the first sample_data row with the given citation."""
    matching_texts = sample_data.loc[sample_data['citation'] == citation, 'text']
    return matching_texts.values[0]


for score, first_citation, second_citation in sorted(overall_top)[::-1]:
    print(score)
    print(f"{first_citation}  {_text_of(first_citation)}")
    print(f"{second_citation}  {_text_of(second_citation)}")
    print()


0.6832425594329834
0.6832425594329834
Matthew 21:20  And when the disciples saw it, they marvelled, saying, How soon is the fig tree withered away! 

John 1:48  Nathanael saith unto him, Whence knowest thou me? Jesus answered and said unto him, Before that Philip called thee, when thou wast under the fig tree, I saw thee. 


0.6162037253379822
Ezekiel 22:15  And I will scatter thee among the heathen, and disperse thee in the countries, and will consume thy filthiness out of thee. 

2 Samuel 22:50  Therefore I will give thanks unto thee, O LORD, among the heathen, and I will sing praises unto thy name. 


0.6031732559204102
Ezekiel 22:15  And I will scatter thee among the heathen, and disperse thee in the countries, and will consume thy filthiness out of thee. 

Exodus 34:24  For I will cast out the nations before thee, and enlarge thy borders: neither shall any man desire thy land, when thou shalt go up to appear before the LORD thy God thrice in the year. 


0.5913217067718506
Levitic