In [7]:
import pandas as pd
import numpy as np
import nltk 
import spacy
import textdistance
import re
import math

from nltk import tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.util import ngrams

# nltk.download('wordnet')

# Raw cross-reference table.
full_cr = pd.read_csv('../DirtyData/full_cross_ref.csv')
# final_cr is copied *before* the fill below, so it keeps real NaN values
# while full_cr carries the 'NA' placeholder in every empty cell.
final_cr = full_cr.copy()
full_cr = full_cr.fillna('NA')

bd = pd.read_csv('../CleanData/Bible_Dictionary.csv')

# De-duplicated English stop words; process_verses() drops these from each verse.
stop_words = [*set(stopwords.words('english'))]

## Step 1: Clean scripture verses (Pre-Processing)

In [8]:
def process_verses(verse_column, stop_word_list=None):
    """Normalise a column of verse (or term) text for word-level comparison.

    Each string entry is processed in this order: hyphens become spaces,
    every character other than ASCII letters, digits and whitespace is
    removed, the text is lower-cased, stop words are dropped, digits are
    removed, and a few spelling variants are unified (woe -> wo,
    shew -> show, vail(s) -> veil(s)).

    Parameters
    ----------
    verse_column : iterable
        Verse strings. The sentinel 'NA' and any non-string value
        (e.g. NaN or None) are treated as missing.
    stop_word_list : iterable of str, optional
        Words to drop. When omitted, the module-level ``stop_words`` is used.

    Returns
    -------
    list of str
        One cleaned string per input entry, in the same order; missing
        entries are returned as 'NA'.
    """
    # A set gives constant-time membership checks; the default is the
    # notebook-level stop word list.
    excluded = set(stop_words if stop_word_list is None else stop_word_list)

    new_column = []

    for verse in verse_column:
        # Non-strings (NaN/None) would fail on .replace(), so they are
        # treated as missing, like the 'NA' sentinel.
        if not isinstance(verse, str) or verse == 'NA':
            new_column.append('NA')
            continue

        verse = verse.replace('-', ' ')
        verse = re.sub(r'[^a-zA-Z0-9\s]', '', verse)
        verse = verse.lower()
        verse = ' '.join(word for word in verse.split() if word not in excluded)
        verse = re.sub(r'\d', '', verse)
        # Certain words are spelled differently between books, such as "woe" and "wo"
        verse = re.sub(r'\bwoe\b', 'wo', verse)
        verse = re.sub(r'\bshew\b', 'show', verse)
        verse = re.sub(r'\bvail\b', 'veil', verse)
        verse = re.sub(r'\bvails\b', 'veils', verse)
        new_column.append(verse)

    return new_column

# Clean each scripture column in place on full_cr, then mirror the cleaned
# text into final_cr under a separate "cleaned" column name.
for text_col, cleaned_col in (
    ('scripture_text_ISH', 'scripture_cleaned_ISH'),
    ('scripture_text_BOM', 'scripture_cleaned_BOM'),
):
    full_cr[text_col] = process_verses(full_cr[text_col])
    final_cr[cleaned_col] = full_cr[text_col]

# Bible Dictionary terms get the same normalisation so they can be compared with verse words.
bd['term_simple']  = process_verses(bd['Term'])

## Step 2: Find Text Similarities

In [9]:
# Cosine similarity between the cleaned Isaiah and Book of Mormon word lists
# of every row. A row with a missing verse on either side scores 0.
text_sim = []

# Iterating the two columns together avoids positional `[0]` indexing on a
# label-indexed Series, which pandas has deprecated.
for ISH_verse, BOM_verse in zip(full_cr['scripture_text_ISH'], full_cr['scripture_text_BOM']):
    # 'NA' is the placeholder for a missing verse; compare it exactly
    # rather than as a substring of the verse text.
    if ISH_verse == 'NA' or BOM_verse == 'NA':
        cosine_sim_dist = 0
    else:
        cosine_sim_dist = textdistance.cosine(ISH_verse.split(), BOM_verse.split())
    text_sim.append(cosine_sim_dist)

# Attach the scores using full_cr's own index so they line up row for row
# even when the index is not a default 0..n-1 range.
final_cr['similarity_score'] = pd.Series(text_sim, index=full_cr.index)



In [10]:
### categories for similarity_score

def categorize_similarity(score):
    """Map a similarity score onto one of three descriptive labels.

    Scores of 0.75 and above are "Direct Quote", scores from 0.25 up to
    (but not including) 0.75 are "Shared Language", and everything else,
    including NaN, is "Similar Theme".
    """
    if score >= .75:
        return "Direct Quote"
    # Reaching this point means the score is not >= .75, so only the
    # lower bound still needs checking.
    if score >= .25:
        return "Shared Language"
    return "Similar Theme"
    
# Only rows where both raw verse texts are present receive a category;
# all other rows are left as NaN.
both_present = final_cr['scripture_text_BOM'].notna() & final_cr['scripture_text_ISH'].notna()

# Direct column assignment aligns on the index (unselected rows become NaN)
# and simply overwrites the column when the cell is re-run, whereas joining
# a frame with an already-existing column name raises an error.
final_cr['similarity_category'] = final_cr.loc[both_present, 'similarity_score'].apply(categorize_similarity)

## Step 3: Count Words

In [15]:
# Whitespace-delimited word counts of the raw verse text; missing verses count as 0.
final_cr['word_count_ISH'] = [
    0 if pd.isna(verse) else len(str(verse).split())
    for verse in final_cr['scripture_text_ISH']
]
final_cr['word_count_BOM'] = [
    0 if pd.isna(verse) else len(str(verse).split())
    for verse in final_cr['scripture_text_BOM']
]

## Step 4: Create Logical Vectors Indicating Whether a Bible Dictionary Term Is Found Within Each Isaiah and BoM Verse

In [16]:
lemmatizer = WordNetLemmatizer()

# Build the lookup once. A set makes each membership test constant-time
# instead of rescanning the whole term column for every word. The 'NA'
# placeholder is excluded so a missing verse can never count as a match.
bd_terms = set(bd['term_simple']) - {'NA'}


def _has_bd_term(verse):
    """Return True if any word of `verse`, lemmatized as a noun, is a Bible Dictionary term.

    `verse` is a cleaned, space-separated string. Each word is compared
    individually against the simplified dictionary terms, and the scan
    stops at the first match.
    """
    return any(lemmatizer.lemmatize(word, 'n') in bd_terms for word in verse.split())


# One boolean per row, in row order, for each scripture column.
bd_found_ISH = [_has_bd_term(ISH_verse) for ISH_verse in full_cr['scripture_text_ISH']]
bd_found_BOM = [_has_bd_term(BOM_verse) for BOM_verse in full_cr['scripture_text_BOM']]

final_cr['bible_term_in_ISH'] = bd_found_ISH
final_cr['bible_term_in_BOM'] = bd_found_BOM

## Step 5: Classify Isaiah Chapters
CH 1-39: First (Proto) Isaiah 

CH 40-55: Second (Deutero) Isaiah 

CH 56-66: Third (Trito) Isaiah 

In [17]:
def Duhms_Classification(ch_num):
    """Classify an Isaiah chapter number under Duhm's three-part division.

    Parameters
    ----------
    ch_num : int or float or None
        Isaiah chapter number; NaN or None means the chapter is missing.

    Returns
    -------
    str or float
        'Proto' for chapters 1-39, 'Deutero' for 40-55, 'Trito' for 56-66.
        np.nan is returned for missing input and for any value outside
        those ranges, so every unclassifiable input yields the same
        missing-value marker.
    """
    # Check None first: math.isnan() raises TypeError on it.
    if ch_num is None or math.isnan(ch_num):
        return np.nan
    if 1 <= ch_num <= 39:
        return 'Proto'
    if 40 <= ch_num <= 55:
        return 'Deutero'
    if 56 <= ch_num <= 66:
        return 'Trito'
    # Out-of-range or fractional values between the ranges.
    return np.nan
    

# Classify every row's Isaiah chapter number and store the labels, in row order, as a new column.
Duhms_division = [
    Duhms_Classification(chapter)
    for chapter in final_cr['chapter_number_ISH']
]

final_cr['Duhms_Class'] = Duhms_division

## Step 6: Export to XLSX and CSV

In [18]:
# Write the enriched cross-reference table to both CSV and Excel, using the
# same base file name in the CleanData folder.
for write_table, destination in (
    (final_cr.to_csv, '../CleanData/by_verse_cross_ref.csv'),
    (final_cr.to_excel, '../CleanData/by_verse_cross_ref.xlsx'),
):
    write_table(destination)