# Load Libraries

In [45]:
from nltk import edit_distance
from scipy.spatial.distance import pdist, squareform
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns
from matplotlib.colors import LogNorm, Normalize
from Levenshtein import distance

# Load Verses

In [46]:
# Load every "<language>_cleaned.verses.csv" file found in `txt_dir` into
# `languages`, keyed by language name.
txt_dir = "cleaned"

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# insertion order of `languages` decides which language of a pair ends up as
# language1/language2 further down. The listing is also no longer bound to the
# name `dir`, which shadowed the builtin for the rest of the notebook.
filenames = sorted(os.listdir(txt_dir))

languages = {}

for filename in filenames:
    # Group 1 is the language name: lowercase letters plus an optional "-a"
    # suffix (e.g. "kinaray-a"). An optional "-tl" tag before "_cleaned" is
    # matched but left out of the name (e.g. "filipino-tl" -> "filipino").
    match = re.match("([a-z]*(?:-a)?)(?:-(?:tl))?_cleaned\\.verses\\.csv", filename)
    if match is None:
        continue  # not a verses file
    language = match[1]
    path = os.path.join(txt_dir, filename)
    print(path)
    languages[language] = pd.read_csv(path, quotechar='"')

cleaned\bantoanon_cleaned.verses.csv
cleaned\bicol_cleaned.verses.csv
cleaned\cebuano_cleaned.verses.csv
cleaned\chavacano_cleaned.verses.csv
cleaned\english_cleaned.verses.csv
cleaned\filipino-tl_cleaned.verses.csv
cleaned\ilokano_cleaned.verses.csv
cleaned\ilonggo_cleaned.verses.csv
cleaned\ivatan_cleaned.verses.csv
cleaned\kinaray-a_cleaned.verses.csv
cleaned\manobo_cleaned.verses.csv
cleaned\masbatenyo_cleaned.verses.csv
cleaned\pampanga_cleaned.verses.csv
cleaned\pangasinan_cleaned.verses.csv
cleaned\romblomanon_cleaned.verses.csv
cleaned\sambal_cleaned.verses.csv
cleaned\spanish_cleaned.verses.csv
cleaned\tausug_cleaned.verses.csv
cleaned\waray_cleaned.verses.csv
cleaned\yakan_cleaned.verses.csv
cleaned\yami_cleaned.verses.csv


# Remove Unneeded Books

In [47]:
# Distinct book codes of the English translation, in order of first appearance.
books = pd.unique(languages['english']['Book_Code'])
books

array(['GEN', 'EXO', 'LEV', 'NUM', 'DEU', 'JOS', 'JDG', 'RUT', '1SA',
       '2SA', '1KI', '2KI', '1CH', '2CH', 'EZR', 'NEH', 'EST', 'JOB',
       'PSA', 'PRO', 'ECC', 'SNG', 'ISA', 'JER', 'LAM', 'EZK', 'DAN',
       'HOS', 'JOL', 'AMO', 'OBA', 'JON', 'MIC', 'NAM', 'HAB', 'ZEP',
       'HAG', 'ZEC', 'MAL', 'MAT', 'MRK', 'LUK', 'JHN', 'ACT', 'ROM',
       '1CO', '2CO', 'GAL', 'EPH', 'PHP', 'COL', '1TH', '2TH', '1TI',
       '2TI', 'TIT', 'PHM', 'HEB', 'JAS', '1PE', '2PE', '1JN', '2JN',
       '3JN', 'JUD', 'REV'], dtype=object)

In [48]:
# For every language keep only the rows whose Book_Code is one of `books`,
# then drop rows that contain any missing value.
languages_books_filtered = {}

for language, verses in languages.items():
    in_known_book = verses['Book_Code'].isin(books)
    languages_books_filtered[language] = verses[in_known_book].dropna()

languages_books_filtered['english']

Unnamed: 0,Book_Code,Chapter,Verse,Text
0,GEN,1,1,In the beginning God created the heaven and th...
1,GEN,1,2,"And the earth was without form, and void; and ..."
2,GEN,1,3,"And God said, Let there be light: and there wa..."
3,GEN,1,4,"And God saw the light, that it was good: and G..."
4,GEN,1,5,"And God called the light Day, and the darkness..."
...,...,...,...,...
31213,REV,22,17,"And the Spirit and the bride say, Come. And le..."
31214,REV,22,18,For I testify unto every man that heareth the ...
31215,REV,22,19,and if any man shall take away from the words ...
31216,REV,22,20,"He which testifieth these things saith, Surely..."


# Create Parallel Corpora

In [49]:

# BCV = Book, Chapter, Verse.
# Resulting layout: languages_bcv_map[language][book][chapter] is a list of
# (verse, text) tuples in the row order of the filtered frame.
languages_bcv_map = {}
for language, verses in languages_books_filtered.items():
    chapters_by_book = languages_bcv_map[language] = {}

    for row in verses.itertuples():
        # Position 0 of an itertuples() row is the index; positions 1-4 are
        # the first four columns (presumably Book_Code, Chapter, Verse, Text
        # -- confirm against the CSV column order).
        book, chapter, verse, text = row[1], row[2], row[3], row[4]
        chapters_by_book.setdefault(book, {}).setdefault(chapter, []).append((verse, text))

In [50]:
# Build `parallel_corpora`: one tuple
#   (language1, language2, book, chapter, verse_label, text1, text2)
# per aligned verse span, for every unordered pair of languages.
#
# A translation may store several verses as one entry labelled "a-b". Two
# chapters are therefore walked in lockstep and text is accumulated on each
# side until both sides end on the same verse number; that span is then emitted.


def _verse_end(verse_label):
    """Return the last verse number covered by a label such as 7 or '7-9'."""
    if '-' in str(verse_label):
        return int(re.match('[0-9]*-([0-9]*)', str(verse_label))[1])
    return int(verse_label)


def _starts_at_verse_one(verses):
    """Return True when the first entry of a chapter begins at verse 1."""
    return re.match("([0-9]*)-?([0-9]*)?", str(verses[0][0]))[1] == '1'


def _align_chapter(left_verses, right_verses):
    """Align two lists of (verse_label, text) tuples of the same chapter.

    Returns a list of (verse_label, left_text, right_text) tuples, one per
    span on which both sides end at the same verse number. The label is the
    single verse number, or "first-last" when the span covers several verses.
    Entries left over when either list runs out before the two sides meet
    again are not emitted.
    """
    segments = []
    first_verse = 1  # first verse number of the span being accumulated
    left_index = right_index = 0
    left_parts, right_parts = [], []
    # A side's current entry is appended only when that side has just moved to
    # a new entry. Without resetting these flags, the entry of the side that
    # is waiting (e.g. a merged "1-3") was appended again on every step of the
    # other side, duplicating its text.
    take_left = take_right = True

    while left_index < len(left_verses) and right_index < len(right_verses):
        left_label, left_text = left_verses[left_index]
        right_label, right_text = right_verses[right_index]
        left_end = _verse_end(left_label)
        right_end = _verse_end(right_label)

        if take_left:
            left_parts.append(left_text)
        if take_right:
            right_parts.append(right_text)

        if left_end < right_end:
            # Left side is behind: consume its next entry, keep right waiting.
            left_index += 1
            take_left, take_right = True, False
        elif left_end > right_end:
            # Right side is behind: consume its next entry, keep left waiting.
            right_index += 1
            take_left, take_right = False, True
        else:
            # Both sides end on the same verse: emit the span and start anew.
            if first_verse != left_end:
                verse_label = str(first_verse) + '-' + str(left_end)
            else:
                verse_label = str(first_verse)
            # Merged verses are joined with a space so that the last word of
            # one verse is not fused with the first word of the next.
            segments.append((verse_label, ' '.join(left_parts), ' '.join(right_parts)))
            left_parts, right_parts = [], []
            first_verse = left_end + 1
            left_index += 1
            right_index += 1
            take_left = take_right = True

    return segments


parallel_corpora = []
parallel_corpora_done = {}  # (language1, language2) -> True for every processed pair

corpora_counter = 0
language_names = list(languages_bcv_map)
for position, language in enumerate(language_names):
    # Pairing each language only with the ones after it visits every
    # unordered pair exactly once, in the same order as the full double loop
    # that skipped identical and already-seen pairs.
    for language2 in language_names[position + 1:]:
        parallel_corpora_done[(language, language2)] = True
        corpora_counter += 1
        print((language, language2, corpora_counter))

        left_books = languages_bcv_map[language]
        right_books = languages_bcv_map[language2]
        for book, left_chapters in left_books.items():
            if book not in right_books:
                continue
            right_chapters = right_books[book]
            for chapter, left_verses in left_chapters.items():
                if chapter not in right_chapters:
                    continue
                right_verses = right_chapters[chapter]
                # Skip chapters that do not begin at verse 1 on either side.
                if not _starts_at_verse_one(left_verses):
                    continue
                if not _starts_at_verse_one(right_verses):
                    continue
                for verse_label, left_text, right_text in _align_chapter(left_verses, right_verses):
                    parallel_corpora.append(
                        (language, language2, book, chapter, verse_label, left_text, right_text)
                    )
                    

('bantoanon', 'bicol', 1)
('bantoanon', 'cebuano', 2)
('bantoanon', 'chavacano', 3)
('bantoanon', 'english', 4)
('bantoanon', 'filipino', 5)
('bantoanon', 'ilokano', 6)
('bantoanon', 'ilonggo', 7)
('bantoanon', 'ivatan', 8)
('bantoanon', 'kinaray-a', 9)
('bantoanon', 'manobo', 10)
('bantoanon', 'masbatenyo', 11)
('bantoanon', 'pampanga', 12)
('bantoanon', 'pangasinan', 13)
('bantoanon', 'romblomanon', 14)
('bantoanon', 'sambal', 15)
('bantoanon', 'spanish', 16)
('bantoanon', 'tausug', 17)
('bantoanon', 'waray', 18)
('bantoanon', 'yakan', 19)
('bantoanon', 'yami', 20)
('bicol', 'cebuano', 21)
('bicol', 'chavacano', 22)
('bicol', 'english', 23)
('bicol', 'filipino', 24)
('bicol', 'ilokano', 25)
('bicol', 'ilonggo', 26)
('bicol', 'ivatan', 27)
('bicol', 'kinaray-a', 28)
('bicol', 'manobo', 29)
('bicol', 'masbatenyo', 30)
('bicol', 'pampanga', 31)
('bicol', 'pangasinan', 32)
('bicol', 'romblomanon', 33)
('bicol', 'sambal', 34)
('bicol', 'spanish', 35)
('bicol', 'tausug', 36)
('bicol', 'war

In [51]:
# Tabulate the aligned tuples; the column order mirrors the layout of the
# tuples stored in `parallel_corpora`.
corpus_columns = [
    'language1', 'language2', 'book', 'chapter', 'verse',
    'language1_text', 'language2_text',
]
df = pd.DataFrame(parallel_corpora, columns=corpus_columns)

In [52]:
# Preview the assembled parallel corpus: one row per aligned verse span per language pair.
df

Unnamed: 0,language1,language2,book,chapter,verse,language1_text,language2_text
0,bantoanon,bicol,GEN,1,1,"Sa kauna-unahan, gingtuga it Dios kag langit a...","Sa kapinonan, kan lalangon nin Dios an kalangi..."
1,bantoanon,bicol,GEN,1,2,"It kato, kag kalibutan ay waya it korte ag way...",an kinaban mayo nin ano man na porma saka laog...
2,bantoanon,bicol,GEN,1,3,"Ag nagsiling kag Dios, ""Magkainggwa't hadag!"" ...","Nagboot an Dios, ""Magkaigwa nin liwanag,"" asin..."
3,bantoanon,bicol,GEN,1,4,"Nakita it Dios nak maado kag hadag, ag Ida ing...",Nahiling nin Dios na marahay an liwanag. Isinu...
4,bantoanon,bicol,GEN,1,5,"Gingtawag it Dios kag hadag nak ""adlaw"" ag kag...","dangan inapod niyang ""Aldaw"" an liwanag, asin ..."
...,...,...,...,...,...,...,...
2636012,yakan,yami,REV,22,17,"Nambag Niyawa Tuhanin duk pangantin dendehin, ...",Maniring o Seyzi a kano Macivahavahay a mavake...
2636013,yakan,yami,REV,22,18,"Aku inin si Yahiya. Sessa'ante ka'am, sasuku k...",Yaken rana am si Yowani ko. Ko pa pakoyokoin j...
2636014,yakan,yami,REV,22,19,Bang niya' ngānan sinduwe me' pa'alan dem libr...,No amyan so tao a komteb do ciring a yadomket ...
2636015,yakan,yami,REV,22,20,Inin ne bissāmanaksi' si kēmon pinakita'u dem ...,Si Yeso a yamacyanoyong do aro a ciring eya am...


In [53]:
# Persist the corpus (pandas also writes the index as the first, unnamed column).
df.to_csv('parallel_corpora.csv')

# A notebook only displays the value of a cell's last expression, so the row
# count is printed explicitly and the preview is placed last; previously both
# results were computed and silently discarded.
print(len(df))
df.loc[df['language1'] == 'english'].head(5)