In [0]:
import os
import time
import pandas as pd
import numpy as np
import nltk
import pickle
import json
import sys
sys.setrecursionlimit(10000000)
import re ### to use extended regex library: https://pypi.org/project/regex/
import ast
import csv
from bs4 import BeautifulSoup
import requests
import getpass
from urllib.request import urlopen
from google.colab import files

!pip install --ignore-installed --index-url https://test.pypi.org/simple/ --no-deps sddk ### our own package under construction, always install to have up-to-date version
import sddk

Looking in indexes: https://test.pypi.org/simple/
Collecting sddk
  Using cached https://test-files.pythonhosted.org/packages/65/8b/d682c15a7335215ac119538ad8455b408cd7e8be4f6614678888dd2c88ed/sddk-0.0.7-py3-none-any.whl
Installing collected packages: sddk
Successfully installed sddk-0.0.7


# sciencedata.dk configuration

As a data storage, we will use Danish official infrastructure called sciencedata.dk. It is commobly approacheble from any European research institution.

The configuration require having already a proper username and password configurated on the web interface of the platform.

In [0]:
s, sciencedata_groupurl = sddk.configure_session_and_url("direct_root")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ··········
personal connection established
group connection established with you as owner
endpoint for requests has been configured to: https://sciencedata.dk/files/direct_root/


# Importing back the data

In [0]:
### get list of Jesus sayings from canonical Gospels (dictionary-like format)
gospels_saying_verses = json.loads(s.get(sciencedata_groupurl + "dirgot_data/gospels_saying_verses.json").content)
gospels_saying_verses["matt"][:5]

['"Let it be so now; for thus it is fitting for us to fulfil all righteousness."',
 '"It is written, \'Man shall not live by bread alone, but by every word that proceeds from the mouth of God.\'"',
 '"Again it is written, \'You shall not tempt the Lord your God.\'"',
 '"Begone, Satan! for it is written, \'You shall worship the Lord your God and him only shall you serve.\'"',
 '"Repent, for the kingdom of heaven is at hand."']

In [0]:
### get list of Jesus sayings from the Gospel of Thomas
gth_sayings_long = ast.literal_eval(s.get(sciencedata_groupurl + "dirgot_data/gth_sayings_manually.txt").text)
gth_sayings_long[:5]

['Whoever finds the interpretation of these sayings will not experience death.',
 'Let him who seeks continue seeking until he finds. When he finds, he will become troubled. When he becomes troubled, he will be astonished, and he will rule over the All.',
 "If those who lead you say, 'See, the Kingdom is in the sky,' then the birds of the sky will precede you. If they say to you, 'It is in the sea,' then the fish will precede you. Rather, the Kingdom is inside of you, and it is outside of you. When you come to know yourselves, then you will become known, and you will realize that it is you who are the sons of the living Father. But if you will not know yourselves, you dwell in poverty and it is you who are that poverty.",
 'The man old in days will not hesitate to ask a small child seven days old about the place of life, and he will live. For many who are first will become last, and they will become one and the same.',
 'Recognize what is in your sight, and that which is hidden from yo

In [0]:
gth_sayings = []
for saying in gth_sayings_long:
  for sentence in saying.split(". "):
    if len(sentence) > 1:
      gth_sayings.append(sentence)

In [0]:
### make one dictionary for all gospels
gospels_saying_verses.update({"thom" :  gth_sayings})
gospels_saying_verses.keys()

dict_keys(['matt', 'mark', 'luke', 'john', 'thom'])

In [0]:
gospels_saying_verses["thom"][:5]

['Whoever finds the interpretation of these sayings will not experience death.',
 'Let him who seeks continue seeking until he finds',
 'When he finds, he will become troubled',
 'When he becomes troubled, he will be astonished, and he will rule over the All.',
 "If those who lead you say, 'See, the Kingdom is in the sky,' then the birds of the sky will precede you"]

# Text Cleaning, Tokenization and Lemmatization

In [0]:
import nltk
nltk.download("popular")
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [0]:
to_replace= {
### archaisms
    " ye " : " you ",
    " thee " : " you ",
    " thou " : " you ",
    " thy " : " you ",
    "thine" : "yours",
    "shall": "will",
    "shalt": "be",
    " hath" : " have",
    " hast" : " have",
    "hateth" : "hate",
    "cometh" : "come",
    "believeth" : "believe",
    "doeth" : "do", 
    "didst" : "do"
    "seeth" : "see",
    " shew" : " show",
    "unto" : "to",
    " men " : " man",
    "whoever" : "who ever",
    "art" : "be",
    "asketh" : "ask",
    "bringeth" : "bring",
    "calleth" : "call",
    "cannot" : "can",
    "canst" : "can",
    "commandmants" : "commandmant",
    "committeth" : "commit",
    "dists" : "do",
    "entereth" : "enter",
    "gathereth" : "gather",
    "goeth" : "go",
    "heareth" : "hear",
    "husbandman" : "husband",
    "knewest" : "know",
    "knowest" : "know",
    "knoweth" : "know",
    "leaven" : "yeast",
    "lest" : "", ### highly disturbing
    "loveth" : "love",
    "nigh" : "close",
    "putteth" : "put",
    "receiveth" : "receive",
    "saith" : "say",
    "sayest" : "say",
    "scattereth" : "scatter",
    "seeketh" : "seek",
    "sleepeth" : "sleep",
    "speaketh" : "speak",
    "taketh" : "take",
    "walketh" : "walk",
    "wheresoever" : "",
    "whatsoever" : "",
    "whence" : "",
    "whose" : "",
    "whoso" : "",
    "whereto" : "",
    "whosoever" : "",
    "yea" : "yes"}  

SyntaxError: ignored

In [0]:
%%time
gospels_cleaned = {}
for gospel in ["matt", "mark", "luke", "john", "thom"]:
  saying_tuples = []
  list_of_words = []  
  for saying_raw in gospels_saying_verses[gospel]:
    saying_string = re.sub(r'[^\w\s]','', saying_raw)
    for word, key in to_replace.items():
      saying_string = saying_string.lower().replace(word, key)
    saying = [word.lower() for word in saying_string.split() if word.lower() not in stopwords.words('english')]
    string_lemmatized_verbs = []
      ### lemmatize verbs (= "pos="v"")
    for word in saying:
      string_lemmatized_verbs.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
      ### lemmatize rest (= substantives from plural to singular)
    saying_lemmatized = []
    for word in string_lemmatized_verbs:
      saying_lemmatized.append(wordnet_lemmatizer.lemmatize(word))
    saying_final = [] ### there still tend to remain some dirty words...
    for element in ["say", "you"]:
      try: saying_lemmatized.remove(element)
      except: pass
    for word in saying_lemmatized:
      if word.lower() =="u":
        word = "you"
      if word.lower() =="men":
        word = "man"
      if (word == "thee") or (word == "thy"):
        word = "you"
      saying_final.append(word)
    saying_tuples.append((saying_string, saying_final)) 
    list_of_words.extend(saying_final)
  gospels_cleaned.update({gospel : {"verses" : saying_tuples, "words": list_of_words}})

CPU times: user 3.85 s, sys: 336 ms, total: 4.18 s
Wall time: 4.19 s


In [0]:
for gospel in ["matt", "mark", "luke", "john", "thom"]:
  for word in [tup[0] for tup in nltk.FreqDist(gospels_cleaned[gospel]["words"]).most_common()]:
    if word in [word.replace(" ", "") for word in to_replace.keys()]:
      print(word)

In [0]:
### to access cleaned lemmata in one list:
print(gospels_cleaned["thom"]["words"][:10])

['ever', 'find', 'interpretation', 'experience', 'death', 'let', 'seek', 'continue', 'seek', 'find']


In [0]:
### to access cleaned lemmata in one list:
print(gospels_cleaned["matt"]["verses"][:5])

[('let it be so now for thus it is fitting for us to fulfil all righteousness', ['let', 'thus', 'fit', 'you', 'fulfil', 'righteousness']), ('it is written man will not live by bread alone but by every word that proceeds from the mouth of god', ['write', 'man', 'live', 'bread', 'alone', 'every', 'word', 'proceed', 'mouth', 'god']), ('again it is written you will not tempt the lord your god', ['write', 'tempt', 'lord', 'god']), ('begone satan for it is written you will worship the lord your god and him only will you serve', ['begone', 'satan', 'write', 'worship', 'lord', 'god', 'serve']), ('repent for the kingdom of heaven is at hand', ['repent', 'kingdom', 'heaven', 'hand'])]


In [0]:
for gospel in ["matt", "mark", "luke", "john", "thom"]:
  print(set(gospels_cleaned[gospel]["words"]))

{'friend', 'full', 'west', 'cleanse', 'yet', 'trespass', 'root', 'alms', 'pit', 'memory', 'samaritan', 'word', 'away', 'drunken', 'midcloset', 'astray', 'sort', 'iniquity', 'write', 'wed', 'accomplish', 'belt', 'trouble', 'gnat', 'shekel', 'robber', 'well', 'alarm', 'fit', 'doctrine', 'fellow', 'tribute', 'prepare', 'life', 'wish', 'teeth', 'glutton', 'cast', 'jew', 'gomorrah', 'beware', 'array', 'flask', 'tree', 'as', 'hades', 'throat', 'foundation', 'snatch', 'prophesy', 'inheritance', 'greatest', 'humble', 'fall', 'twice', 'many', 'treasure', 'transgress', 'goat', 'crow', 'borrow', 'proceed', 'church', 'pronounce', 'dinner', 'uncleanness', 'exercise', 'please', 'preserve', 'cummin', 'multiply', 'consider', 'pbe', 'ark', 'star', 'unshrunk', 'yonder', 'master', 'moses', 'hide', 'build', 'farm', 'salutation', 'proselyte', 'eagle', 'speck', 'make', 'utter', 'remember', 'theft', 'hindrance', 'mill', 'meet', 'black', 'wheat', 'lip', 'ebehquakes', 'eli', 'cloud', 'declare', 'evidence', 'cl

In [0]:
### backup the output
s.put(sciencedata_groupurl + "dirgot_data/gospels_cleaned.json", data=json.dumps(gospels_cleaned))

<Response [204]>

# Cleaning GNT

In [0]:
### ### get list of Jesus sayings GNT gospels(dictionary-like format)
GNT_sayings = json.loads(s.get(sciencedata_groupurl + "dirgot_data/GNT_sayings.json").content)
print(GNT_sayings["matt"][:5])

[['3.15', ['ἀποκρίνομαι', 'Ἰησοῦς', 'λέγω', 'ἀφίημι', 'ἄρτι', 'οὕτω', 'πρέπω', 'εἰμί', 'πληρόω', 'πᾶς', 'δικαιοσύνη', 'τότε', 'ἀφίημι'], 'But Jesus answered him, "Let it be so now; for thus it is fitting for us to fulfil all righteousness." Then he consented.'], ['4.4', ['ἀποκρίνομαι', 'λέγω', 'γράφω', 'ἄρτος', 'μόνος', 'ζάω', 'ἄνθρωπος', 'πᾶς', 'ῥῆμα', 'ἐκπορεύομαι', 'στόμα', 'θεός'], 'But he answered, "It is written, \'Man shall not live by bread alone, but by every word that proceeds from the mouth of God.\'"'], ['4.7', ['φημί', 'Ἰησοῦς', 'πάλιν', 'γράφω', 'ἐκπειράζω', 'κύριος', 'θεός'], 'Jesus said to him, "Again it is written, \'You shall not tempt the Lord your God.\'"'], ['4.10', ['τότε', 'λέγω', 'Ἰησοῦς', 'ὑπάγω', 'Σατανᾶς', 'γράφω', 'κύριος', 'θεός', 'προσκυνέω', 'μόνος', 'λατρεύω'], 'Then Jesus said to him, "Begone, Satan! for it is written, \'You shall worship the Lord your God and him only shall you serve.\'"'], ['4.17', ['τότε', 'ἄρχω', 'Ἰησοῦς', 'κηρύσσω', 'λέγω', 'μετανο

In [0]:
my_list = ['φημί', 'Ἰησοῦς', 'πάλιν', 'γράφω', 'ἐκπειράζω', 'κύριος', 'θεός']
try: 
  index_term = my_list.index("Ἰησοῦς") + 1
  print(my_list[index_term:]) 
except:
  print(my_list)



['πάλιν', 'γράφω', 'ἐκπειράζω', 'κύριος', 'θεός']


In [0]:
GNT_gospels_cleaned = {} 
for gospel in ["matt", "mark", "luke", "john"]:
  verses = []
  words = []
  for verse_data in GNT_sayings[gospel]:
    try: 
      index_term = verse_data[1].index("Ἰησοῦς") + 1
      verse_cleaned = verse_data[1][index_term:]
    except:
      verse_cleaned = verse_data[1]
    for element in ["εἰμί", "λέγω"]:
      try: verse_cleaned.remove(element)
      except: pass
    verses.append(verse_cleaned)
    words.extend(verse_cleaned)
  GNT_gospels_cleaned.update({gospel : (verses, words)})

In [0]:
for gospel in ["matt", "mark", "luke", "john"]:
  print(set(GNT_gospels_cleaned[gospel][1]))

{'ὀφειλέτης', 'παραγγέλλω', 'τελώνιον', 'μύριοι', 'διασαφέω', 'κλητός', 'ἑπτά', 'Σαμαρίτης', 'τελευτάω', 'μισέω', 'βλαστάνω', 'σκληρός', 'σῶμα', 'μαργαρίτης', 'Ἰουδαία', 'ἀντέχω', 'τρίβολος', 'εὐθέως', 'μυστήριον', 'διωγμός', 'κυλλός', 'ἑνδέκατος', 'μισθόω', 'ἀποθήκη', 'ἀστραπή', 'κωλύω', 'πάσχω', 'ψευδομαρτυρία', 'βόθυνος', 'σαρόω', 'νέος', 'ἐξετάζω', 'πρωτοκλισία', 'ἐννέα', 'κῆτος', 'μακάριος', 'ῥαπίζω', 'οὖς', 'πετρώδης', 'κλέπτω', 'παράδοσις', 'ἔθνος', 'Ἰουδαῖος', 'ἀμπελών', 'ἀνοίγω', 'κώνωψ', 'ἀδελφός', 'στατήρ', 'καταράομαι', 'μνημόσυνον', 'ἀναβαίνω', 'ποιέω', 'χρόνος', 'παρακαλέω', 'κοινωνός', 'ψευδοπροφήτης', 'διαθήκη', 'διχοτομέω', 'καμμύω', 'στράτευμα', 'ἐνενήκοντα', 'καρποφορέω', 'προσμένω', 'φεύγω', 'προσεύχομαι', 'λύτρον', 'ἐάω', 'ἄγγελος', 'νεκρός', 'καταλύω', 'κοινόω', 'ἁρπαγή', 'ἀρχιερεύς', 'Σολομών', 'ἀνεκτός', 'κατακλυσμός', 'ἀνάπαυσις', 'σκοτίζω', 'ἀναπληρόω', 'τηρέω', 'φέρω', 'πέμπω', 'βασανιστής', 'φυλή', 'μάρτυς', 'προστίθημι', 'φυλακή', 'πόρνη', 'ἀκολουθέω', 'νήπ

In [0]:
s.put(sciencedata_groupurl + "dirgot_data/GNT_gospels_cleaned.json", data=json.dumps(GNT_gospels_cleaned))

<Response [204]>

# Overview

In [0]:
total_greek = 0
total_english = 0
for gospel in ["matt", "mark", "luke", "john"]:
  GNT_saying_number = len(GNT_gospels_cleaned[gospel][0])
  GNT_word_number = len(GNT_gospels_cleaned[gospel][1])
  total_greek += GNT_word_number 
  RSV_saying_number = len(gospels_cleaned[gospel]["verses"])
  RSV_word_number = len(gospels_cleaned[gospel]["words"])
  total_english += RSV_word_number
  print(GNT_saying_number, GNT_word_number, RSV_saying_number, RSV_word_number, round(RSV_word_number / GNT_word_number, 5))
print(total_greek, total_english, round(total_english/total_greek, 5))

640 5022 609 4696 0.93509
282 2075 281 1950 0.93976
584 4594 580 4490 0.97736
415 2817 415 2644 0.93859
14508 13780 0.94982


In [0]:
len(gospels_cleaned["thom"]["verses"])

241

In [0]:
len(gospels_cleaned["thom"]["words"])

1654

In [0]:
round(np.mean([len(verse[1]) for verse in gospels_cleaned["thom"]["verses"]]), 5)

6.86307

In [0]:
round(np.mean([len(verse[1]) for verse in gospels_cleaned["john"]["verses"]]), 5)

6.37108