In [1]:
import os
import time
import pandas as pd
import numpy as np
import nltk
import pickle
import json
import sys
sys.setrecursionlimit(10000000)
import re ### to use extended regex library: https://pypi.org/project/regex/
import ast
import csv
from bs4 import BeautifulSoup
import requests
import getpass
from urllib.request import urlopen

!pip install --ignore-installed sddk ### our own package under construction, always install to have up-to-date version
import sddk

Collecting sddk
  Downloading https://files.pythonhosted.org/packages/34/30/92ab5e6e5ea4b2b5155fe2da6c95869b5c6a4de4e0c92ee7009cd41cc0cf/sddk-1.6.1-py3-none-any.whl
Installing collected packages: sddk
Successfully installed sddk-1.6.1


In [0]:
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe

In [0]:
### authorize google sheets
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())
# establish connection with particular sheet by its url:

# RESULTS SPREADSHEET
sh = gc.open_by_url("https://docs.google.com/spreadsheets/d/1v99i42xd9jh0E9O_0-SrCrYE8WI91yQWB03BtJUm97w/edit?usp=sharing")

# sciencedata.dk configuration

As a data storage, we will use Danish official infrastructure called sciencedata.dk. It is commobly approacheble from any European research institution.

The configuration require having already a proper username and password configurated on the web interface of the platform.

In [4]:
conf = sddk.configure_session_and_url("dirgot_data", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): kase@zcu.cz
sciencedata.dk password: ··········
connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/648597%40au.dk/dirgot_data/


# Coptic GoT data

In [0]:
url = "http://marcion.sourceforge.net/nag-hammadi-library/gospel-of-thomas-en.html#transl"
resp = requests.get(url)

In [0]:
soup = BeautifulSoup(resp.content, "html.parser")

In [0]:
coptic_words = []
for td in soup.find_all("td"):
  if "Athena" in str(td):
    td_text = td.get_text()
    if len(td_text) > 0:
      coptic_words.append(td_text)

In [78]:
coptic_words[:10]

['ⲛⲁⲉⲓ', 'ⲛⲉ', 'ⲛ-', 'ϣⲁϫⲉ', 'ⲉ[ⲧ]-', '[ϩ]ⲏⲡ+', 'ⲉⲛⲧ-', 'ⲁ-', '(gk) ⲓⲥ', 'ⲉⲧ-']

In [0]:
set_with_dataframe(sh.add_worksheet("coptic_words", cols=1, rows=1), pd.DataFrame(coptic_words))

In [0]:
rows = []
for table in soup.find_all("table"):
  table_name = table.find("a").get_text()
  try:
    actual_passage = table.find("big").get_text()
  except:
    actual_passage = ""
  for tr in table.find_all("tr"):
    row = []
    row.extend([table_name, actual_passage])
    for td in tr.find_all("td"):
      row.append(td.get_text())
    rows.append(row)
    

In [114]:
got_coptic_raw = pd.DataFrame(rows[1:])
got_coptic_raw.drop([5, 6, 7], axis=1, inplace=True)
got_coptic_raw.dropna(thresh=4, inplace=True)
got_coptic_raw.columns = ["line", "context", "coptic", "translation", "code"]
got_coptic_raw.head(40)

Unnamed: 0,line,context,coptic,translation,code
1,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲛⲁⲉⲓ,these (pl),3-52.
2,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲛⲉ,are (pl),3-306.
3,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲛ-,(def art pl),3-80.
4,(32/10)-(32/12),These are the hidden words that the living Jes...,ϣⲁϫⲉ,saying,2-3189
5,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲉ[ⲧ]-,which,3-355.
6,(32/10)-(32/12),These are the hidden words that the living Jes...,[ϩ]ⲏⲡ+,be hidden,1-37
7,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲉⲛⲧ-,which,3-360.
8,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲁ-,(perf),3-200a.
9,(32/10)-(32/12),These are the hidden words that the living Jes...,(gk) ⲓⲥ,Iesus,4-ihsous
10,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲉⲧ-,which,3-355.


In [0]:
### look for rows which are probably not Jesus' sayings
def context_identifier(row):
  trans = row["translation"]
  for string in ["say", "Iesus"]:
    if string in trans:
      return "n"
    else:
      return "y"

got_coptic_raw["saying?"] = got_coptic_raw.apply(lambda row: context_identifier(row), axis=1)

In [0]:
### just for checking
got_coptic_raw.head(5)

In [121]:
len(got_coptic_raw["code"].unique().tolist())

621

In [122]:
len(got_coptic_raw["coptic"].unique().tolist())

950

In [0]:
set_with_dataframe(sh.add_worksheet("got_coptic_raw", cols=1, rows=1), pd.DataFrame(got_coptic_raw))

# Start with manually coded data

In [5]:
### we have manually coded the data in googlesheets
### see the variable "saying?"
### "y" - words asigned to Jesus
### "n" - rest of the text, including direct speeches pronounced by others

got_coptic_coded = get_as_dataframe(sh.worksheet("got_coptic_coded"))
got_coptic_coded

Unnamed: 0,line,context,coptic,translation,code,saying?
0,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲛⲁⲉⲓ,these (pl),3-52.,n
1,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲛⲉ,are (pl),3-306.,n
2,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲛ-,(def art pl),3-80.,n
3,(32/10)-(32/12),These are the hidden words that the living Jes...,ϣⲁϫⲉ,saying,2-3189,n
4,(32/10)-(32/12),These are the hidden words that the living Jes...,ⲉ[ⲧ]-,which,3-355.,n
...,...,...,...,...,...,...
5957,(51/27)-(51/28),The Gospel According to Thomas,ⲡ-,(def art m s),3-80.,n
5958,(51/27)-(51/28),The Gospel According to Thomas,(gk) ⲉⲩⲁⲅⲅⲉⲗⲓⲟⲛ,gospel,4-euaggelion,n
5959,(51/27)-(51/28),The Gospel According to Thomas,ⲡ-,(def art m s),3-80.,n
5960,(51/27)-(51/28),The Gospel According to Thomas,(gk) ⲕⲁⲧⲁ,according to,4-kata,n


In [6]:
### write coded data to sciencedata.dk for backup
sddk.write_file("got_coptic_coded.json", got_coptic_coded, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/dirgot_data/got_coptic_coded.json"


In [7]:
len(got_coptic_coded)

5962

In [77]:
got_coptic_sayings = got_coptic_coded[got_coptic_coded["saying?"]=="y"]
len(got_coptic_sayings)

4896

In [10]:
len(got_coptic_sayings["coptic"].unique().tolist())

882

In [11]:
len(got_coptic_sayings["code"].unique().tolist())

581

In [0]:
codes_and_terms = []
codes_unique = got_coptic_sayings["code"].unique().tolist()
for code in codes_unique:
  if "." in str(code):
    dot = True
  else:
    dot = False
  df_temp = got_coptic_sayings[got_coptic_sayings["code"]==code]
  code_count = len(df_temp)
  terms = df_temp["coptic"].unique().tolist()
  translations = df_temp["translation"].unique().tolist()
  codes_and_terms.append([code, code_count, terms, translations, dot])

In [0]:
codes_and_terms = sorted(codes_and_terms, key=lambda x: x[1], reverse=True)

In [48]:
codes_and_terms_df = pd.DataFrame(codes_and_terms)
codes_and_terms_df.columns = ["code", "count", "coptic", "translation", "dotted"]
codes_and_terms_df.head(10)

Unnamed: 0,code,count,coptic,translation,dotted
0,3-80.,385,"[[ⲧ]-, ⲡ-, ⲧ-, ⲛ-, ⲡⲉ-, ⲧⲉ-, [ⲡ]-, [ⲛ]-, ⲑ-, ⲡ...","[(def art f s), (def art m s), (def art pl)]",True
1,3-35.,347,"[-ϥ, -ⲥ, -ⲧⲛ, -ⲧⲏⲩⲧⲛ, -ⲧⲏⲛⲉ, -ⲕ, -ⲟⲩ, ϥ-, -ⲧ, ...","[(suff pron 3rd s m), (suff pron 3rd s f), (su...",True
2,3-262.,326,"[ⲛ-, ⲙ-, ⲙⲙⲟ=, [ⲛ]-, ⲛ- (?), ⲛⲛ-, ⲙⲙⲱ=, ⲙ- (?)...","[(gen), (ind obj of vb), (loc) in, (adj), (loc...",True
3,3-199a.,279,"[ϥⲛⲁ-, ⲥⲉⲛⲁ-, ⲧⲉⲧⲛⲁ-, ⲉϥⲛⲁ-, ⲉⲩⲛⲁ-, ⲁϥ-, ⲙⲁⲣⲉϥ...","[(fut 3rd s m), (fut 3rd pl), (fut 2st pl), (p...",True
4,3-188.,163,"[ⲉϥ-, ⲉⲩ-, ⲉ[ⲣⲉ]-, ⲥ-, ⲉⲧⲉⲧⲛ-, ⲧⲉⲧⲛ-, ⲥⲉ-, ⲉ[ⲣ...","[(circ 3rd s m), (pres 3rd s m), (pres 3st pl)...",True
5,3-50.,125,"[ⲡⲉⲧⲛ-, ⲛⲉϥ-, ⲡⲉⲕ-, ⲧⲉϥ-, ⲛⲉⲧⲙ-, ⲧⲉⲧⲛ-, ⲛⲁ-, ⲛ...","[(m) your (pl), (pl) his, (s m) your (m), (f) ...",True
6,3-261.,116,"[ⲉ-, ⲉⲣⲟ=, ⲁ-, ⲉⲣⲟ]","[to, for, (dat), against, about, towards]",True
7,1-536,116,"[ⲁⲩⲱ, ⲁⲩ-]",[and],False
8,3-85.,110,"[ⲟⲩ-, ⲩ-, ϩⲛ-, ϩⲉⲛ-, ϩⲙ-]","[(indef art s), (indef art pl)]",True
9,1-23,78,"[ⲣ-, ⲁⲁ=, ⲉⲓⲣⲉ, [ⲁ]ⲁ=, ⲟ+, ⲁ=, ⲉⲣⲓ-, [ⲓ]ⲣⲉ]","[act, become, befall, make, do, do!]",False


In [0]:
set_with_dataframe(sh.add_worksheet("codes_and_terms", cols=1, rows=1), codes_and_terms_df)

In [52]:
### write this dictionary to sciencedata.dk for backup
sddk.write_file("got_codes_and_terms.json", codes_and_terms_df, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/dirgot_data/got_codes_and_terms.json"


In [50]:
### let's take a look at terms without dots

codes_and_terms_filtered = codes_and_terms_df[codes_and_terms_df["dotted"]==False]
codes_and_terms_filtered

Unnamed: 0,code,count,coptic,translation,dotted
7,1-536,116,"[ⲁⲩⲱ, ⲁⲩ-]",[and],False
9,1-23,78,"[ⲣ-, ⲁⲁ=, ⲉⲓⲣⲉ, [ⲁ]ⲁ=, ⲟ+, ⲁ=, ⲉⲣⲓ-, [ⲓ]ⲣⲉ]","[act, become, befall, make, do, do!]",False
11,1-365,68,[ϫⲉ],"[:, that, because, if, whether]",False
16,2-174,47,"[ⲉⲃⲟⲗ, [ⲉⲃ]ⲟⲗ (?)]","[outward, forth, out]",False
17,1-13,41,"[ϣⲟⲟⲡ+, ϣⲱⲡⲉ]","[be, exist, become]",False
...,...,...,...,...,...
576,1-1388,1,[ⲥⲕⲁⲉⲓ],[plough],False
577,4-arxw,1,[(gk) ⲁⲣⲭⲉⲓ],[begin],False
578,1-2521,1,[ϭⲱⲗ],[roll back],False
579,1-531,1,[ⲡⲟⲣϣ+],[be spread],False


In [0]:
set_with_dataframe(sh.add_worksheet("codes_and_terms_filtered", cols=1, rows=1), codes_and_terms_filtered)

In [0]:
### write coded data to sciencedata.dk for backup
sddk.write_file("got_codes_and_terms.json", codes_and_terms_df, conf)

# Rearranging data structure

In [78]:
def check_dot(row):
  try:
    if "." in row["code"]:
      return True
    else:
      return False
  except:
    return True
got_coptic_sayings["dotted"] = got_coptic_sayings.apply(lambda row: check_dot(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [79]:
got_coptic_sayings = got_coptic_sayings[got_coptic_sayings["dotted"]==False]
got_coptic_sayings.drop(["saying?", "dotted"], axis=1, inplace=True)
got_coptic_sayings.head(10)

Unnamed: 0,line,context,coptic,translation,code
27,(32/12)-(32/14),"(1) And he said: ""Whoever finds the meaning of...",ϩⲉ,find,1-41
30,(32/12)-(32/14),"(1) And he said: ""Whoever finds the meaning of...",(gk) [ϩ]ⲉⲣⲙⲏⲛⲉⲓⲁ,explanation,4-ermhneia
35,(32/12)-(32/14),"(1) And he said: ""Whoever finds the meaning of...",ϫⲓ-,take,1-131
36,(32/12)-(32/14),"(1) And he said: ""Whoever finds the meaning of...",ϯⲡⲉ,taste,2-2305
40,(32/12)-(32/14),"(1) And he said: ""Whoever finds the meaning of...",ⲙⲟⲩ,death,2-747
45,(32/14)-(32/16),"(2) Jesus says: ""The one who seeks should not ...",ⲗⲟ,"cease, stop",1-43
46,(32/14)-(32/16),"(2) Jesus says: ""The one who seeks should not ...",ⲛϭⲓ,(nom),1-1234
48,(32/14)-(32/16),"(2) Jesus says: ""The one who seeks should not ...",ϣⲓⲛⲉ,seek,1-46
50,(32/14)-(32/16),"(2) Jesus says: ""The one who seeks should not ...",ϣⲓⲛⲉ,ask,1-46
52,(32/14)-(32/16),"(2) Jesus says: ""The one who seeks should not ...",ϭⲓⲛⲉ,find,1-47


In [84]:
got_lines_data = [] 
for line in got_coptic_sayings["line"].unique().tolist():
  line_df = got_coptic_sayings[got_coptic_sayings["line"]==line] 
  line_codes = line_df["code"].tolist()
  line_eng_terms = line_df["translation"].tolist()
  line_eng_context = line_df["context"].tolist()[0]
  got_lines_data.append([line, line_codes, line_eng_terms, line_eng_context])
got_lines_data[:5]

[['(32/12)-(32/14)',
  ['1-41', '4-ermhneia', '1-131', '2-2305', '2-747'],
  ['find', 'explanation', 'take', 'taste', 'death'],
  '(1) And he said: "Whoever finds the meaning of these words will not taste death."'],
 ['(32/14)-(32/16)',
  ['1-43',
   '1-1234',
   '1-46',
   '1-46',
   '1-47',
   '1-536',
   '4-otan',
   '1-674',
   '1-47'],
  ['cease, stop',
   '(nom)',
   'seek',
   'ask',
   'find',
   'and',
   'when',
   'if, when',
   'find'],
  '(2) Jesus says: "The one who seeks should not cease seeking until he finds. And when he finds,'],
 ['(32/16)-(32/19)',
  ['1-45',
   '1-536',
   '1-674',
   '1-45',
   '1-23',
   '1-44',
   '1-536',
   '1-23',
   '1-18',
   '1-48'],
  ['be disturbed, troubled',
   'and',
   'if, when',
   'be disturbed, troubled',
   'act',
   'amazement',
   'and',
   'act',
   'king',
   'all'],
  'he will be dismayed. And when he is dismayed, he will be astonished. And he will be king over the All."'],
 ['(32/19)-(32/23)',
  ['1-674',
   '1-1234',
   '

In [86]:
got_lines_data_df = pd.DataFrame(got_lines_data)
got_lines_data_df.columns = ["line", "codes", "eng_terms", "eng_sentence"]
got_lines_data_df.head(10)

Unnamed: 0,line,codes,eng_terms,eng_sentence
0,(32/12)-(32/14),"[1-41, 4-ermhneia, 1-131, 2-2305, 2-747]","[find, explanation, take, taste, death]","(1) And he said: ""Whoever finds the meaning of..."
1,(32/14)-(32/16),"[1-43, 1-1234, 1-46, 1-46, 1-47, 1-536, 4-otan...","[cease, stop, (nom), seek, ask, find, and, whe...","(2) Jesus says: ""The one who seeks should not ..."
2,(32/16)-(32/19),"[1-45, 1-536, 1-674, 1-45, 1-23, 1-44, 1-536, ...","[be disturbed, troubled, and, if, when, be dis...","he will be dismayed. And when he is dismayed, ..."
3,(32/19)-(32/23),"[1-674, 1-1234, 1-49, 1-365, 2-407, 1-18, 1-50...","[if, when, (nom), draw, :, behold, king, heave...","(03) Jesus says: ""If those who lead you say to..."
4,(32/23)-(32/24),"[1-674, 1-365, 4-qalassa, 1-691, 1-52, 1-23, 2...","[if, when, :, sea, then, fish, become, befall,...","If they say to you: ‘It is in the sea,’ then t..."
5,(32/25)-(32/26),"[4-alla, 1-18, 1-53, 1-536, 2-163]","[but, king, inward part, and, the outside]","Rather, the kingdom is inside of you, and outs..."
6,(32/26)-(33/1),"[4-otan, 1-674, 1-55, 4-tote, 1-55]","[when, if, when, know, at times, know]","""When you come to know yourselves, then you wi..."
7,(33/1)-(33/2),"[1-536, 1-56, 1-365, 1-20, 1-57, 1-38]","[and, know, that, child, father, live]",and you will realize that you are the children...
8,(33/2)-(33/5),"[2-3050, 4-de, 1-55, 1-691, 1-13, 2-3379, 1-53...","[if, however, know, then, be, exist, poor, and...","But if you do not come to know yourselves, the..."
9,(33/5)-(33/6),"[1-59, 1-1234, 1-21, 1-63, 1-64]","[delay, (nom), human, old, day]","(04) Jesus says: ""The person old in his days"


In [87]:
### write out for subsequent analyses
sddk.write_file("got_lines_data.json", got_lines_data_df, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/dirgot_data/got_lines_data.json"


# Overview

In [0]:
got_lines_data = sddk.read_file("got_lines_data.json", "df", conf)

In [11]:
len(got_lines_data)

273

In [6]:
one_list = []
for line in got_lines_data["codes"].tolist():
  one_list.extend(line)

['1-41',
 '4-ermhneia',
 '1-131',
 '2-2305',
 '2-747',
 '1-43',
 '1-1234',
 '1-46',
 '1-46',
 '1-47',
 '1-536',
 '4-otan',
 '1-674',
 '1-47',
 '1-45',
 '1-536',
 '1-674',
 '1-45',
 '1-23',
 '1-44',
 '1-536',
 '1-23',
 '1-18',
 '1-48',
 '1-674',
 '1-1234',
 '1-49',
 '1-365',
 '2-407',
 '1-18',
 '1-50',
 '1-691',
 '1-51',
 '1-23',
 '2-3083',
 '1-50',
 '1-674',
 '1-365',
 '4-qalassa',
 '1-691',
 '1-52',
 '1-23',
 '2-3083',
 '4-alla',
 '1-18',
 '1-53',
 '1-536',
 '2-163',
 '4-otan',
 '1-674',
 '1-55',
 '4-tote',
 '1-55',
 '1-536',
 '1-56',
 '1-365',
 '1-20',
 '1-57',
 '1-38',
 '2-3050',
 '4-de',
 '1-55',
 '1-691',
 '1-13',
 '2-3379',
 '1-536',
 '2-3379',
 '1-59',
 '1-1234',
 '1-21',
 '1-63',
 '1-64',
 '1-60',
 '1-61',
 '1-20',
 '1-62',
 '1-65',
 '1-64',
 '4-topos',
 '2-2796',
 '1-536',
 '1-38',
 '1-365',
 '1-66',
 '2-3083',
 '1-23',
 '1-67',
 '1-536',
 '1-13',
 '1-68',
 '1-69',
 '1-55',
 '1-70',
 '1-71',
 '2-174',
 '1-536',
 '1-37',
 '1-27',
 '2-174',
 '1-72',
 '4-gar',
 '1-37',
 '1-73',
 

In [0]:
stopwords_coptic = ["1-536", "1-365", "2-174", "1-674", "2-8722", "1-1234", "2-3472", "1-139", "4-de", "4-otan", "2-3485", "4-gar", "4-alla", "4-ina", "4-oude", "2-2335"]

In [8]:
len(one_list)

2203

In [10]:
len([code for code in one_list if code not in stopwords_coptic])

1751

In [13]:
np.round((1654 /  1751) * 100, 4)

94.4603

In [14]:
1751 / 273

6.413919413919414