In [1]:
### Prerequisities packages

import os ## to work with files
import pandas as pd ## dataframe tables
import numpy as np ## math
import plotly.plotly as py ## plots ["plotly" has to be installed, ideally with pip: https://plot.ly/python/getting-started/]
import plotly.graph_objs as go
import nltk ## natural language processing; for some functions, you have firstly download the data by nltk.download() function
import nltk.corpus  
from nltk.text import Text ## to transform text into a right format

In [2]:
### uploading the data from the same folder

GNT_df = pd.read_csv("../data_input/Nestle1904.csv", delim_whitespace=True, index_col=False, skiprows=1, names=("book", "ch_v", "func_morph", "pos1", "pos2", "strong", "lemma", "normalized"))
GNT_df

Unnamed: 0,book,ch_v,func_morph,pos1,pos2,strong,lemma,normalized
0,Matt,1:1,Βίβλος,N-NSF,N-NSF,976,βίβλος,Βίβλος
1,Matt,1:1,γενέσεως,N-GSF,N-GSF,1078,γένεσις,γενέσεως
2,Matt,1:1,Ἰησοῦ,N-GSM,N-GSM,2424,Ἰησοῦς,Ἰησοῦ
3,Matt,1:1,Χριστοῦ,N-GSM,N-GSM,5547,Χριστός,Χριστοῦ
4,Matt,1:1,υἱοῦ,N-GSM,N-GSM,5207,υἱός,υἱοῦ
5,Matt,1:1,Δαυεὶδ,N-PRI,N-PRI,1138,Δαυίδ,Δαυείδ
6,Matt,1:1,υἱοῦ,N-GSM,N-GSM,5207,υἱός,υἱοῦ
7,Matt,1:1,Ἀβραάμ.,N-PRI,N-PRI,11,Ἀβραάμ,Ἀβραάμ
8,Matt,1:2,Ἀβραὰμ,N-PRI,N-PRI,11,Ἀβραάμ,Ἀβραάμ
9,Matt,1:2,ἐγέννησεν,V-AAI-3S,V-AAI-3S,1080&5656,γεννάω,ἐγέννησεν


In [3]:
### A short demonstration of the data structure on Matt 1:18

pd.concat([GNT_df[283:301]["func_morph"], GNT_df[275:301]["lemma"], GNT_df[275:301]["pos1"]], axis=1)

Unnamed: 0,func_morph,lemma,pos1
275,,ὁ,T-GSM
276,,δέ,CONJ
277,,Ἰησοῦς,N-GSM
278,,Χριστός,N-GSM
279,,ὁ,T-NSF
280,,γένεσις,N-NSF
281,,οὕτω,ADV
282,,εἰμί,V-IAI-3S
283,μνηστευθείσης,μνηστεύω,V-APP-GSF
284,τῆς,ὁ,T-GSF


In [5]:
## one verse as a normal text

" ".join(GNT_df[275:317]["func_morph"].tolist())

### the original: Τοῦ δὲ Ἰησοῦ Χριστοῦ ἡ γένεσις οὕτως ἦν. μνηστευθείσης τῆς μητρὸς αὐτοῦ Μαρίας τῷ Ἰωσήφ, πρὶν ἢ συνελθεῖν αὐτοὺς εὑρέθη ἐν γαστρὶ ἔχουσα ἐκ Πνεύματος Ἁγίου. Ἰωσὴφ δὲ ὁ ἀνὴρ αὐτῆς, δίκαιος ὢν καὶ μὴ θέλων αὐτὴν δειγματίσαι, ἐβουλήθη λάθρᾳ ἀπολῦσαι αὐτήν.'

### the reducedt text has the following form in this case:
### ἰησοῦς χριστός γένεσις οὕτω εἰμί μνηστεύω μήτηρ μαρία ἰωσήφ πρίν συνέρχομαι εὑρίσκω ἐν γαστήρ ἔχω ἐκ πνεῦμα ἅγιος ἰωσήφ ἀνήρ δίκαιος εἰμί μή θέλω δειγματίζω βούλομαι λάθρᾳ ἀπολύω

'Τοῦ δὲ Ἰησοῦ Χριστοῦ ἡ γένεσις οὕτως ἦν. μνηστευθείσης τῆς μητρὸς αὐτοῦ Μαρίας τῷ Ἰωσήφ, πρὶν ἢ συνελθεῖν αὐτοὺς εὑρέθη ἐν γαστρὶ ἔχουσα ἐκ Πνεύματος Ἁγίου. Ἰωσὴφ δὲ ὁ ἀνὴρ αὐτῆς, δίκαιος ὢν καὶ μὴ θέλων αὐτὴν δειγματίσαι, ἐβουλήθη λάθρᾳ ἀπολῦσαι αὐτήν.'

In [6]:
# A list of unique word forms

# open question: how to approach pronouns - the of "we" is here "ego"
#  GNT_df[GNT_df.pos1.str.startswith("P-1")]

GNT_df.pos1.unique().tolist()

['N-NSF',
 'N-GSF',
 'N-GSM',
 'N-PRI',
 'V-AAI-3S',
 'T-ASM',
 'CONJ',
 'N-ASM',
 'T-APM',
 'N-APM',
 'P-GSM',
 'N-NSM',
 'PREP',
 'T-GSF',
 'T-GSM',
 'T-ASF',
 'N-ASF',
 'R-GSF',
 'V-API-3S',
 'T-NSM',
 'V-PPP-NSM',
 'A-NPF',
 'T-NPF',
 'N-NPF',
 'ADV',
 'T-NSF',
 'V-IAI-3S',
 'V-APP-GSF',
 'T-DSM',
 'PRT',
 'V-2AAN',
 'P-APM',
 'N-DSF',
 'V-PAP-NSF',
 'N-GSN',
 'A-GSN',
 'P-GSF',
 'A-NSM',
 'V-PAP-NSM',
 'PRT-N',
 'P-ASF',
 'V-AAN',
 'V-AOI-3S',
 'D-APN',
 'V-AOP-GSM',
 'V-2AMM-2S',
 'N-OI',
 'V-2API-3S',
 'P-DSM',
 'V-AOS-2S',
 'P-2GS',
 'T-NSN',
 'P-DSF',
 'V-APP-NSN',
 'V-PAI-3S',
 'V-FDI-3S',
 'V-FAI-2S',
 'T-ASN',
 'N-ASN',
 'P-NSM',
 'V-FAI-3S',
 'T-GPF',
 'N-GPF',
 'P-GPM',
 'D-NSN',
 'A-NSN',
 'V-2RAI-3S',
 'V-APS-3S',
 'V-PAP-GSM',
 'V-FAI-3P',
 'R-NSN',
 'V-PPP-NSN',
 'P-1GP',
 'V-APP-NSM',
 'V-2AAI-3S',
 'R-GSM',
 'V-APP-GSM',
 'N-DPF',
 'N-NPM',
 'V-2ADI-3P',
 'N-APN',
 'V-PAP-NPM',
 'ADV-I',
 'T-GPM',
 'A-GPM',
 'V-2AAI-1P',
 'T-DSF',
 'V-AAP-NSM',
 'A-NSF',
 'V-2AAP-NS

In [7]:
## The text as a list of lemmata

GNT_lemmata = GNT_df["lemma"].tolist()

In [8]:
# selecting the relevant words in the dataframe

# this version does not include pronouns ("P-1", "P-2")
GNT_reduced = GNT_df[GNT_df["pos1"].str.startswith(("N-","V-","A-","ADV", "PRT-N", "PREP"))]


# Make a lowercase list from the sellected lemmata

GNT_working = GNT_reduced["lemma"].tolist()

# Make a a string
GNT_working = " ".join(GNT_working)
GNT_working = GNT_working.lower()
GNT_working


'βίβλος γένεσις ἰησοῦς χριστός υἱός δαυίδ υἱός ἀβραάμ ἀβραάμ γεννάω ἰσαάκ ἰσαάκ γεννάω ἰακώβ ἰακώβ γεννάω ἰούδας ἀδελφός ἰούδας γεννάω φάρες ζάρα ἐκ θαμάρ φάρες γεννάω ἑσρώμ ἑσρώμ γεννάω ἀράμ ἀράμ γεννάω ἀμιναδάβ ἀμιναδάβ γεννάω ναασσών ναασσών γεννάω σαλμών σαλμών γεννάω βόες ἐκ ῥαχάβ βόες γεννάω ἰωβήδ ἐκ ῥούθ ἰωβήδ γεννάω ἰεσσαί ἰεσσαί γεννάω δαυίδ βασιλεύς δαυίδ γεννάω σολομών ἐκ οὐρίας σολομών γεννάω ῥοβοάμ ῥοβοάμ γεννάω ἀβιά ἀβιά γεννάω ἀσάφ ἀσάφ γεννάω ἰωσαφάτ ἰωσαφάτ γεννάω ἰωράμ ἰωράμ γεννάω ὀζίας ὀζίας γεννάω ἰωαθάμ ἰωαθάμ γεννάω ἀχάζ ἀχάζ γεννάω ἑζεκίας ἑζεκίας γεννάω μανασσῆ μανασσῆ γεννάω ἀμώς ἀμώς γεννάω ἰωσίας ἰωσίας γεννάω ἰεχονίας ἀδελφός ἐπί μετοικεσία βαβυλών μετά μετοικεσία βαβυλών ἰεχονίας γεννάω σαλαθιήλ σαλαθιήλ γεννάω ζοροβαβέλ ζοροβαβέλ γεννάω ἀβιούδ ἀβιούδ γεννάω ἐλιακίμ ἐλιακίμ γεννάω ἀζώρ ἀζώρ γεννάω σαδώκ σαδώκ γεννάω ἀχίμ ἀχίμ γεννάω ἐλιούδ ἐλιούδ γεννάω ἐλεάζαρ ἐλεάζαρ γεννάω ματθάν ματθάν γεννάω ἰακώβ ἰακώβ γεννάω ἰωσήφ ἀνήρ μαρία ἐκ γεννάω ἰησοῦς λέγω χρ

In [10]:
## The New Testament books

GNT_df.book.unique().tolist()

['Matt',
 'Mark',
 'Luke',
 'John',
 'Acts',
 'Rom',
 '1Cor',
 '2Cor',
 'Gal',
 'Eph',
 'Phil',
 'Col',
 '1Thess',
 '2Thess',
 '1Tim',
 '2Tim',
 'Titus',
 'Phlm',
 'Heb',
 'Jas',
 '1Pet',
 '2Pet',
 '1John',
 '2John',
 '3John',
 'Jude',
 'Rev']

In [11]:
### to group the texts according to their datings:

GNT_50 = " ".join(GNT_reduced[GNT_reduced["book"].str.startswith(('1Thess', 'Phlm', 'Rom', '1Cor', '2Cor', 'Gal'))]["lemma"].tolist())
GNT_60 = " ".join(GNT_reduced[GNT_reduced["book"].str.startswith(('Phil', 'Jas', 'Heb', 'Mark'))]["lemma"].tolist())
GNT_70 = " ".join(GNT_reduced[GNT_reduced["book"].str.startswith(('Matt'))]["lemma"].tolist())
GNT_80 = " ".join(GNT_reduced[GNT_reduced["book"].str.startswith(('Col', 'Luke', '2Thess', '2Pet'))]["lemma"].tolist())
GNT_90 = " ".join(GNT_reduced[GNT_reduced["book"].str.startswith(('Acts', 'Eph', 'Jude', 'Rev', 'John', '1Pet'))]["lemma"].tolist())
GNT_100 = " ".join(GNT_reduced[GNT_reduced["book"].str.startswith(('1John'))]["lemma"].tolist())
GNT_110 = " ".join(GNT_reduced[GNT_reduced["book"].str.startswith(('1Tim', '2Tim', '2John', '3John', 'Titus'))]["lemma"].tolist())

GNT_dated = [GNT_50, GNT_60, GNT_70, GNT_80, GNT_90, GNT_100, GNT_110]

In [12]:
### Replacement function to be used on our subsets

to_replace = {"πνεῦμα ἅγιος": "πνεῦμα_ἅγιον", "ἅγιος πνεῦμα": "πνεῦμα_ἅγιον"}
def replace_all(text, to_replace):
    for i, j in to_replace.items():
        text = text.replace(i, j)
    return text

In [17]:
## Generating subset data - the texts sorted by their dating

GNT_subsets_counts = []  ### a list to be filled by the data (sub-lists)
for GNT_subset in GNT_dated:
    GNT_subset = replace_all(GNT_subset, to_replace).split()
    ### add raw to the table with all the staff
    GNT_subsets_counts.append([len(GNT_subset), GNT_subset.count("πνεῦμα") + GNT_subset.count("πνεῦμα_ἅγιον"), ((GNT_subset.count("πνεῦμα") + GNT_subset.count("πνεῦμα_ἅγιον")) / float(len(GNT_subset))), GNT_subset.count("πνεῦμα_ἅγιον"), (GNT_subset.count("πνεῦμα_ἅγιον") / float(len(GNT_subset)))])
GNT_subsets_counts

GNT_subsets_counts_df = pd.DataFrame(GNT_subsets_counts, columns=["subset length", "FD πνεῦμα", "NormFD πνεῦμα", "FD πν_ἅγ", "NormFD πν_ἅγ"], index=["GNT_50", "GNT_60", "GNT_70", "GNT_80", "GNT_90", "GNT_100", "GNT_110"])
GNT_subsets_counts_df = GNT_subsets_counts_df.round(5)
GNT_subsets_counts_df

Unnamed: 0,subset length,FD πνεῦμα,NormFD πνεῦμα,FD πν_ἅγ,NormFD πν_ἅγ
GNT_50,13662,115,0.00842,13,0.00095
GNT_60,11856,42,0.00354,9,0.00076
GNT_70,10826,19,0.00176,4,0.00037
GNT_80,13664,42,0.00307,13,0.00095
GNT_90,28858,142,0.00492,46,0.00159
GNT_100,1198,12,0.01002,0,0.0
GNT_110,2665,7,0.00263,2,0.00075


In [19]:
## drawing the table

trace = go.Table(
    columnwidth = [180,180,180,180,180,180],
    header=dict(values=["subset", "subset length", "FD πνεῦμα", "NormFD πνεῦμα", "FD πν_ἅγ","NormFD πν_ἅγ"],
                fill = dict(color='#C2D4FF'),
                align = ['left'] * 3),
    cells=dict(values=[GNT_subsets_counts_df.index, GNT_subsets_counts_df["subset length"], GNT_subsets_counts_df["FD πνεῦμα"], GNT_subsets_counts_df["NormFD πνεῦμα"], GNT_subsets_counts_df["FD πν_ἅγ"], GNT_subsets_counts_df["NormFD πν_ἅγ"]],
               fill = dict(color='#F5F8FF'),
               align = ['left'] * 3))
layout = dict(width=550, height=650)
data = [trace]
fig = dict(data=data, layout=layout)
py.iplot(fig, filename = 'pandas_table')

In [25]:
### drawing a graph
### two axes: https://plot.ly/python/multiple-axes/
### bar and scatter at the same time https://plot.ly/python/graphing-multiple-chart-types/


import plotly.plotly as py
import plotly.graph_objs as go

trace1 = go.Bar(
    x=GNT_subsets_counts_df.index,
    y=GNT_subsets_counts_df["subset length"],
    name= "Subset wordcount",
    marker=dict(
        color='rgb(204,204,204)',
    )
)
trace2 = go.Scatter(
    x=GNT_subsets_counts_df.index,
    y=GNT_subsets_counts_df["FD πνεῦμα"],
    name = "Frequency of the term πνεῦμα",
    yaxis="y2"
)
trace3 = go.Scatter(
    x=GNT_subsets_counts_df.index,
    y=GNT_subsets_counts_df["FD πν_ἅγ"],
    name = "Frequency of the term πνεῦμα_ἅγιον",
    yaxis="y2"
)

data = [trace1, trace2, trace3]
layout = go.Layout(
    title='Total usage per decade',
    yaxis=dict(
        title='subset length',
        range=[0, 30000]
    ),
    yaxis2=dict(
        title='term frequency',
        range=[0, 300],
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    ),
    legend=dict(
        x=0.2,
        y=1,
        traceorder='reversed',
        font=dict(
            family='sans-serif',
            size=12,
            color='#000'
        ),
    )
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-line')

In [23]:
### drawing a graph
### two axes: https://plot.ly/python/multiple-axes/
### bar and scatter at the same time https://plot.ly/python/graphing-multiple-chart-types/


import plotly.plotly as py
import plotly.graph_objs as go


trace0 = go.Scatter(
    x=GNT_subsets_counts_df.index,
    y=GNT_subsets_counts_df["NormFD πνεῦμα"],
    name = "Normalized frequency of the term πνεῦμα"
)
trace1 = go.Scatter(
    x=GNT_subsets_counts_df.index,
    y=GNT_subsets_counts_df["NormFD πν_ἅγ"],
    name = "Normalized frequency of the term πνεῦμα_ἅγιον",

)

data = [trace0, trace1]
layout = go.Layout(
    title='Total usage per decade (normalized)',
    legend=dict(
        x=0.2,
        y=1,
        traceorder='reversed',
        font=dict(
            family='sans-serif',
            size=12,
            color='#000'
            ),
        )
    )


fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-line')

In [26]:
### drawing a graph
### two axes: https://plot.ly/python/multiple-axes/
### bar and scatter at the same time https://plot.ly/python/graphing-multiple-chart-types/


import plotly.plotly as py
import plotly.graph_objs as go


trace0 = go.Scatter(
    x=GNT_subsets_counts_df.index,
    y=GNT_subsets_counts_df["NormFD πν_ἅγ"],
    name = "Normalized frequency of the term πνεῦμα"
)

data = [trace0]
layout = go.Layout(
    title='Total usage per decade (normalized)',
    legend=dict(
        x=0.2,
        y=1,
        traceorder='reversed',
        font=dict(
            family='sans-serif',
            size=12,
            color='#000'
            ),
        )
    )


fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-line')

In [27]:
# Replacement function: a set of word pairs defined in the dictionary "to_replace"

to_replace = {"πνεῦμα ἅγιος": "πνεῦμα_ἅγιον", "ἅγιος πνεῦμα": "πνεῦμα_ἅγιον"}
def replace_all(text, to_replace):
    for i, j in to_replace.items():
        text = text.replace(i, j)
    return text

# New output data with the replaced terms
GNT_repl = replace_all(GNT_working, to_replace).split()

In [28]:
# Change the format: from str to nltk.text
 
GNT_repl = Text(GNT_repl)

In [29]:
# The normal concordance produces stdout only
# To put the output into a new variable we need a modified version of it 

def concordance(ci, word, width=80, lines=500):
    """
    Rewrite of nltk.text.ConcordanceIndex.print_concordance that returns results
    instead of printing them. 

    See:
    https://stackoverflow.com/questions/47649987/how-to-save-nltk-concordance-results-in-a-list
    
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4 # approx number of words of context

    results = []
    offsets = ci.offsets(word)
    if offsets:
        lines = min(lines, len(offsets))
        for i in offsets:
            if lines <= 0:
                break
            left = (' ' * half_width +
                    ' '.join(ci._tokens[i-context:i]))
            right = ' '.join(ci._tokens[i+1:i+context])
            left = left[-half_width:]
            right = right[:half_width]
            results.append('%s %s %s' % (left, ci._tokens[i], right))
            lines -= 1

    return results

In [30]:
### CONCORDANCE HOLY SPIRIT ("πνεῦμα ἅγιος and ἅγιος πνεῦμα)

from  nltk.text import ConcordanceIndex
ci = ConcordanceIndex(GNT_repl.tokens)
results = concordance(ci, 'πνεῦμα_ἅγιον')

concordance_holy_spirit = results

### The obtained concordance list is based on characters (150 characters window)
### As we are interested in th whole words only, we modify each line (i.e. concordance) into a list of terms
### and cut off the first and last words, which are often incomplete

conc_whole_words = []
for line in concordance_holy_spirit:
    conc_whole_words.append(" ".join(line.split()[1:-1]))
print(conc_whole_words)



['εὑρίσκω ἐν γαστήρ ἔχω ἐκ πνεῦμα_ἅγιον ἰωσήφ ἀνήρ δίκαιος εἰμί μή θέλω', 'ὑπόδημα βαστάζω βαπτίζω ἐν πνεῦμα_ἅγιον πῦρ πτύον ἐν χείρ διακαθαρίζω', 'υἱός ἄνθρωπος ἀφίημι λέγω κατά πνεῦμα_ἅγιον οὐ ἀφίημι ἐν αἰών ἐν μέλλω', 'βαπτίζω εἰς ὄνομα πατήρ υἱός πνεῦμα_ἅγιον διδάσκω τηρέω πᾶς ἐντέλλομαι', 'ὑπόδημα βαπτίζω ὕδωρ βαπτίζω πνεῦμα_ἅγιον γίνομαι ἐν ἡμέρα ἔρχομαι ἰησοῦς', 'βλασφημέω βλασφημέω εἰς πνεῦμα_ἅγιον οὐ ἔχω ἄφεσις εἰς αἰών ἔνοχος', 'υἱός δαυίδ εἰμί δαυίδ λέγω ἐν πνεῦμα_ἅγιον λέγω κύριος κύριος κάθημαι ἐκ', 'ἐν ὥρα λαλέω οὐ εἰμί λαλέω πνεῦμα_ἅγιον παραδίδωμι ἀδελφός ἀδελφός εἰς', 'κύριος οἶνος σίκερα οὐ μή πίνω πνεῦμα_ἅγιον πίμπλημι ἔτι ἐκ κοιλία μήτηρ', 'ἀποκρίνομαι ἄγγελος λέγω πνεῦμα_ἅγιον ἐπέρχομαι ἐπί δύναμις ὕψιστος', 'βρέφος ἐν κοιλία πίμπλημι πνεῦμα_ἅγιον ἐλισάβετ ἀναφωνέω κραυγή μέγας', 'μετά ζαχαρίας πατήρ πίμπλημι πνεῦμα_ἅγιον προφητεύω λέγω εὐλογητός κύριος', 'ἅγιος ἐπί εἰμί χρηματίζω ὑπό πνεῦμα_ἅγιον μή ὁράω θάνατος πρίν ὁράω', 'λύω ἱμάς ὑπόδημα βαπτίζω ἐν πνεῦμα_ἅγιο

In [31]:
### good to know how many words remain in each line (excluding πνεῦμα_ἅγιον)

words_in_conc = []
for line in conc_whole_words:
    words_in_conc.append((len(line.split()) -1))
words_in_conc_df = pd.DataFrame(words_in_conc)

### desriptive stat is enough for us at this point
words_in_conc_df.describe()

Unnamed: 0,0
count,87.0
mean,9.172414
std,1.212406
min,6.0
25%,8.5
50%,9.0
75%,10.0
max,12.0


In [32]:
### CONCORDANCE NORMAL SPIRIT
### Alternatively, we can work with all instances of pneuma: GNT_working = Text(GNT_working.split())

ci = ConcordanceIndex(GNT_repl.tokens)
results = concordance(ci, 'πνεῦμα')

concordance_pneuma = results

### The obtained concordance list is based on characters (150 characters window)
### As we are interested in th whole words only, we modify each line (i.e. concordance) into a list of terms
### and cut off the first and last words, which are often incomplete

conc_pneuma_whole_words = []
for line in concordance_pneuma:
    conc_pneuma_whole_words.append(" ".join(line.split()[1:-1]))
print(conc_pneuma_whole_words)


['μαρία γυνή ἐν γεννάω ἐκ πνεῦμα εἰμί ἅγιος τίκτω υἱός καλέω ὄνομα', 'ἀπό ὕδωρ ὁράω ἀνοίγω οὐρανός ὁράω πνεῦμα θεός καταβαίνω ὡσεί περιστερά', 'τότε ἰησοῦς ἀνάγω εἰς ἔρημος ὑπό πνεῦμα πειράζω ὑπό διάβολος νηστεύω ἡμέρα', 'στόμα διδάσκω λέγω μακάριος πτωχός πνεῦμα εἰμί βασιλεία οὐρανός μακάριος', 'δαιμονίζομαι πολύς ἐκβάλλω πνεῦμα λόγος πᾶς κακῶς ἔχω θεραπεύω ὅπως', 'δώδεκα μαθητής δίδωμι ἐξουσία πνεῦμα ἀκάθαρτος ἐκβάλλω θεραπεύω πᾶς', 'δίδωμι ἐν ὥρα λαλέω οὐ εἰμί λαλέω πνεῦμα πατήρ λαλέω ἐν παραδίδωμι ἀδελφός', 'ἀγαπητός εὐδοκέω ψυχή τίθημι πνεῦμα ἐπί κρίσις ἔθνος ἀπαγγέλλω οὐ', 'υἱός ἐν ἐκβάλλω διά κριτής εἰμί ἐν πνεῦμα θεός ἐκβάλλω δαιμόνιον φθάνω ἐπί', 'ἁμαρτία βλασφημία ἀφίημι ἄνθρωπος πνεῦμα βλασφημία οὐ ἀφίημι λέγω λόγος', 'ὁράω πολύς σολομών ὧδε ἀκάθαρτος πνεῦμα ἐξέρχομαι ἀπό ἄνθρωπος διέρχομαι', 'παραλαμβάνω μετά ἑπτά ἕτερος πνεῦμα πονηρός εἰσέρχομαι κατοικέω ἐκεῖ', 'εἰμί λέγω δαυίδ λέγω πῶς δαυίδ ἐν πνεῦμα καλέω κύριος λέγω λέγω κύριος', 'μή εἰσέρχομαι εἰς πειρασμός πνεῦμα πρό

In [33]:
## Decriptive stat of the concordance

### number of words in lines (good to know how the concordance is)
words_in_conc = []
for line in conc_pneuma_whole_words:
    words_in_conc.append((len(line.split()) -1))
words_in_conc_df = pd.DataFrame(words_in_conc)

### desriptive stat summary is enough for us at this point
words_in_conc_df.describe()

Unnamed: 0,0
count,292.0
mean,10.041096
std,1.408744
min,6.0
25%,9.0
50%,10.0
75%,11.0
max,14.0


In [34]:
# Now we can focus on the concordances as lists of words

conc_holy_spirit_list = [word for line in conc_whole_words for word in line.split()]
# At the same time, we can remove πνεῦμα_ἅγιον from our lists
conc_holy_spirit_list = list(filter(lambda x: x!= "πνεῦμα_ἅγιον", conc_holy_spirit_list))


conc_normal_spirit_list = [word for line in conc_pneuma_whole_words for word in line.split()]
# At the same time, we can remove πνεῦμα_ἅγιον from our lists
conc_normal_spirit_list = list(filter(lambda x: x!= "πνεῦμα", conc_normal_spirit_list))


In [35]:
### Normalized frequency distribution of the holy spirit concordance data (i.e. normalized BAG OF WORDS)
conc_holy_spirit_fd = nltk.FreqDist([word for word in conc_holy_spirit_list])
for word in conc_holy_spirit_fd:
    conc_holy_spirit_fd[word] /= float(len(conc_holy_spirit_list))

### Normalized frequency distribution of the normal spirit concordance data
conc_spirit_fd = nltk.FreqDist([word for word in conc_normal_spirit_list])
for word in conc_spirit_fd:
    conc_spirit_fd[word] /= float(len(conc_normal_spirit_list))
    
    
### Normalized freqeuncy distribution of respective the GNT data
GNT_working_norm_fd = nltk.FreqDist([word for word in GNT_working.split()])
GNT_working_len = len(GNT_working.split())
for word in GNT_working_norm_fd:
    GNT_working_norm_fd[word] /= float(GNT_working_len)

    
conc_holy_spirit_df = pd.DataFrame(conc_holy_spirit_fd.most_common(20), columns=["lemma", "conc_HS"])
conc_spirit_df = pd.DataFrame(conc_spirit_fd.most_common(1000), columns=["lemma", "conc_spirit"])
GNT_fd_df = pd.DataFrame(GNT_working_norm_fd.most_common(1000), columns=["lemma", "GNT"])

fd_pd_combined = pd.merge(conc_holy_spirit_df, conc_spirit_df, how="left", on="lemma")
fd_pd_combined = pd.merge(fd_pd_combined, GNT_fd_df, how="left", on="lemma")


### if you wish a better graph change to "ascending=True" 
fd_pd_combined = fd_pd_combined.sort_values(by=["conc_HS"], ascending=True)
fd_pd_combined = fd_pd_combined.round(5)
fd_pd_combined

Unnamed: 0,lemma,conc_HS,conc_spirit,GNT
19,δίδωμι,0.00877,0.01009,0.005
17,δύναμις,0.01003,0.00348,0.00144
16,πίμπλημι,0.01003,,0.00029
15,πατήρ,0.01003,0.00626,0.00497
14,χείρ,0.01003,0.00139,0.00213
13,μή,0.01003,0.01496,0.0128
18,ἀπό,0.01003,0.0087,0.00781
12,κύριος,0.01128,0.01218,0.00867
11,διά,0.01253,0.01079,0.00805
10,λαμβάνω,0.01253,0.00557,0.00312


In [36]:
## just to look at the most common 50 in HS concordance

pd.DataFrame(conc_spirit_fd.most_common(50), columns=["lemma", "conc_spirit"])

Unnamed: 0,lemma,conc_spirit
0,ἐν,0.058803
1,εἰμί,0.036882
2,θεός,0.027488
3,οὐ,0.022965
4,λέγω,0.022269
5,εἰς,0.017397
6,ἐκ,0.016701
7,σάρξ,0.015658
8,μή,0.014962
9,ἰησοῦς,0.014614


In [37]:
### GRAPH
### generated by means of Plotly

data = [
     go.Bar(
        y=fd_pd_combined["lemma"],
        x=fd_pd_combined["GNT"],
        name= "NormFD of the term in the GNT text",
        orientation = 'h'
        ),   
    go.Bar(
       y=fd_pd_combined["lemma"],
        x=fd_pd_combined["conc_spirit"],
        name= "NormFD of the term in πνεῦμα conc.",
        orientation = 'h'
        ),
    go.Bar(
        y=fd_pd_combined["lemma"],
        x=fd_pd_combined["conc_HS"],
        name= "NormFD of the term in πνεῦμα_ἅγιον conc.",
        orientation = 'h',
        ),
    ]

layout = go.Layout(
    barmode='group',
    title='Holy Spirit Concorandce Distributions',
    autosize=False,
    width=700,
    height=700,
    legend=dict(
        x=0.4,
        y=0.05,
        traceorder='reversed',
        font=dict(
            family='sans-serif',
            size=12,
            color='#000'
        ),
        ## bgcolor='#E2E2E2',
        ## bordercolor='#FFFFFF',
        ## borderwidth=2
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='holy_spirit')

In [38]:
## drawing table of the same

trace = go.Table(
    columnwidth = [100,180,180,180],
    header=dict(values=["lemma", "normFD πν_ἅγ", "normFD πνεῦμα", "normFD GNT"], ###fd_pd_combined.columns],
                fill = dict(color='#C2D4FF'),
                align = ['left'] * 3),
    cells=dict(values=[fd_pd_combined.lemma, fd_pd_combined.conc_HS, fd_pd_combined.conc_spirit, fd_pd_combined.GNT],
               fill = dict(color='#F5F8FF'),
               align = ['left'] * 3))
layout = dict(width=550, height=650)
data = [trace]
fig = dict(data=data, layout=layout)
py.iplot(fig, filename = 'pandas_table')