In [1]:
import sys, os, collections
from tf.fabric import Fabric
DATABASE = '~/Programming/tf-github'
BHSA = 'bhsa/tf/2017'
TF = Fabric(locations=[DATABASE], modules=[BHSA], silent=False)

api = TF.load('''
    book chapter verse
    txt kind rela typ domain
''')
api.makeAvailableIn(globals())

This is Text-Fabric 3.1.1
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

114 features found and 0 ignored
  0.00s loading features ...
   |     0.02s B book                 from /home/jcuenod/Programming/tf-github/bhsa/tf/2017
   |     0.01s B chapter              from /home/jcuenod/Programming/tf-github/bhsa/tf/2017
   |     0.02s B verse                from /home/jcuenod/Programming/tf-github/bhsa/tf/2017
   |     0.26s T txt                  from /home/jcuenod/Programming/tf-github/bhsa/tf/2017
   |     0.05s B kind                 from /home/jcuenod/Programming/tf-github/bhsa/tf/2017
   |     0.18s B rela                 from /home/jcuenod/Programming/tf-github/bhsa/tf/2017
   |     0.18s B typ                  from /home/jcuenod/Programming/tf-github/bhsa/tf/2017
   |     0.04s B domain               from /home/j

# Set up data points

Here we set up all the data points that we will analyse (listed here mainly in order to have a handy reference).

`is_Poetry` will be the dependent variable and, hopefully we will find some kind of correlation to other points.

In [25]:
data_points = {
    "is_Poetry": 0,
    "depth": 0,
#     "txt": 0,
    "is_Kind_VC": 0,
    "is_Kind_WP": 0,
    "is_Kind_NC": 0,
    "is_Rela_ReVo": 0,
    "is_Rela_Coor": 0,
    "is_Rela_Attr": 0,
    "is_Rela_Cmpl": 0,
    "is_Rela_Resu": 0,
    "is_Rela_Adju": 0,
    "is_Rela_Objc": 0,
    "is_Rela_RgRc": 0,
    "is_Rela_Subj": 0,
    "is_Rela_PrAd": 0,
    "is_Rela_PreC": 0,
    "is_Rela_Spec": 0,
    "is_Typ_AjCl": 0,
    "is_Typ_CPen": 0,
    "is_Typ_Defc": 0,
    "is_Typ_Ellp": 0,
    "is_Typ_InfA": 0,
    "is_Typ_InfC": 0,
    "is_Typ_MSyn": 0,
    "is_Typ_NmCl": 0,
    "is_Typ_Ptcp": 0,
    "is_Typ_Reop": 0,
    "is_Typ_Unkn": 0,
    "is_Typ_Voct": 0,
    "is_Typ_Way0": 0,
    "is_Typ_WayX": 0,
    "is_Typ_WIm0": 0,
    "is_Typ_WImX": 0,
    "is_Typ_WQt0": 0,
    "is_Typ_WQtX": 0,
    "is_Typ_WxI0": 0,
    "is_Typ_WXIm": 0,
    "is_Typ_WxIX": 0,
    "is_Typ_WxQ0": 0,
    "is_Typ_WXQt": 0,
    "is_Typ_WxQX": 0,
    "is_Typ_WxY0": 0,
    "is_Typ_WXYq": 0,
    "is_Typ_WxYX": 0,
    "is_Typ_WYq0": 0,
    "is_Typ_WYqX": 0,
    "is_Typ_xIm0": 0,
    "is_Typ_XImp": 0,
    "is_Typ_xImX": 0,
    "is_Typ_XPos": 0,
    "is_Typ_xQt0": 0,
    "is_Typ_XQtl": 0,
    "is_Typ_xQtX": 0,
    "is_Typ_xYq0": 0,
    "is_Typ_XYqt": 0,
    "is_Typ_xYqX": 0,
    "is_Typ_ZIm0": 0,
    "is_Typ_ZImX": 0,
    "is_Typ_ZQt0": 0,
    "is_Typ_ZQtX": 0,
    "is_Typ_ZYq0": 0,
    "is_Typ_ZYqX": 0,
    "is_Domain_N": 0,
    "is_Domain_Q": 0,
    "is_Domain_D": 0
}

# Define some helper methods

`inScope` just limits us to the corpus for which we have defined which sections are poetic.

`isPoetry` is going to tell us whether we should define a particular node as poetic (this is based on rough guestimates eyeballing my way through an English Bible - i.e. completely unreliable at this point but good enough for a ballpark). Some things that may be worth improving here are looking at the clauses at the start of poetry because often those have wayyiqtols (or other unusual features), for example, and may significantly affect data in the bigger picture.

In [32]:
scope = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "1_Samuel", "2_Samuel", "Psalms", "Proverbs"]
def inScope(node):
    book, chapter, verse = T.sectionFromNode(node)
    return book in scope

def isPoetry(node):
    book, chapter, verse = T.sectionFromNode(node)
    if book == "Psalms" or book == "Proverbs":
        return True
    elif book == "Genesis":
        if chapter == 49 and verse > 1 and verse < 28:
            return True
    elif book == "Exodus":
        if chapter == 15 and verse < 19:
            return True
    elif book == "Numbers":
        if chapter == 21 and verse in [15, 18, 27, 28, 29, 30]:
            return True
        if chapter == 23 and ((verse > 6 and verse < 11) or (verse > 17 and verse < 24)):
            return True
        if chapter == 24 and ((verse > 2 and verse < 10) or (verse > 14 and verse < 25)):
            return True
    elif book == "Deuteronomy":
        if chapter == 28 and ((verse > 2 and verse < 7) or (verse > 15 and verse < 20)):
            return True
        if chapter == 32 and verse < 44:
            return True
        if chapter == 33:
            return True

# Build Data Set

Now we cycle through all the clauses, if they're in our scope, we add their data points (most importantly, whether or not they are poetry).

In [33]:
failures = {}
def addFail(key):
    if key not in failures:
        failures[key] = 0
    failures[key] += 1

progress = 0
data_set = []
for c in F.otype.s('clause'):
    progress += 1
    if progress % 25000 == 0:
        print(progress)
    if inScope(c):
        curr_data = data_points.copy()
        curr_data["is_Poetry"] = 1 if isPoetry(c) else 0
        
        curr_data["depth"] = len(F.txt.v(c))
#         curr_data["txt"] = F.txt.v(c)
        
        if "is_Rela_" + F.rela.v(c) in curr_data:
            curr_data["is_Rela_" + F.rela.v(c)] = 1
        else:
            addFail("is_Rela_" + F.rela.v(c))
        if "is_Domain_" + F.domain.v(c) in curr_data:
            curr_data["is_Domain_" + F.domain.v(c)] = 1
        else:
            addFail("is_Domain_" + F.domain.v(c))
        if "is_Typ_" + F.typ.v(c) in curr_data:
            curr_data["is_Typ_" + F.typ.v(c)] = 1
        else:
            addFail("is_Typ_" + F.typ.v(c))
        if "is_Kind_" + F.kind.v(c) in curr_data:
            curr_data["is_Kind_" + F.kind.v(c)] = 1
        else:
            addFail("is_Kind_" + F.kind.v(c))
        
        data_set.append(curr_data)

print("done")

25000
50000
75000
done


Now let's just check that we haven't left out any values as we added them to our data set.

In [42]:
print(failures)

{'is_Rela_NA': 29021, 'is_Domain_?': 1644}


# Build DataFrame

This is just a matter of shoving all those datapoints into a pandas object.

In [38]:
from pandas import DataFrame
pdd = DataFrame.from_records(data_set)

pdd.shape

(38049, 65)

Let's write this data to a csv file so that we can use it without doing the crazy processing all the time in the future...

In [39]:
pdd.to_csv("poetry_data.csv")

# Multiple Regression

Let's try a first attempt at multiple regression.

If you're wondering what these things mean, well so am I mostly but you can have a look here:

1. For Rela: https://etcbc.github.io/bhsa/features/hebrew/2017/rela
2. For Typ: https://etcbc.github.io/bhsa/features/hebrew/2017/typ
3. For Domain: https://etcbc.github.io/bhsa/features/hebrew/2017/domain

In [41]:
# create X and y
feature_cols = ["depth", "is_Kind_VC", "is_Kind_WP", "is_Kind_NC", "is_Rela_ReVo", "is_Rela_Coor", "is_Rela_Attr", "is_Rela_Cmpl", "is_Rela_Resu", "is_Rela_Adju", "is_Rela_Objc", "is_Rela_RgRc", "is_Rela_Subj", "is_Rela_PrAd", "is_Rela_PreC", "is_Rela_Spec", "is_Typ_AjCl", "is_Typ_CPen", "is_Typ_Defc", "is_Typ_Ellp", "is_Typ_InfA", "is_Typ_InfC", "is_Typ_MSyn", "is_Typ_NmCl", "is_Typ_Ptcp", "is_Typ_Reop", "is_Typ_Unkn", "is_Typ_Voct", "is_Typ_Way0", "is_Typ_WayX", "is_Typ_WIm0", "is_Typ_WImX", "is_Typ_WQt0", "is_Typ_WQtX", "is_Typ_WxI0", "is_Typ_WXIm", "is_Typ_WxIX", "is_Typ_WxQ0", "is_Typ_WXQt", "is_Typ_WxQX", "is_Typ_WxY0", "is_Typ_WXYq", "is_Typ_WxYX", "is_Typ_WYq0", "is_Typ_WYqX", "is_Typ_xIm0", "is_Typ_XImp", "is_Typ_xImX", "is_Typ_XPos", "is_Typ_xQt0", "is_Typ_XQtl", "is_Typ_xQtX", "is_Typ_xYq0", "is_Typ_XYqt", "is_Typ_xYqX", "is_Typ_ZIm0", "is_Typ_ZImX", "is_Typ_ZQt0", "is_Typ_ZQtX", "is_Typ_ZYq0", "is_Typ_ZYqX", "is_Domain_N", "is_Domain_Q", "is_Domain_D"]
# feature_cols = ["depth", "is_Kind_VC", "is_Kind_WP", "is_Kind_NC", "is_Domain_N", "is_Domain_Q", "is_Domain_D"]

X = pdd[feature_cols]
y = pdd.is_Poetry

# follow the usual sklearn pattern: import, instantiate, fit
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X, y)

# print intercept and coefficients
print(lm.intercept_)
list(zip(feature_cols, lm.coef_))

-1.73241608598e+12


[('depth', -0.1603949846336849),
 ('is_Kind_VC', 1613193902255.606),
 ('is_Kind_WP', 1768328350048.6147),
 ('is_Kind_NC', 2114820553912.6621),
 ('is_Rela_ReVo', 0.33807960551732996),
 ('is_Rela_Coor', 0.1729376975301056),
 ('is_Rela_Attr', -0.16340225488908011),
 ('is_Rela_Cmpl', -0.02309672770771698),
 ('is_Rela_Resu', -0.09634129007856948),
 ('is_Rela_Adju', -0.13481735448065013),
 ('is_Rela_Objc', -0.10665372343821609),
 ('is_Rela_RgRc', -0.10514376506693368),
 ('is_Rela_Subj', 0.10189633140753315),
 ('is_Rela_PrAd', -0.21370210408573992),
 ('is_Rela_PreC', -0.067629170116258713),
 ('is_Rela_Spec', -0.02406241698109348),
 ('is_Typ_AjCl', -382404467928.60931),
 ('is_Typ_CPen', -35912264064.676376),
 ('is_Typ_Defc', 31310101344.212196),
 ('is_Typ_Ellp', -35912264064.536659),
 ('is_Typ_InfA', 119222183728.36572),
 ('is_Typ_InfC', 119222183728.41867),
 ('is_Typ_MSyn', -35912264064.622314),
 ('is_Typ_NmCl', -382404467928.65149),
 ('is_Typ_Ptcp', 119222183728.48151),
 ('is_Typ_Reop', -359