In [126]:
import nltk
import pandas as pd

from collections import Counter

In [127]:
lines_per_book = [444, 433, 497, 847, 493, 331, 347, 586, 566] # up to book 9
west_redacted = ['1.171', '2.407', '3.131', '4.248']

In [128]:
gr_char = []
for i in range(880, 1024): # greek letters
    if i!=903: # punctuation
        gr_char.append(i)
for i in range(7936, 8192): # greek extended
    gr_char.append(i)

In [129]:
instances = pd.read_csv('ship_instances.csv')
instances.head()

Unnamed: 0,book,line,ship,epithet_gr,epithet_en,number,case,clause,scansion
0,1,61,νηυσὶ,,,p,d,οὔ νύ τ’ Ὀδυσσεύς | Ἀργείων παρὰ νηυσὶ χαρίζετ...,-- -uu -uu -uu -uu --
1,1,171,νηὸς,,,s,g,ὁπποίης δ’ ἐπὶ νηὸς ἀφίκεο;,-- -uu -uu -uu -uu --
2,1,182,νηῒ,,,s,d,νῦν δ’ ὧδε ξὺν νηῒ κατήλυθον ἠδ’ ἑτάροισιν | π...,-- -- -uu -uu -uu --
3,1,185,νηῦς,,,s,n,"νηῦς δέ μοι ἥδ’ ἕστηκεν ἐπ’ ἀγροῦ νόσφι πόληος,",-uu -uu -uu -- -uu --
4,1,211,νηυσίν,κοίληις,hollow,p,d,ἔνθά περ ἄλλοι | Ἀργείων οἱ ἄριστοι ἔβαν κοίλη...,-- -uu -uu -- -uu --


In [130]:
## set column types
instances['book'] = instances['book'].astype(int)
instances['line'] = instances['line'].astype(int)
instances['ship'] = instances['ship'].astype(str)
instances['epithet_gr'] = instances['epithet_gr'].astype(str)
instances['epithet_en'] = instances['epithet_en'].astype(str)
instances['number'] = instances['number'].astype(str)
instances['case'] = instances['case'].astype(str)
instances['clause'] = instances['clause'].astype(str)
instances['scansion'] = instances['scansion'].astype(str)

In [131]:
## count occurences of the label
gr_dict = Counter(instances['epithet_gr'])
en_dict = Counter(instances['epithet_en'])

In [132]:
print(gr_dict)

Counter({'nan': 73, 'θοὴν': 13, 'μέλαιναν': 7, 'μελαίνηι': 5, 'κοίληις': 3, 'ἐΰσσελμοι': 3, 'ἐΐσας': 3, 'ἐπήρετμοι': 3, 'ἀμφιέλισσαι': 3, 'θοῆς': 2, 'κοίλης': 2, 'ἐΐσης': 2, 'γλαφυρῆισι': 2, 'κοίλην': 2, 'ἐῗσαι': 2, 'θοῆισιν': 2, 'ἐϋσσέλμους': 2, 'κυανοπρώιροιο': 2, 'θοήν': 1, 'ἐϋσσέλμωι': 1, 'θοῆι': 1, 'ἀμφιελίσσας': 1, 'θεούσης': 1, 'κυανοπρωιρείους': 1, 'κοίληι': 1, 'θοῆισι': 1, 'θοὰς': 1, 'γλαφυρὴ': 1, 'ἐϋσσέλμοισιν': 1, 'δολιχηρέτμοισιν': 1, 'ἐΐσηις': 1, 'ὠκυπόρων': 1, 'ὠκύποροι': 1, 'εὐρείης': 1, 'μελαινάων': 1, 'ὠκεῖαι': 1, 'ἀμφιελίσσης': 1, 'πολυκλήϊδι': 1, 'ἐϋσσέλμων': 1, 'θοῆις': 1, 'γλαφυρῆισιν': 1, 'ὠκειάων': 1, 'εὐεργέα': 1, 'μελαίνης': 1, 'γλαφυρῆς': 1})


In [133]:
print(en_dict)

Counter({'nan': 73, 'swift': 22, 'black': 14, 'hollow': 8, 'well-benched': 8, 'well-balanced': 8, 'curved': 5, 'hollowed': 5, 'dark-prowed': 3, 'with oars': 3, 'swift-sailing': 2, 'quick': 2, 'running': 1, 'long-oared': 1, 'broad': 1, 'with many benches': 1, 'well-made': 1})


In [134]:
## featurise

num_lines = []
punc = []
positions = []
differences = []
word_before = []
word_after = []
ratio = []
bigrams = []
trigrams = []

for index, row in instances.iterrows():

    punc.append(row['clause'][-1])

    ## find difference in epithet position
    phrase = row['clause'][:-1]
    words = phrase.replace(' | ', ' ').split(' ')
    if row['epithet_gr'] == 'nan':
        difference = 0
    else:
        difference = words.index(row['epithet_gr']) - words.index(row['ship'])
    differences.append(difference)

    ## find word before ship in the clause
    if words.index(row['ship']) == 0:
        before = 'nan'
    else:
        before = words[words.index(row['ship'])-1]
    word_before.append(before)
    ## find word after ship in the clause
    if words.index(row['ship']) == (len(words)-1):
        after = 'nan'
    else:
        after = words[words.index(row['ship'])+1]
    word_after.append(after)

    ## bi- and tri-grams
    if row['epithet_gr'] != 'nan':
        clause = phrase.replace(' | ', ' ').replace(row['epithet_gr'], '')
    else:
        clause = phrase.replace(' | ', ' ')
    nltk_tokens = clause.split(' ')
    bi = list(nltk.bigrams(nltk_tokens))
    tri = list(nltk.trigrams(nltk_tokens))
    bigrams.append(bi)
    trigrams.append(tri)


    ## split clause into lines
    lines = row['clause'].split(' | ')
    num_lines.append(len(lines))

    for line in lines:
        if row['ship'] in line:
            words = line.split(' ')                
            filtered = [word for word in words if word!=row['epithet_gr']]

            ## find position of ship in the line
            if filtered[0] == row['ship']:
                pos = 'start'
            elif filtered[-1] == row['ship']:
                pos = 'end'
            else: 
                pos = 'mid'
            positions.append(pos)
    
    ## find ratio of dactyls to spondees
    feet = row['scansion'].split(' ')
    dactyl = 0
    for foot in feet:
        if foot == '-uu':
            dactyl += 1
    ratio.append(dactyl/6) # six feet per line

## add features to dataframe
instances['num_lines'] = num_lines
instances['punctuation'] = punc
instances['position'] = positions
instances['difference'] = differences
instances['before'] = word_before
instances['after'] = word_after
instances['ratio'] = ratio
instances['bigrams'] = bigrams
instances['trigrams'] = trigrams

### Notes

Do I need before and after if using bigrams?

In [135]:
instances.head()

Unnamed: 0,book,line,ship,epithet_gr,epithet_en,number,case,clause,scansion,num_lines,punctuation,position,difference,before,after,ratio,bigrams,trigrams
0,1,61,νηυσὶ,,,p,d,οὔ νύ τ’ Ὀδυσσεύς | Ἀργείων παρὰ νηυσὶ χαρίζετ...,-- -uu -uu -uu -uu --,3,;,mid,0,παρὰ,χαρίζετο,0.666667,"[(οὔ, νύ), (νύ, τ’), (τ’, Ὀδυσσεύς), (Ὀδυσσεύς...","[(οὔ, νύ, τ’), (νύ, τ’, Ὀδυσσεύς), (τ’, Ὀδυσσε..."
1,1,171,νηὸς,,,s,g,ὁπποίης δ’ ἐπὶ νηὸς ἀφίκεο;,-- -uu -uu -uu -uu --,1,;,mid,0,ἐπὶ,ἀφίκεο,0.666667,"[(ὁπποίης, δ’), (δ’, ἐπὶ), (ἐπὶ, νηὸς), (νηὸς,...","[(ὁπποίης, δ’, ἐπὶ), (δ’, ἐπὶ, νηὸς), (ἐπὶ, νη..."
2,1,182,νηῒ,,,s,d,νῦν δ’ ὧδε ξὺν νηῒ κατήλυθον ἠδ’ ἑτάροισιν | π...,-- -- -uu -uu -uu --,2,",",mid,0,ξὺν,κατήλυθον,0.5,"[(νῦν, δ’), (δ’, ὧδε), (ὧδε, ξὺν), (ξὺν, νηῒ),...","[(νῦν, δ’, ὧδε), (δ’, ὧδε, ξὺν), (ὧδε, ξὺν, νη..."
3,1,185,νηῦς,,,s,n,"νηῦς δέ μοι ἥδ’ ἕστηκεν ἐπ’ ἀγροῦ νόσφι πόληος,",-uu -uu -uu -- -uu --,1,",",start,0,,δέ,0.666667,"[(νηῦς, δέ), (δέ, μοι), (μοι, ἥδ’), (ἥδ’, ἕστη...","[(νηῦς, δέ, μοι), (δέ, μοι, ἥδ’), (μοι, ἥδ’, ἕ..."
4,1,211,νηυσίν,κοίληις,hollow,p,d,ἔνθά περ ἄλλοι | Ἀργείων οἱ ἄριστοι ἔβαν κοίλη...,-- -uu -uu -- -uu --,2,·,mid,-2,ἐνὶ,,0.5,"[(ἔνθά, περ), (περ, ἄλλοι), (ἄλλοι, Ἀργείων), ...","[(ἔνθά, περ, ἄλλοι), (περ, ἄλλοι, Ἀργείων), (ἄ..."


In [136]:
## correlation between numeric values