# Evaluation

In [1]:
from string import punctuation
import sys
import numpy as np
import scipy.stats
import pickle
import re
import xml.etree.ElementTree as ET
import glob
import random
sys.path.insert(0, 'utils/')
from syllabipymhg import syllabipymhg
from util import syllableend, cleantext

## Orthographic Resolution

List of special characters to resolve:

In [2]:
# Ordered (pattern, replacement) pairs used for orthographic resolution.
# resolve() applies them from top to bottom with str.replace, so an earlier
# entry can consume text before a later entry gets to see it.
# An empty replacement deletes the pattern.
to_rep = [("ſ", "s"),
         ("vͦ", "uo"),
         ("oͤ", "ö"),
         ("⌊", ""),
         ("/", ""),
         ("·", ""),
         ("-", ""),
         ("u̍", "u"),
         ("uͦ", "uo"),
         ("uͤ", "ü"),
         ("oͮ", "ou"),
         ("ẘ", "wo"),
         ("vͦ", "vo"),  # NOTE(review): pattern looks identical to the second entry above; if so, this pair can never match - confirm
         ("dͤ", "de"),
         ("uu", "w"),
         ("ˢ", "e"),  # NOTE(review): superscript "s" is mapped to "e" - confirm this is intended
         ("ᵉ", "e"),
         ("ᵒ", "o"),
         ("†", ""),
         ("¦", " "),
         ("\uf1f0", ""),  # U+F1F0 lies in the Unicode private-use area; it is simply dropped
         # The last two entries rewrite whole spellings rather than single characters.
         ("div ", "diu "),
         ("vnde", "unde")]

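Resolution of 'v' vs. 'u': collect the character contexts ("environments") in which 'v' occurs in the reference texts under `data/*.txt`: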

In [3]:
# Gather every whitespace-separated token of the cleaned reference texts.
words = []
for fname in glob.glob('data/*.txt'):
    # Explicit encoding so the result does not depend on the platform's locale
    # default. Assumes the text files are UTF-8 - TODO confirm.
    with open(fname, 'r', encoding='utf-8') as f:
        words.extend(cleantext(f.read()).split())

# Unique lower-cased word forms that contain at least one "v".
v_words = list({w.lower() for w in words if "v" in w.lower()})

# For the FIRST "v" of each word, record its environment: the preceding
# character (if any), the "v" itself and the following character (if any).
# Slicing clamps at the word boundaries, so a word-initial or word-final "v"
# needs no special case (the former `v_ind == len(w)` branch could never run,
# because str.find returns at most len(w) - 1).
environments = []
for w in v_words:
    v_ind = w.find("v")
    environments.append(w[max(v_ind - 1, 0):v_ind + 2])

# A bare "v" (single-character word) carries no context; drop it, then dedupe.
environments = [e for e in environments if len(e) > 1]
environments = list(set(environments))

Resolution function:

In [4]:
def resolve(text, replacements=None, known_environments=None):
    """Normalise special characters and resolve ambiguous "v" spellings.

    Steps:
      1. Every (pattern, replacement) pair is applied to the whole text with
         ``str.replace``, in the given order.
      2. The text is split on whitespace. For each word containing a
         lower-case "v", the environment of its FIRST "v" (previous character,
         "v", next character, clamped at the word boundaries) is looked up in
         the known environments. If it is not known, every occurrence of that
         environment in the word is rewritten with "u" instead of "v".

    Parameters
    ----------
    text : str
        Text to resolve.
    replacements : iterable of (str, str), optional
        Replacement pairs; defaults to the module-level ``to_rep``.
    known_environments : iterable of str, optional
        Attested "v" environments; defaults to the module-level
        ``environments``.

    Returns
    -------
    str
        The resolved words joined by single spaces. All original whitespace,
        including line breaks, is collapsed.
    """
    # `is None` (not truthiness) so that an explicitly empty list is honoured.
    if replacements is None:
        replacements = to_rep
    if known_environments is None:
        known_environments = environments
    # Set membership instead of scanning a list once per word.
    known = set(known_environments)

    for old, new in replacements:
        text = text.replace(old, new)

    resolved = []
    for w in text.split():
        v_ind = w.find("v")
        if v_ind != -1:
            # Slicing clamps at the end of the word, so a word-final "v" needs
            # no special case; max() stops a word-initial "v" from wrapping
            # around to index -1.
            envi = w[max(v_ind - 1, 0):v_ind + 2]
            if envi not in known:
                w = w.replace(envi, envi.replace("v", "u"))
        resolved.append(w)

    return ' '.join(resolved)

## LDM Evaluation

In [5]:
def get_stats(test_poem):
    """Compute the share of open syllables in a text.

    The text is passed through ``cleantext`` and ``resolve``, each resulting
    word is syllabified with ``syllabipymhg``, and every syllable for which
    ``syllableend`` returns ``"O"`` is counted as open; all others count as
    closed.

    Parameters
    ----------
    test_poem : str
        Raw text to analyse.

    Returns
    -------
    tuple
        ``(open_ratio, n_syllables, resolved_text, syllables)`` where
        ``open_ratio`` is open / total syllables, ``resolved_text`` is the
        cleaned and resolved text, and ``syllables`` holds one syllabification
        per word. ``open_ratio`` is ``0.0`` when the text yields no syllables
        (previously this raised ``ZeroDivisionError``).
    """
    test_poem = resolve(cleantext(test_poem))

    sylls = [syllabipymhg(w) for w in test_poem.split()]

    # One ending label per syllable, in reading order.
    endings = [syllableend(s) for w in sylls for s in w]
    open_s = sum(1 for ending in endings if ending == "O")
    total = len(endings)

    # Guard the division: empty or whitespace-only input has no syllables.
    ratio = open_s / total if total else 0.0

    return (ratio, total, test_poem, sylls)

In [6]:
# Compare the open-syllable ratio of each transcribed stanza with that of its
# standardised counterpart, per corpus and overall.
all_diffs_ss = []
all_diffs_num = []
all_seq_lev = []
all_stanzas = 0

for n in ["eist", "dwa", "rubin"]:
    # Context managers close the pickle files deterministically.
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    with open("data/all_" + n + "_standard.pkl", "rb") as fh:
        standard = pickle.load(fh)
    with open("data/all_" + n + "_no_ab.pkl", "rb") as fh:
        trans = pickle.load(fh)

    # Flatten the nested lists into flat lists of stanzas.
    transkriptions = [item for sublist in trans for item in sublist]
    standards = [item for sublist in standard for item in sublist]

    diffs_ss = []
    diffs_num = []
    stanzas = 0

    for i in range(len(transkriptions)):
        # Only compare stanzas with the same number of lines. The count has to
        # be taken on the raw stanza text: get_stats() returns text that went
        # through resolve(), which joins all words with single spaces, so that
        # text never contains a line break and a check on it is always true.
        # Assumes the raw stanzas are newline-separated strings - TODO confirm.
        n_lines_trans = len(transkriptions[i].strip().splitlines())
        n_lines_stand = len(standards[i].strip().splitlines())
        if n_lines_trans != n_lines_stand:
            continue

        transc = get_stats(transkriptions[i])
        stand = get_stats(standards[i])

        # Weight each stanza's difference by its syllable count in the
        # standard text by repeating the value once per syllable.
        diffs = abs(transc[0]-stand[0])
        diffs_ss.extend([diffs]*stand[1])
        all_diffs_ss.extend([diffs]*stand[1])
        stanzas += 1

    all_stanzas += stanzas

    print(stanzas)
    print(n)
    print(np.mean(diffs_ss))
    print(np.std(diffs_ss))
    print("*****")

print()
print("total")
print(all_stanzas)
print(np.mean(all_diffs_ss))
print(np.std(all_diffs_ss))

49
eist
0.0079262347736
0.0100029084096
*****
34
dwa
0.0109465298193
0.00988083724685
*****
71
rubin
0.00821974404883
0.00887605102043
*****

total
154
0.0085813058114
0.00947035869198


## REM Evaluation

In [7]:
def get_dipl(fname):
    """Extract a text string from the ``utf`` attributes of an XML file.

    For every direct child of the XML root, the ``utf`` attribute of that
    child's first sub-element is taken as a token (presumably the diplomatic
    transcription - verify against the corpus schema). Children without a
    sub-element or without a ``utf`` attribute are skipped.

    Parameters
    ----------
    fname : path or file object
        XML source handed to ``ET.parse``.

    Returns
    -------
    str
        The tokens, each followed by one space, with every ``" . "``
        sequence replaced by a newline.
    """
    root = ET.parse(fname).getroot()

    pieces = []
    for element in root:
        try:
            pieces.append(element[0].attrib["utf"] + " ")
        except (KeyError, IndexError):
            # No sub-element, or sub-element without "utf": not a token.
            continue

    return "".join(pieces).replace(" . ", "\n")

In [8]:
def get_norm(fname):
    """Extract a text string from the ``tag`` attributes of an XML file.

    For every direct child of the XML root, the ``tag`` attribute of the first
    sub-element of that child's second sub-element is taken as a token
    (presumably the normalised word form - verify against the corpus schema).
    Children that lack this structure or the attribute are skipped.

    This function used to be defined twice, verbatim, in two consecutive
    cells; the duplicate has been folded into this single definition.

    Parameters
    ----------
    fname : path or file object
        XML source handed to ``ET.parse``.

    Returns
    -------
    str
        The tokens, each followed by one space, with every ``" $_ "``
        sequence replaced by a newline.
    """
    root = ET.parse(fname).getroot()

    tags = []
    for token in root:
        try:
            tags.append(token[1][0].attrib['tag'])
        except (KeyError, IndexError):
            # Missing second child, missing grandchild or missing attribute.
            continue

    t_string = "".join(tag + " " for tag in tags)
    return t_string.replace(" $_ ", "\n")

In [10]:
# Concatenate the text of every corpus file, then resolve and tokenise it.
# NOTE(review): the path below is absolute and machine-specific; point it at
# the local copy of the corpus before running.
all_texts = ""

# sorted(): glob returns files in a filesystem-dependent order, and the seeded
# random sample drawn from `words` further down is only reproducible if the
# input order is fixed.
for fname in sorted(glob.glob("/Users/chench/Downloads/rem-coraxml-20161222/*.xml")):
    all_texts += get_dipl(fname) + "\n\n"

words = resolve(all_texts).split()

In [11]:
# Syllabify every word once and keep only the non-empty syllabifications.
sylls = [parts for parts in map(syllabipymhg, words) if len(parts) > 0]

In [12]:
# Fixed seed so the same 1000 syllabified words are drawn on every run
# (given the same `sylls`).
random.seed(1)
# Draw 1000 distinct positions, then look up the corresponding entries.
sample_inds = random.sample(range(len(sylls)), 1000)
sample = [sylls[position] for position in sample_inds]

In [13]:
# Show the sample size, then one numbered line (starting at 1) per sampled word.
print(len(sample))
for number, syllables in enumerate(sample, start=1):
    print(number, syllables)

1000
1 ['de']
2 ['un', 'se', 're']
3 ['uō']
4 ['sin']
5 ['van']
6 ['dem']
7 ['las']
8 ['nus', 'sche', 'len']
9 ['ir']
10 ['golt']
11 ['ei', 'nes']
12 ['ei', 'ne']
13 ['ve', 'seu', 'mē']
14 ['sich']
15 ['her']
16 ['wart']
17 ['ein']
18 ['sol']
19 ['ín', 'flín']
20 ['als']
21 ['niht']
22 ['den']
23 ['so']
24 ['mit']
25 ['dat']
26 ['auch']
27 ['ei', 'nē']
28 ['ge', 'bot']
29 ['hus', 'ge', 'nos']
30 ['uns']
31 ['an']
32 ['ge', 'seít']
33 ['ge', 'rich', 'te']
34 ['ge', 'ziu', 'ge']
35 ['heý']
36 ['kun', 'der']
37 ['zuo']
38 ['man']
39 ['an']
40 ['e', 'zec', 'hen', 'den']
41 ['uns']
42 ['be', 'want']
43 ['man']
44 ['ab', 'bet']
45 ['un', 'gelt']
46 ['von']
47 ['suo', 'i', 'len']
48 ['bri', 'sen']
49 ['ez']
50 ['un', 'de']
51 ['un', 'de']
52 ['it']
53 ['pli', 'ge', 'rí', 'ne']
54 ['du']
55 ['golt']
56 ['ein']
57 ['pei']
58 ['die']
59 ['ur', 'sten', 'de']
60 ['un̄']
61 ['si']
62 ['sprah']
63 ['den']
64 ['da', 'ruof']
65 ['si']
66 ['ges', 'ter', 'ket']
67 ['um', 'me']
68 ['ta', 'ge']
69 ['he']


Errors:

1. `lí-ste-klîch` : suffix lîch
2. `vrloup` : should begin with 'u'
3. `vol-chnant` : compound word, volch-nant
4. `iu-sti-ciam` : Latin word, iu-sti-ci-am
5. `bec-ho-run-ge` : prefix be-cho-run-ge
6. `frae-u-e-len` : should be 'v', frae-ve-len