# Evaluation

In [1]:
from string import punctuation
import sys
import numpy as np
import scipy.stats
import pickle
import re
import xml.etree.ElementTree as ET
import glob
import random
sys.path.insert(0, 'utils/')
from syllabipymhg import syllabipymhg
from get_features import syllableend
from clean_text import cleantext

## orthographic resolution

List of special characters to resolve, as `(original, replacement)` pairs that are applied in order:

In [2]:
# Ordered (original, replacement) pairs; resolve() applies them one after
# another with str.replace, so earlier pairs take effect before later ones.
to_rep = [("ſ", "s"),
         ("vͦ", "uo"),
         ("oͤ", "ö"),
         ("⌊", ""),
         ("/", ""),
         ("·", ""),
         ("-", ""),
         ("u̍", "u"),
         ("uͦ", "uo"),
         ("uͤ", "ü"),
         ("oͮ", "ou"),
         ("ẘ", "wo"),
         # NOTE(review): this key looks identical to the second pair above; if
         # so it can never match once that pair has run -- confirm the intent.
         ("vͦ", "vo"),
         # The comma after this pair was missing, which made Python try to
         # call the tuple and raise a TypeError.
         ("dͤ", "de"),
         ("uu", "w"),
         ("†", ""),
         ("¦", " "),
         ("\uf1f0", ""),
         ("div ", "diu ")]

'v' and 'u' resolution: collect the character contexts ("environments") in which `v` occurs in the texts under `data/`:

In [3]:
# Gather every whitespace-separated token from the cleaned text files.
words = []
for fname in glob.glob('data/*.txt'):
    with open(fname, 'r') as f:
        words.extend(cleantext(f.read()).split())

# Unique lower-cased word forms that contain at least one "v".
v_words = list({w.lower() for w in words if "v" in w.lower()})

# For the first "v" of each word, record the "v" together with its immediate
# left and right neighbours (fewer characters at a word edge).
# The original code had a branch for a word-final "v" that tested
# `v_ind == len(w)`, which str.find can never return; slicing clamps at the
# string end anyway, so a single clamped slice covers all three positions.
environments = set()
for w in v_words:
    v_ind = w.find("v")
    env = w[max(v_ind - 1, 0):v_ind + 2]
    # A bare "v" carries no context and is ignored.
    if len(env) > 1:
        environments.add(env)

environments = list(environments)

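Resolution function: applies the character replacements above, then rewrites `v` as `u` wherever its context is not among the collected environments.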

In [4]:
def resolve(text):
    """Normalise special characters and resolve ambiguous "v" spellings.

    First every ``(original, replacement)`` pair in ``to_rep`` is applied to
    the whole text, in order. The text is then split on whitespace, and for
    each word containing a "v" the context of its *first* "v" (the "v" plus
    its direct neighbours) is looked up in ``environments``. If the context
    is not listed there, every occurrence of that context in the word is
    rewritten with "v" replaced by "u".

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The resolved words joined by single spaces. All original whitespace,
        including newlines, is therefore collapsed.
    """
    for old, new in to_rep:
        text = text.replace(old, new)

    resolved = []
    for w in text.split():
        v_ind = w.find("v")
        if v_ind != -1:
            # Clamped slice: two characters at a word edge, three otherwise.
            # (The former `v_ind == len(w)` branch was unreachable, since
            # str.find never returns len(w).)
            env = w[max(v_ind - 1, 0):v_ind + 2]
            if env not in environments:
                w = w.replace(env, env.replace("v", "u"))
        resolved.append(w)

    return ' '.join(resolved)

## LDM evaluation

In [5]:
def get_stats(test_poem):
    """Compute open-syllable statistics for one text.

    The text is passed through ``cleantext`` and ``resolve``, each word is
    syllabified with ``syllabipymhg``, and every syllable for which
    ``syllableend`` returns ``"O"`` is counted as open; all others are
    counted as closed.

    Parameters
    ----------
    test_poem : str
        Raw text to analyse.

    Returns
    -------
    tuple
        ``(open_ratio, n_syllables, resolved_text, syllabified_words)``.
        ``open_ratio`` is ``nan`` when the text yields no syllables at all
        (previously this case raised ``ZeroDivisionError``).
    """
    test_poem = resolve(cleantext(test_poem))

    sylls = [syllabipymhg(w) for w in test_poem.split()]

    open_s = 0
    closed_s = 0
    for w in sylls:
        for s in w:
            if syllableend(s) == "O":
                open_s += 1
            else:
                closed_s += 1

    total = open_s + closed_s
    # Guard the empty case; nan compares False against any threshold, so
    # callers that filter on the ratio simply skip such texts.
    ratio = open_s / total if total else float("nan")

    return (ratio, total, test_poem, sylls)

In [6]:
# Stanza pairs whose open-syllable ratios differ by this much or more are
# left out of the statistics.
MAX_RATIO_DIFF = .05

all_diffs_ss = []
all_diffs_num = []
all_seq_lev = []

for n in ["eist", "dwa", "rubin"]:
    # Use context managers so the pickle files are closed again.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open("data/all_" + n + "_standard.pkl", "rb") as f:
        standard = pickle.load(f)
    with open("data/all_" + n + "_no_ab.pkl", "rb") as f:
        trans = pickle.load(f)

    # Flatten the nested lists into one flat sequence per version.
    transkriptions = [item for sublist in trans for item in sublist]
    standards = [item for sublist in standard for item in sublist]

    diffs_ss = []
    diffs_num = []
    stanzas = 0

    for i in range(len(transkriptions)):
        transc = get_stats(transkriptions[i])
        stand = get_stats(standards[i])

        # NOTE(review): get_stats returns text that resolve() has re-joined
        # with single spaces, so it contains no "\n" and both sides always
        # split into exactly one piece -- this check is currently always
        # true. Left unchanged because fixing it would alter the reported
        # figures; confirm the intended behaviour.
        if len(transc[-2].split('\n')) == len(stand[-2].split('\n')):

            diffs = abs(transc[0]-stand[0])
            if diffs < MAX_RATIO_DIFF:
                # Weight each difference by the syllable count of the
                # standard version.
                diffs_ss.extend([diffs]*stand[1])
                all_diffs_ss.extend([diffs]*stand[1])
                stanzas += 1

    print(stanzas)
    print(n)
    print(np.mean(diffs_ss))
    print(np.std(diffs_ss))
    print("*****")

print()
print(np.mean(all_diffs_ss))
print(np.std(all_diffs_ss))

49
eist
0.0079262347736
0.0100029084096
*****
34
dwa
0.0109465298193
0.00988083724685
*****
71
rubin
0.00805357588665
0.00892067320934
*****

0.00849461313693
0.00949587190914


## REM evaluation

In [7]:
def get_dipl(fname):
    """Return the ``utf`` token strings of one XML file as a single string.

    For every child of the document root, the ``utf`` attribute of that
    child's first sub-element is collected. Children without a sub-element,
    or whose first sub-element has no ``utf`` attribute, are skipped. Each
    collected token is followed by one space, and every ``" . "`` sequence
    in the result is replaced by a newline.
    """
    pieces = []
    for entry in ET.parse(fname).getroot():
        try:
            pieces.append(entry[0].attrib["utf"] + " ")
        except (KeyError, IndexError):
            # No first sub-element, or no "utf" attribute on it.
            continue

    return "".join(pieces).replace(" . ", "\n")

In [8]:
def get_norm(fname):
    """Return the ``tag`` strings of one XML file as a single string.

    For every child of the document root, the ``tag`` attribute is read from
    the first sub-element of that child's second sub-element. Children where
    that path does not exist, or where the attribute is missing, are skipped.
    Each collected value is followed by one space, and every ``" $_ "``
    sequence in the result is replaced by a newline.
    """
    pieces = []
    for entry in ET.parse(fname).getroot():
        try:
            pieces.append(entry[1][0].attrib['tag'] + " ")
        except (KeyError, IndexError):
            # Path entry[1][0] missing, or no "tag" attribute on it.
            continue

    return "".join(pieces).replace(" $_ ", "\n")

In [9]:
# NOTE: this cell re-defines get_norm exactly as the preceding cell does;
# one of the two cells is redundant and can be removed.
def get_norm(fname):
    """Return the ``tag`` strings of one XML file as a single string.

    For every child of the document root, the ``tag`` attribute is read from
    the first sub-element of that child's second sub-element. Children where
    that path does not exist, or where the attribute is missing, are skipped.
    Each collected value is followed by one space, and every ``" $_ "``
    sequence in the result is replaced by a newline.
    """
    collected = []
    for node in ET.parse(fname).getroot():
        try:
            value = node[1][0].attrib['tag']
        except (KeyError, IndexError):
            # Path node[1][0] missing, or no "tag" attribute on it.
            continue
        collected.append(value + " ")

    return "".join(collected).replace(" $_ ", "\n")

In [10]:
# Glob pattern for the corpus XML files; adjust to the local copy.
REM_XML_GLOB = "/Users/chench/Downloads/rem-coraxml-20161222/*.xml"

# glob returns files in a file-system dependent order; sorting keeps the word
# order -- and with it any index-based sampling later on -- stable across runs.
all_texts = "".join(get_dipl(fname) + "\n\n"
                    for fname in sorted(glob.glob(REM_XML_GLOB)))

# Note: this rebinds `words`, which an earlier cell filled from data/*.txt.
words = resolve(all_texts).split()

In [11]:
# Syllabify every word and keep only the non-empty results.
sylls = [parts for parts in map(syllabipymhg, words) if len(parts) > 0]

In [12]:
# Size and seed of the evaluation sample. A dedicated, seeded generator makes
# the draw reproducible without touching the global `random` state.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 0

sample_inds = random.Random(SAMPLE_SEED).sample(range(0, len(sylls)), SAMPLE_SIZE)
sample = [sylls[i] for i in sample_inds]

In [13]:
# Print the sample size, then display the complete sample as the cell output
# (presumably so every syllabification can be checked by hand -- TODO confirm).
print(len(sample))
sample

1000


[['hoch'],
 ['er'],
 ['liht'],
 ['a', 'breh', 'tes'],
 ['wil'],
 ['goz'],
 ['daz'],
 ['sol'],
 ['niet'],
 ['ger'],
 ['tar', 'sil', 'le'],
 ['cur', 'ne', 'wal'],
 ['swaz'],
 ['hai', 'li', 'ge'],
 ['e'],
 ['nyt'],
 ['man', 'nen'],
 ['den'],
 ['diu'],
 ['díu'],
 ['we', 'relt'],
 ['die'],
 ['chin', 'den'],
 ['stil', 'le'],
 ['für', 'bas'],
 ['was'],
 ['níht'],
 ['len', 'gen', 'vel', 'der'],
 ['die'],
 ['mus'],
 ['guot', 'li', 'hen'],
 ['helm'],
 ['be', 'sni', 'dun', 'ge'],
 ['en'],
 ['um', 'be'],
 ['sun', 'di', 'ge'],
 ['ob'],
 ['pert'],
 ['diu'],
 ['wie'],
 ['un̄'],
 ['ha', 'bent'],
 ['e', 'nand'],
 ['ku', 'met'],
 ['lu', 'tir'],
 ['den', 'ne'],
 ['i', 'ren'],
 ['tou', 'fen'],
 ['docht'],
 ['niht'],
 ['vrou', 'wen'],
 ['sol', 'dis'],
 ['der'],
 ['dan'],
 ['die'],
 ['ū'],
 ['iz'],
 ['liu', 'te'],
 ['un̄'],
 ['dar'],
 ['noch'],
 ['was'],
 ['reh', 'te'],
 ['ein'],
 ['díe'],
 ['suo', 'len'],
 ['sinˢ'],
 ['dinch'],
 ['wâz', 'zer'],
 ['me', 'ni', 'ge'],
 ['u', 'liu', 'get'],
 ['fuoz'],
 ['waz'

In [None]:
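# Notes from manual inspection of the sample (not executable code) --
# presumably syllabifications judged incorrect; TODO confirm:
# vrte - lei
# zee ren
# di -- trich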