# Kristijono Donelaičio 'Metų' hegzametras

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tabulate
from  matplotlib.ticker import FuncFormatter
import os
from pathlib import Path

# Plot appearance and the default number formatting of every exported table.
sns.set_style("whitegrid")
for option_name, option_value in (
    ("styler.format.decimal", ","),
    ("styler.format.precision", 2),
    ("styler.format.na_rep", "–"),
):
    pd.set_option(option_name, option_value)
# Colormap used by write_table() when a heatmap is requested.
cm = plt.cm.YlOrRd

# Every generated figure and table is written below this directory.
subdirectory = Path("assets")
subdirectory.mkdir(exist_ok=True)

def write_figure(the_name):
    """Save the current matplotlib figure as ``figure-<the_name>.svg`` in the output directory."""
    target = subdirectory / f"figure-{the_name}.svg"
    plt.savefig(target)

def write_table(df, the_name, caption="", heatmap=False, axis=None):
    """Export ``df`` as ``table-<the_name>.tex`` and ``table-<the_name>.html``.

    Args:
        df: DataFrame to export.
        the_name: Stem of the output files; also used for the LaTeX label
            ``tbl:<the_name>``.
        caption: Caption of the LaTeX table.
        heatmap: If true, shade the cells with a background gradient
            based on the module-level colormap ``cm``.
        axis: Axis argument forwarded to ``background_gradient``; only
            used when ``heatmap`` is true.
    """
    def build_style():
        # Styler methods modify the Styler in place, so each output format
        # gets its own instance. Otherwise the LaTeX-specific null-cell
        # props would also end up in the HTML output.
        style = df.style
        if heatmap:
            style = style.background_gradient(axis=axis, cmap=cm)
        return style

    # Null cells are hidden: white-on-white in LaTeX, fully transparent in HTML.
    latex = build_style().highlight_null(props="background-color:white; color:white;").to_latex(
        convert_css=True,
        hrules=True,
        sparse_index=False,
        environment="longtable",
        position="!hbt",
        caption=caption,
        label="tbl:" + the_name,
    )
    html = build_style().highlight_null(props="opacity:0").to_html()

    # The tables contain non-ASCII text, so do not depend on the
    # platform's default encoding.
    (subdirectory / f"table-{the_name}.tex").write_text(latex, encoding="utf-8")
    (subdirectory / f"table-{the_name}.html").write_text(html, encoding="utf-8")

In [2]:
# Names of the caesura flag columns in the scansion table.
all_caesuras = ["3h", "5h", "ktt", "7h", "pqt", "bd"]
# An unset or empty EPIC_CSV falls back to the default path.
csv_path = os.getenv("EPIC_CSV") or "./metai-scansion.csv"
df = pd.read_csv(csv_path)
print(len(df))

In [3]:
# Rows without a scansion value are set aside for inspection.
missing_scansion = df["scansion"].isna()
incorrect_lines = df[missing_scansion]
# NOTE(review): dropna() without `subset` removes rows with a missing value
# in *any* column, not only in "scansion" - confirm that this is intended.
df = df.dropna()
print(len(incorrect_lines))
incorrect_lines

In [4]:
# Rows whose scansion contains a literal "|" are separated from the
# rest and excluded from the remaining analysis.
ambiguous = df["scansion"].str.contains("|", regex=False)
ambiguous_lines = df.loc[ambiguous]
df = df.loc[~ambiguous]

In [5]:
print(len(ambiguous_lines))
print(len(df))
# One "book.verse (alt1/alt2)" entry per ambiguous line, all on a single output line.
for _, line in ambiguous_lines.iterrows():
    alternatives = line["scansion"].replace("|", "/")
    print(f"{line['book']!s}.{line['verse']!s} ({alternatives})", end=", ")

In [6]:
# convert data types
# Hexameter types become an ordered categorical; the sort key weights
# every "S" with 2 and every "D" with 3.
scansion_types = sorted(
    df["scansion"].drop_duplicates().values,
    key=lambda x: x.count("S") * 2 + x.count("D") * 3,
)
df["scansion"] = pd.Categorical(df["scansion"], scansion_types)

# The caesura flags may arrive as the strings "True"/"False" or as
# booleans already parsed by read_csv. Going through str handles both;
# a purely string-keyed map would turn parsed booleans into NaN.
for caesura in all_caesuras:
    df[caesura] = df[caesura].astype(str).map({"True": True, "False": False})

for column in ["stressConflict", "metreConflict", "weightConflict", "syllables", "words"]:
    df[column] = df[column].astype(int)

In [7]:
# Number of verses left after the filtering steps above.
df.shape[0]

## How well does the analysis match the syllables?

In [8]:
# Render this table with zero decimals; the precision is set back to 2 afterwards.
pd.set_option("styler.format.precision", 0)
conflict_labels = {"metreConflict": "Markierung", "stressConflict": "Wortakzent", "weightConflict": "Silbengewicht"}
# One column per conflict kind, one row per number of conflicts in a verse.
conflicts = (
    df[list(conflict_labels)]
    .apply(pd.Series.value_counts)
    .rename_axis("Konflikte")
    .rename(columns=conflict_labels)
)
write_table(conflicts, "conflicts", caption="Konflikte zwischen der Analyse und den metrischen Markierungen, dem Wortakzent und dem Silbengewicht")
pd.set_option("styler.format.precision", 2)
conflicts

In [9]:
# Total number of conflicts of each kind over all verses.
conflict_totals = [df[name].astype(int).sum() for name in ("metreConflict", "stressConflict", "weightConflict")]
print(*conflict_totals)


In [10]:
# Verses with at least five weight conflicts.
many_weight_conflicts = df["weightConflict"] >= 5
df.loc[many_weight_conflicts]

In [11]:
# Verses with at least three stress conflicts and at least three weight conflicts.
stress_heavy = df["stressConflict"] >= 3
weight_heavy = df["weightConflict"] >= 3
df.loc[stress_heavy & weight_heavy]

## Number of syllables

In [12]:
# Overall ratio of syllables to words.
total_syllables = df["syllables"].sum()
total_words = df["words"].sum()
syllables_per_word = total_syllables / total_words
syllables_per_word

In [13]:
# Mean syllable count per verse; the standard deviation is printed alongside.
syllable_counts = df["syllables"]
syllables_per_verse = syllable_counts.mean()
print(syllable_counts.std())
syllables_per_verse

In [14]:
# Per book: syllables per word, plus mean and standard deviation of the
# syllable count per verse.
by_book = df.groupby("book")
syllables_by_book = by_book["syllables"]
syls_per_words = (syllables_by_book.sum() / by_book["words"].sum()).rename("Silben / Wort")
average_sylls = syllables_by_book.mean().rename("μ Silben")
stddev_sylls = syllables_by_book.std().rename("σ Silben")
df_syllables = pd.concat([syls_per_words, average_sylls, stddev_sylls], axis=1).rename_axis("Buch")
write_table(df_syllables, "syllables", caption="Silben pro Wort und Silben pro Vers")
df_syllables

In [15]:
# Number of verses per syllable count, one row per book.
df.groupby("book")["syllables"].value_counts().to_frame().unstack()

In [16]:
# Spread of the syllables-per-word ratio across the books.
df_syllables.loc[:, "Silben / Wort"].std()

In [17]:
# syllable count histogram
ax = sns.histplot(y="syllables", data=df, hue="book", multiple="stack")
ax.set(ylabel='Anzahl Silben', xlabel='Anzahl Verse')
# Only retitle the legend seaborn already built. Calling
# ax.legend(labels=[...]) pairs the labels with the artists in draw
# order, which does not match the hue order of a stacked histplot, so
# books would get the wrong colour. The labels now come from the
# "book" values themselves.
ax.get_legend().set_title("Buch")
# One tick per syllable count that actually occurs.
ax.set_yticks(sorted(df["syllables"].unique()))
write_figure("syllable-count-histogram")

In [18]:
# Frequency of each syllable count, absolute and as a percentage of all verses.
# The count column is named *after* value_counts(): since pandas 2.0 the
# result is always called "count", so renaming the input series
# beforehand no longer yields an "Anzahl Verse" column.
df_syllables_count = df["syllables"].value_counts().rename("Anzahl Verse").to_frame()
df_syllables_count.index = df_syllables_count.index.rename("Silbenzahl")
df_syllables_count["Anteil Verse/\\%"] = 100 * df_syllables_count["Anzahl Verse"] / len(df)
write_table(df_syllables_count, "syllable-count", caption="Häufigkeit der einzelnen Silbenzahlen")
df_syllables_count

## Scansion

In [19]:
# Stacked histogram of the hexameter types, coloured by book.
ax = sns.histplot(y="scansion", data=df, hue="book", multiple="stack")
ax.set(ylabel='Hexametertyp', xlabel='Anzahl Verse')
# Only retitle the legend seaborn already built. Calling
# ax.legend(labels=[...]) pairs the labels with the artists in draw
# order, which does not match the hue order of a stacked histplot, so
# books would get the wrong colour. The labels now come from the
# "book" values themselves.
ax.get_legend().set_title("Buch")
write_figure("scansion-histogram")

In [20]:
pd.set_option("styler.format.precision", 2)
# Number of "S" and "D" feet per verse, taken from the scansion string.
spondees_dactyls = pd.DataFrame()
spondees_dactyls["Buch"] = df["book"]
spondees_dactyls["S"] = df["scansion"].str.count("S")
spondees_dactyls["D"] = df["scansion"].str.count("D")
# Overall totals and the overall share of dactyls (as a fraction).
totals = spondees_dactyls.sum()
print(totals, totals.D / (totals.D + totals.S))
df_s_d = spondees_dactyls.groupby("Buch").sum()
# The column header announces a percentage, so scale by 100 like the
# other "\%" columns in this notebook.
df_s_d["Anteil D/\\%"] = 100 * df_s_d["D"] / (df_s_d["S"] + df_s_d["D"])
write_table(df_s_d, "dactyls-spondees", caption="Daktylen vs. Spondeen")
df_s_d

In [21]:
# Frequency of each hexameter type, absolute and as a percentage of all verses.
# The count column is named *after* value_counts(): since pandas 2.0 the
# result is always called "count", so renaming the input series
# beforehand no longer yields an "Anzahl Verse" column.
df_scansion = df["scansion"].value_counts().rename("Anzahl Verse").to_frame()
df_scansion["Anteil Verse/\\%"] = 100 * df_scansion["Anzahl Verse"] / len(df)
df_scansion.index = df_scansion.index.rename("Hexametertyp")
write_table(df_scansion, "scansion-types", caption="Häufigkeit einzelner Hexameter-Typen")
# Show only the ten most frequent types inline.
df_scansion.head(10)

In [22]:
# Verses whose scansion ends in "SS": absolute count and percentage of all verses.
spondiaci = df.loc[df["scansion"].str.endswith("SS")]
spondiaci_total = len(spondiaci)
print(spondiaci_total, 100 * spondiaci_total / len(df))

In [23]:
# All distinct hexameter types, then only those ending in "SS".
hexameter_types = df["scansion"].drop_duplicates().values
print(hexameter_types)
print(list(filter(lambda htype: htype.endswith("SS"), hexameter_types)))

In [24]:
# Cross-tabulate hexameter type (rows) against syllable count (columns).
type_labels = df["scansion"].rename("Hexametertyp")
count_labels = df["syllables"].rename("Silbenzahl")
pd.crosstab(type_labels, count_labels)

In [25]:
# Verses whose scansion ends in "SSS", reduced to the identifying columns.
ends_in_sss = df["scansion"].str.endswith("SSS")
strange_verses = df.loc[ends_in_sss, ["book", "verse", "text", "scansion"]]
write_table(strange_verses, "strange-verses")
strange_verses

## Caesurae

### Caesurae per book

In [26]:
# Long format: one (book, verse, caesura) row for every caesura flag that is set.
caesura_rows = [
    [row["book"], row["verse"], caesura]
    for index, row in df.iterrows()
    for caesura in all_caesuras
    if row[caesura] == True
]
caesura_df = pd.DataFrame(caesura_rows, columns=["book", "verse", "caesura"])
# Categorical so that the caesuras keep the order of all_caesuras.
caesura_df["caesura"] = pd.Categorical(caesura_df["caesura"], all_caesuras)

In [27]:
# Count of every caesura type per book.
book_labels = caesura_df["book"].rename("Buch")
caesura_labels = caesura_df["caesura"].rename("Einschnitt")
caesura_per_book = pd.crosstab(book_labels, caesura_labels)
write_table(caesura_per_book, "caesura-per-book", caption="Häufigkeit einzelner Verseinschnitte (pro Buch)")
caesura_per_book

In [28]:
# Stacked histogram of the caesura types, coloured by book.
ax = sns.histplot(x="caesura", data=caesura_df, hue="book", multiple="stack")
ax.set(xlabel='Einschnitt', ylabel='Anzahl Verse')
# Only retitle the legend seaborn already built. Calling
# ax.legend(labels=[...]) pairs the labels with the artists in draw
# order, which does not match the hue order of a stacked histplot, so
# books would get the wrong colour. The labels now come from the
# "book" values themselves.
ax.get_legend().set_title("Buch")
write_figure("caesura-histogram")

In [29]:
# Absolute and relative frequency of each caesura over all verses.
caesura_stats = []
for s in all_caesuras:
    # .get() keeps a caesura that never occurs at 0 instead of raising
    # a KeyError on the missing True label.
    count = df[s].value_counts().get(True, 0)
    caesura_stats.append([count, 100 * count / len(df)])
caesura_count = pd.DataFrame(caesura_stats, columns=["Anzahl Verse", "Anteil/\\%"], index=all_caesuras)
caesura_count = caesura_count.sort_values("Anzahl Verse", ascending=False)
caesura_count.index = caesura_count.index.rename("Einschnitt")
write_table(caesura_count, "caesura-count", caption="Häufigkeit einzelner Verseinschnitte")
caesura_count


### Caesura-less lines

In [30]:
# Verses in which none of the caesura flags is set.
no_caesura = ~df[all_caesuras[0]]
for caesura_name in all_caesuras[1:]:
    no_caesura = no_caesura & ~df[caesura_name]
caesuraless = df[no_caesura]
write_table(caesuraless, "caesuraless")
caesuraless

### Caesura co-occurrences

In [31]:
from itertools import product

# Square matrix of pairwise co-occurrence counts. The extra last
# row/column ("keine") holds, per caesura, the number of verses in which
# it is the only caesura.
caesura_count = len(all_caesuras)
occurrences = np.zeros((caesura_count + 1, caesura_count + 1))
matrix = np.zeros((caesura_count))
for _, flags in df[all_caesuras].iterrows():
    present = [bool(flags[name]) for name in all_caesuras]
    # Exactly one flag set: this caesura occurs without any other.
    if sum(present) == 1:
        matrix[present.index(True)] += 1
    # Every ordered pair of distinct caesuras that are both present;
    # this keeps the matrix symmetric.
    for i1, i2 in product(range(caesura_count), repeat=2):
        if i1 != i2 and present[i1] and present[i2]:
            occurrences[i1, i2] += 1
occurrences[-1, :-1] = matrix
occurrences[:-1, -1] = matrix
occurrences = occurrences.astype(int)
labels_with_none = all_caesuras + ["keine"]
df_occ = pd.DataFrame(occurrences, index=labels_with_none, columns=labels_with_none)

In [32]:
# Keep only the strict lower triangle; the matrix is symmetric, so the
# rest is redundant and is blanked out in the exported table.
lower_triangle = np.tril(np.ones(df_occ.shape, dtype=bool), k=-1)
df_tril = df_occ.where(lower_triangle)
# Render the counts without decimals (the option is not reset here).
pd.set_option("styler.format.precision", 0)
write_table(df_tril, "cooccurrences", caption="Häufigkeit des gemeinsamen Auftretens von Verseinschnitten")
df_occ

In [33]:
# Work on a real copy. A plain `df_copy = df` is only an alias, so the
# mapping below would overwrite the boolean caesura flags in `df` with
# strings and break the earlier cells when they are run again.
df_copy = df.copy()
for caesura in all_caesuras:
    df_copy[caesura] = df_copy[caesura].map({True: "1", False: "0"})
# Number of verses for every combination of the caesura flags that
# occurs, most frequent first.
df_caesura_combinations = (
    df_copy.groupby(all_caesuras)
    .size()
    .sort_values(ascending=False)
    .to_frame("Anzahl Verse")
)
write_table(df_caesura_combinations, "caesura-combinations", caption="Häufigkeit der Kombinationen aus allen Verseinschnitten")
print(len(df_caesura_combinations))
df_caesura_combinations