# Explore Corpus of Czech Verse

In [1]:
%load_ext dotenv
%dotenv
import os

base_dir = os.getenv("WORKING_DIR")
os.chdir(base_dir)

In [2]:
from collections import defaultdict, Counter

In [3]:
from src.data_loader_and_saver import JSONDataLoaderAndSaver

# Project-local loader configured with the working directory and the data folder
# "src/data" (presumably resolved relative to base_dir — TODO confirm).
data_loader = JSONDataLoaderAndSaver(base_dir, input_data_dir="src/data")

In [4]:
# Load the "all_poems_data" dataset; the rest of the notebook indexes and iterates
# it as a sequence of poem records.
all_poems_data = data_loader.load_data("all_poems_data")

all_poems_data.json: loaded 66428 records.


In [5]:
# Peek at one record (arbitrarily chosen index) to see the fields of a poem entry.
all_poems_data[10512]

{'p_author': {'born': 1821,
  'died': 1856,
  'name': 'Havlíček Borovský, Karel',
  'identity': 'Havlíček Borovský, Karel'},
 'biblio': {'motto_aut': None,
  'b_subtitle': 'Jehly, špičky, sochory a kůly  stesal, zkoval, zostřil, sebral  k vůli  vojně s hloupostí a zlobou místo šavel  Borovský Havel.',
  'publisher': 'Dolenský, Antonín; Unie',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'Pražské Vysoké Školy.',
  'place': 'Praha',
  'dedication': None,
  'b_title': 'Epigramy',
  'pages': '[80]',
  'year': '1921',
  'signature': 'ÚČL AV ČR; 52 VIII 2'},
 'book_id': '0176',
 'poem_id': '0001-0004-0000-0001-0000',
 'b_author': {'born': 1821,
  'died': 1856,
  'name': 'Havlíček Borovský, Karel',
  'identity': 'Havlíček Borovský, Karel'},
 'body': [[{'text': 'Dvě fakulty v Klementině,',
    'punct': {'4': ','},
    'words': [{'token_lc': 'dvě',
      'xsampa': 'dvjE',
      'morph': 'ClFP1-----------',
      'phoebe': 'dvje',
      'token': 'Dvě',
      'lemma': 'dva'},
     {'token_l

## General statistics

In [6]:
# Corpus size: lines and word tokens are tallied over every stanza of every poem.
line_cnt = sum(
    1
    for poem in all_poems_data
    for stanza in poem["body"]
    for _line in stanza
)
word_cnt = sum(
    len(line["words"])
    for poem in all_poems_data
    for stanza in poem["body"]
    for line in stanza
)

print(f"Poem cnt: {len(all_poems_data)}")
print(f"Line cnt: {line_cnt}")
print(f"Word cnt: {word_cnt}")

Poem cnt: 66428
Line cnt: 2310917
Word cnt: 12636867


## Poems and lines with recognized metres

In [7]:
# A line counts as recognized when none of its metre annotations has type "N";
# a poem counts as recognized when that holds for every one of its lines.
recognized_poem_cnt = 0
recognized_line_cnt = 0

for poem in all_poems_data:
    types_in_poem = set()
    positions_in_poem = set()

    for stanza in poem["body"]:
        for line in stanza:
            annotations = line["metre"]
            types_in_line = {annotation["type"] for annotation in annotations}
            # Union of all position symbols used by this line's patterns.
            positions_in_line = set().union(*(annotation["pattern"] for annotation in annotations))

            types_in_poem |= types_in_line
            positions_in_poem |= positions_in_line

            if "N" in types_in_line:
                continue

            # Sanity check: the "X" placeholder position must only occur with type "N".
            assert "X" not in positions_in_line
            recognized_line_cnt += 1

    if "N" in types_in_poem:
        continue

    assert "X" not in positions_in_poem
    recognized_poem_cnt += 1

recognized_poem_share = recognized_poem_cnt / len(all_poems_data) * 100
print(f"Poems with recognized metres cnt: {recognized_poem_cnt} ({recognized_poem_share} %)")
recognized_line_share = recognized_line_cnt / line_cnt * 100
print(f"Lines with recognized metres cnt: {recognized_line_cnt} ({recognized_line_share} %)")

Poems with recognized metres cnt: 60458 (91.01282591678208 %)
Lines with recognized metres cnt: 2088508 (90.37572530731308 %)


## Multimetric lines

In [8]:
# Keep (text, metre annotations) for every line annotated with more than one metre.
multimetric_lines = [
    (line["text"], line["metre"])
    for poem in all_poems_data
    for stanza in poem["body"]
    for line in stanza
    if len(line["metre"]) > 1
]

multimetric_share = len(multimetric_lines) / line_cnt * 100
print(f"Multimetric lines cnt: {len(multimetric_lines)} ({multimetric_share} %)")

Multimetric lines cnt: 12182 (0.5271500447657791 %)


In [9]:
# Sample of the collected (text, metre annotations) tuples.
multimetric_lines[10:15]

[('rozum mi nutí vždycky stát',
  [{'foot': '4', 'clause': 'm', 'pattern': 'WSWSWSWS', 'type': 'J'},
   {'foot': '3', 'clause': 'a', 'pattern': 'SVWSWSVW', 'type': 'X'}]),
 ('šílený cit.',
  [{'foot': '2', 'clause': 'm', 'pattern': 'SVWS', 'type': 'X'},
   {'foot': '2', 'clause': 'm', 'pattern': 'WSWS', 'type': 'J'}]),
 ('Chmury se honí po nebi,',
  [{'foot': '3', 'clause': 'a', 'pattern': 'SVWSWSVW', 'type': 'X'},
   {'foot': '4', 'clause': 'm', 'pattern': 'WSWSWSWS', 'type': 'J'}]),
 ('do mlhy kraj se halí – – –',
  [{'foot': '3', 'clause': 'f', 'pattern': 'WSWSWSW', 'type': 'J'},
   {'foot': '3', 'clause': 'a', 'pattern': 'SVWSWSV', 'type': 'X'}]),
 ('Proč jen ty sny se vzbouzejí,',
  [{'foot': '4', 'clause': 'm', 'pattern': 'WSWSWSWS', 'type': 'J'},
   {'foot': '3', 'clause': 'a', 'pattern': 'SVWSWSVW', 'type': 'X'}])]

In [10]:
print("Metre counts in multimetric lines:")
# How many metre annotations each multimetric line carries.
Counter(len(annotations) for _text, annotations in multimetric_lines)

Metre counts in multimetric lines:


Counter({2: 11011, 3: 1171})

In [11]:
print("Metre combinations in multimetric lines:")
# Frequency of each distinct set of metre types annotated on a single line.
Counter(
    frozenset(annotation["type"] for annotation in annotations)
    for _text, annotations in multimetric_lines
)

Metre combinations in multimetric lines:


Counter({frozenset({'J', 'X'}): 4801,
         frozenset({'D', 'J'}): 4940,
         frozenset({'T', 'X'}): 617,
         frozenset({'D', 'J', 'X'}): 1167,
         frozenset({'D', 'X'}): 552,
         frozenset({'X', 'hexameter'}): 44,
         frozenset({'D', 'T'}): 11,
         frozenset({'A', 'J'}): 26,
         frozenset({'Y'}): 20,
         frozenset({'D', 'J', 'T'}): 4})

In [12]:
# Annotation error: multimetric lines whose annotations are all of type "Y".

from pprint import pprint

for text_and_metres in multimetric_lines:
    # Element 1 of the tuple is the list of metre annotations of the line.
    if all(annotation["type"] == "Y" for annotation in text_and_metres[1]):
        pprint(text_and_metres)
        print()

('NA poli, na roli,',
 [{'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'},
  {'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'}])

('na louce zelené,',
 [{'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'},
  {'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'}])

('co je tu klopoty,',
 [{'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'},
  {'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'}])

('srdéčko milené!',
 [{'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'},
  {'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'}])

('Od rána do noci,',
 [{'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'},
  {'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'}])

('od jara v závěje,',
 [{'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'},
  {'clause': 'm', 'foot': '3', 'pattern': 'WSWSWS', 'type': 'Y'}])

('patří mi na čele',
 [{'clause': 'm', 'foot': '3', 'pattern': 'WSWS

## Polymetric poems

In [13]:
# A poem is counted as polymetric when its lines do not all share the same set of
# metre types.
polymetric_poems_cnt = 0

for poem in all_poems_data:
    # One frozenset of metre types per line; the outer set keeps the distinct ones.
    distinct_line_metres = {
        frozenset(metre["type"] for metre in line["metre"])
        for stanza in poem["body"]
        for line in stanza
    }

    if len(distinct_line_metres) > 1:
        polymetric_poems_cnt += 1

polymetric_share = polymetric_poems_cnt / len(all_poems_data) * 100
print(f"Polymetric poems cnt: {polymetric_poems_cnt} ({polymetric_share} %)")

Polymetric poems cnt: 2619 (3.9426145601252482 %)


## Poems metres

In [14]:
# poems_metres: how many poems share each metre signature.
# poem_idx_by_metres: indices of the poems carrying each signature.
poems_metres = Counter()
poem_idx_by_metres = defaultdict(list)

for poem_idx, poem in enumerate(all_poems_data):
    # Signature of a poem = set of per-line labels, where a label is the line's
    # sorted metre types concatenated into one string (e.g. "DJX" for a line
    # annotated as D, J and X; multi-character types are concatenated as well).
    signature = frozenset(
        "".join(sorted({metre["type"] for metre in line["metre"]}))
        for stanza in poem["body"]
        for line in stanza
    )

    poems_metres[signature] += 1
    poem_idx_by_metres[signature].append(poem_idx)

In [15]:
# Number of distinct metre signatures found across all poems.
len(poems_metres)

109

In [16]:
# All signatures with their poem counts, most frequent first.
poems_metres.most_common()

[(frozenset({'J'}), 31598),
 (frozenset({'T'}), 22136),
 (frozenset({'N'}), 5857),
 (frozenset({'D'}), 2076),
 (frozenset({'J', 'T'}), 936),
 (frozenset({'X'}), 895),
 (frozenset({'A'}), 425),
 (frozenset({'D', 'J'}), 366),
 (frozenset({'JX'}), 288),
 (frozenset({'DJ'}), 233),
 (frozenset({'D', 'X'}), 185),
 (frozenset({'D', 'T'}), 151),
 (frozenset({'hexameter', 'pentameter'}), 110),
 (frozenset({'hexameter'}), 106),
 (frozenset({'Y'}), 97),
 (frozenset({'A', 'J'}), 96),
 (frozenset({'D', 'JX'}), 81),
 (frozenset({'J', 'X'}), 67),
 (frozenset({'T', 'X'}), 48),
 (frozenset({'A', 'D'}), 48),
 (frozenset({'DJ', 'T'}), 46),
 (frozenset({'J', 'N'}), 44),
 (frozenset({'D', 'J', 'T'}), 42),
 (frozenset({'DJX'}), 42),
 (frozenset({'J', 'Y'}), 41),
 (frozenset({'N', 'T'}), 32),
 (frozenset({'A', 'Y'}), 28),
 (frozenset({'J', 'T', 'X'}), 26),
 (frozenset({'TX'}), 25),
 (frozenset({'J', 'N', 'T'}), 18),
 (frozenset({'D', 'J', 'X'}), 17),
 (frozenset({'D', 'T', 'X'}), 17),
 (frozenset({'DX'}), 16

In [17]:
# First poem whose line labels are exactly {"DJX", "T"}, i.e. every line is either
# annotated as D, J and X at once or as T.
all_poems_data[poem_idx_by_metres[frozenset({"DJX", "T"})][0]]

{'p_author': {'born': 1864,
  'died': 1942,
  'name': 'Machar, Josef Svatopluk',
  'identity': 'Machar, Josef Svatopluk'},
 'biblio': {'motto_aut': 'Machar, Josef Svatopluk',
  'b_subtitle': '1903',
  'publisher': 'Nová Krameriova expedice; Zápotočný, Josef B.',
  'edition': '[1.]',
  'motto': 'Dívčina podkasaná luk a toulec mi dává: Zkus trochu, velitel náš, tvůj otec Foibos, teď spí!',
  'p_title': 'EXTRAKT NAŠEHO DĚJEPISU.',
  'place': 'Rokycany',
  'dedication': None,
  'b_title': 'Satiricon',
  'pages': '92',
  'year': '1904',
  'signature': 'ÚČL AV ČR; 104 VIII 10'},
 'book_id': '0601',
 'poem_id': '0001-0000-0000-0002-0000',
 'b_author': {'born': 1864,
  'died': 1942,
  'name': 'Machar, Josef Svatopluk',
  'identity': 'Machar, Josef Svatopluk'},
 'body': [[{'text': 'Duše k smrti dřímá',
    'punct': {},
    'words': [{'token_lc': 'duše',
      'xsampa': 'duSE',
      'morph': 'NNFS1-----A-----',
      'phoebe': 'duSe',
      'token': 'Duše',
      'lemma': 'duše'},
     {'token_

## Metres
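- T = trochee (trochej)
- X = dactylotrochee (daktylotrochej)
- J = iamb (jamb)
- D = dactyl (daktyl)
- N = not recognized (nerozpoznáno)
- Y = dactylotrochee with anacrusis (daktylotrochej s předrážkou)
- A = amphibrach (amfibrach)
- hexameter, pentameter = spelled out in full in the data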

In [18]:
# metres[type][pattern][(stress, clause, foot)] -> number of annotated lines.
metres = defaultdict(lambda: defaultdict(Counter))

verse_lines = (
    line
    for poem in all_poems_data
    for stanza in poem["body"]
    for line in stanza
)

for line in verse_lines:
    for metre in line["metre"]:
        realization_counts = metres[metre["type"]][metre["pattern"]]
        realization_counts[(line["stress"], metre["clause"], metre["foot"])] += 1

In [19]:
# Metre types present in the corpus, in order of first occurrence.
metres.keys()

dict_keys(['T', 'J', 'X', 'N', 'A', 'D', 'pentameter', 'hexameter', 'Y'])

### Trochee - S1 W1 S2 W2 ... Sn (Wn)

In [20]:
# First ten "T" patterns, in order of first occurrence in the corpus.
list(metres["T"])[:10]

['SWSWSWSWSWSWSW',
 'SWSWSWSW',
 'SWSWSW',
 'SWSWSWSWSW',
 'SWSWSWS',
 'SWSWSWSWSWSW',
 'SWSWSWSWSWSWSWSW',
 'SWSWSWSWS',
 'SWSWS',
 'SWSW']

In [45]:
# Most frequent (stress, clause, foot) realizations of the first "T" pattern.
first_pattern, realization_counts = list(metres["T"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

SWSWSWSWSWSWSW


[(('10101010101010', 'f', '7'), 438),
 (('10001010101010', 'f', '7'), 309),
 (('10101000101010', 'f', '7'), 304),
 (('10101010100010', 'f', '7'), 266),
 (('10101000100010', 'f', '7'), 236),
 (('10001010100010', 'f', '7'), 213),
 (('10001000101010', 'f', '7'), 198),
 (('10101010101000', 'f', '7'), 157),
 (('10001010101000', 'f', '7'), 155),
 (('10001000100010', 'f', '7'), 120),
 (('00101010101010', 'f', '7'), 119),
 (('10101000101000', 'f', '7'), 113),
 (('10001000101000', 'f', '7'), 109),
 (('00101010100010', 'f', '7'), 68),
 (('00101000101010', 'f', '7'), 62),
 (('10100010101010', 'f', '7'), 56),
 (('10100010100010', 'f', '7'), 52),
 (('10100010101000', 'f', '7'), 48),
 (('00101010101000', 'f', '7'), 43),
 (('00101000100010', 'f', '7'), 38)]

### Dactylotrochee - S1 (V1) W1 S2 (V2) W2 ... Sn ((Vn) Wn)

In [22]:
# First ten "X" patterns, in order of first occurrence in the corpus.
list(metres["X"])[:10]

['SVWSWSVW',
 'SVWS',
 'SVWSWSVWS',
 'SVWSWSWSW',
 'SVWSW',
 'SVWSWSWSWS',
 'WSVWSVWSVWSW',
 'SVWSWSVWSW',
 'WSVWSWSVW',
 'SVWSVWSVWSW']

In [46]:
# Most frequent (stress, clause, foot) realizations of the first "X" pattern.
first_pattern, realization_counts = list(metres["X"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

SVWSWSVW


[(('10010100', 'a', '3'), 2787),
 (('10010101', 'a', '3'), 797),
 (('01010100', 'a', '3'), 784),
 (('00010100', 'a', '3'), 241),
 (('01010101', 'a', '3'), 221),
 (('10000100', 'a', '3'), 212),
 (('11010100', 'a', '3'), 206),
 (('10110100', 'a', '3'), 122),
 (('10100100', 'a', '3'), 62),
 (('01000100', 'a', '3'), 57),
 (('10000101', 'a', '3'), 50),
 (('10010001', 'a', '3'), 50),
 (('00010101', 'a', '3'), 41),
 (('11010101', 'a', '3'), 39),
 (('10110101', 'a', '3'), 31),
 (('10010010', 'a', '3'), 31),
 (('00110100', 'a', '3'), 27),
 (('10010000', 'a', '3'), 27),
 (('01010001', 'a', '3'), 25),
 (('01000101', 'a', '3'), 21)]

### Iamb - WS WS ... (W)

In [24]:
# First ten "J" patterns, in order of first occurrence in the corpus.
list(metres["J"])[:10]

['WSWSWSWSWSW',
 'WSWSWSWSW',
 'WSWSWSWS',
 'WSWS',
 'WSWSWS',
 'WSWSWSWSWS',
 'WSWSWSW',
 'WSWSWSWSWSWSW',
 'WSWSWSWSWSWS',
 'WSVWSWSWS']

In [47]:
# Most frequent (stress, clause, foot) realizations of the first "J" pattern.
first_pattern, realization_counts = list(metres["J"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

WSWSWSWSWSW


[(('01010101010', 'f', '5'), 54757),
 (('01000101010', 'f', '5'), 39280),
 (('01010100010', 'f', '5'), 36870),
 (('01010101000', 'f', '5'), 23639),
 (('01010001010', 'f', '5'), 22747),
 (('10010101010', 'f', '5'), 21835),
 (('01000100010', 'f', '5'), 21833),
 (('10010100010', 'f', '5'), 17430),
 (('01000101000', 'f', '5'), 17161),
 (('10010101000', 'f', '5'), 12542),
 (('11010101010', 'f', '5'), 11162),
 (('11000101010', 'f', '5'), 8807),
 (('01010001000', 'f', '5'), 7536),
 (('11010100010', 'f', '5'), 7508),
 (('10010001010', 'f', '5'), 6631),
 (('11000100010', 'f', '5'), 4941),
 (('11010101000', 'f', '5'), 4846),
 (('11010001010', 'f', '5'), 4501),
 (('11000101000', 'f', '5'), 3765),
 (('01001101010', 'f', '5'), 3439)]

### Dactyl - S1 V1 W1 S2 V2 W2 ... Sn ((Vn) Wn)

In [28]:
# First ten "D" patterns, in order of first occurrence in the corpus.
list(metres["D"])[:10]

['SVWSVWSVW',
 'SVWSV',
 'SVWSVWSVWSVW',
 'SVWSVWSVWSV',
 'SVWSVWS',
 'SVWS',
 'SVWSWSVWSV',
 'SVWSVWSV',
 'SVWSWSVW',
 'SVWSVWSWSV']

In [48]:
# Most frequent (stress, clause, foot) realizations of the first "D" pattern.
first_pattern, realization_counts = list(metres["D"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

SVWSVWSVW


[(('100100100', 'a', '3'), 3539),
 (('010100100', 'a', '3'), 799),
 (('100100101', 'a', '3'), 631),
 (('110100100', 'a', '3'), 260),
 (('000100100', 'a', '3'), 246),
 (('101100100', 'a', '3'), 218),
 (('100101100', 'a', '3'), 211),
 (('010010100', 'a', '3'), 190),
 (('010100101', 'a', '3'), 155),
 (('100010100', 'a', '3'), 124),
 (('100110100', 'a', '3'), 76),
 (('100101001', 'a', '3'), 58),
 (('100000100', 'a', '3'), 58),
 (('010000100', 'a', '3'), 58),
 (('100101101', 'a', '3'), 47),
 (('010010101', 'a', '3'), 42),
 (('110010100', 'a', '3'), 42),
 (('110100101', 'a', '3'), 42),
 (('010110100', 'a', '3'), 38),
 (('010101100', 'a', '3'), 38)]

### Not recognized

In [102]:
# First ten "N" patterns, in order of first occurrence in the corpus.
list(metres["N"])[:10]

['XXXXXXXXX',
 'XXXXXXXXXXXXX',
 'XXXXX',
 'XXXXXXXXXXX',
 'XXXXXXXXXX',
 'XXXXXXXXXXXX',
 'XXXXXXXXXXXXXXX',
 'XXXXXXXXXXXXXXXXX',
 'XXXXXXXX',
 'XXXXXX']

In [49]:
# Most frequent (stress, clause, foot) realizations of the first "N" pattern.
first_pattern, realization_counts = list(metres["N"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

XXXXXXXXX


[(('100100100', 'a', ''), 2334),
 (('010010010', 'a', ''), 1735),
 (('100101010', 'a', ''), 1560),
 (('100101000', 'a', ''), 1455),
 (('010010100', 'a', ''), 1032),
 (('100100010', 'a', ''), 791),
 (('101010100', 'a', ''), 749),
 (('010100100', 'a', ''), 706),
 (('100010100', 'a', ''), 674),
 (('100101001', 'a', ''), 646),
 (('010101010', 'a', ''), 535),
 (('100100101', 'a', ''), 494),
 (('100010010', 'a', ''), 471),
 (('101010010', 'a', ''), 431),
 (('010101000', 'a', ''), 429),
 (('101000100', 'a', ''), 389),
 (('101001010', 'a', ''), 365),
 (('010100010', 'a', ''), 348),
 (('101001000', 'a', ''), 344),
 (('010001010', 'a', ''), 333)]

### Dactylotrochee with anacrusis - W0 S1 (V1) W1 S2 (V2) W2 ... Sn ((Vn) Wn)

In [103]:
# First ten "Y" patterns, in order of first occurrence in the corpus.
list(metres["Y"])[:10]

['WSVWSVWSWSW',
 'WSVWSVWSVWSW',
 'WSVWSWSWSW',
 'WSVWSWSWS',
 'WSVWSWSVW',
 'WSWSWSVWSVW',
 'WSVWSWSV',
 'WSVWSV',
 'WSVWSWSVWS',
 'WSVWSVWSVW']

In [50]:
# Most frequent (stress, clause, foot) realizations of the first "Y" pattern.
first_pattern, realization_counts = list(metres["Y"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

WSVWSVWSWSW


[(('01001001010', 'f', '4'), 76),
 (('01001001000', 'f', '4'), 22),
 (('11001001010', 'f', '4'), 20),
 (('11001001000', 'f', '4'), 6),
 (('01001011010', 'f', '4'), 4),
 (('01001010010', 'f', '4'), 3),
 (('01001101010', 'f', '4'), 2),
 (('01001000010', 'f', '4'), 2),
 (('11101001010', 'f', '4'), 2),
 (('01011001010', 'f', '4'), 2),
 (('11001010010', 'f', '4'), 1),
 (('11011001010', 'f', '4'), 1),
 (('01101001010', 'f', '4'), 1),
 (('01000001010', 'f', '4'), 1),
 (('11011000010', 'f', '4'), 1)]

### Dactyl with anacrusis (Amphibrach) - W0 S1 V1 W1 S2 V2 W2 ... Sn ((Vn) Wn)

In [104]:
# First ten "A" patterns, in order of first occurrence in the corpus.
list(metres["A"])[:10]

['WSVWSVWSV',
 'WSVWSV',
 'SVWSVWSVW',
 'WSVWSVWS',
 'SVWSVWSVWS',
 'WSVWSVWSVWS',
 'WSVWSWSV',
 'WSVWSVWSVW',
 'WSVWSWSVWS',
 'WSVWSVW']

In [51]:
# Most frequent (stress, clause, foot) realizations of the first "A" pattern.
first_pattern, realization_counts = list(metres["A"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

WSVWSVWSV


[(('010010010', 'a', '3'), 2492),
 (('110010010', 'a', '3'), 509),
 (('010110010', 'a', '3'), 140),
 (('100010010', 'a', '3'), 127),
 (('010010110', 'a', '3'), 119),
 (('011010010', 'a', '3'), 40),
 (('110110010', 'a', '3'), 35),
 (('010001010', 'a', '3'), 33),
 (('010100010', 'a', '3'), 32),
 (('010010000', 'a', '3'), 31),
 (('101010010', 'a', '3'), 30),
 (('010011010', 'a', '3'), 29),
 (('010000010', 'a', '3'), 25),
 (('010010100', 'a', '3'), 20),
 (('001010010', 'a', '3'), 18),
 (('110010110', 'a', '3'), 15),
 (('000010010', 'a', '3'), 14),
 (('010101010', 'a', '3'), 11),
 (('010110110', 'a', '3'), 10),
 (('110001010', 'a', '3'), 9)]

### Hexameter

In [95]:
# Number of distinct patterns recorded for the "hexameter" type.
len(metres["hexameter"])

93

In [105]:
# First ten "hexameter" patterns, in order of first occurrence in the corpus.
list(metres["hexameter"])[:10]

['SVWSVWSVWSVWSVWSW',
 'SVWSVWSWSVWSVWSW',
 'SWSWSWSWSVWSV',
 'SWSWSVWSVWSVWSW',
 'SVWSWSWSWSVWSW',
 'SVWSWSWSVWSVWSW',
 'SWSWSVWSWSVWSW',
 'SWSVWSWSVWSVWSV',
 'SVWSWSVWSWSVWSV',
 'SVWSVWSWSWSVWSW']

In [52]:
# Most frequent (stress, clause, foot) realizations of the first "hexameter" pattern.
first_pattern, realization_counts = list(metres["hexameter"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

SVWSVWSVWSVWSVWSW


[(('10010010010010010', 'f', '6'), 199),
 (('10010001010010010', 'f', '6'), 53),
 (('10010011010010010', 'f', '6'), 52),
 (('10010010110010010', 'f', '6'), 10),
 (('10010000010010010', 'f', '6'), 8),
 (('01010010010010010', 'f', '6'), 7),
 (('10010010011010010', 'f', '6'), 6),
 (('10010100010010010', 'f', '6'), 6),
 (('10011010010010010', 'f', '6'), 6),
 (('10010010100010010', 'f', '6'), 5),
 (('10110010010010010', 'f', '6'), 5),
 (('10010010010110010', 'f', '6'), 5),
 (('10010010001010010', 'f', '6'), 4),
 (('00010010010010010', 'f', '6'), 4),
 (('10011011010010010', 'f', '6'), 4),
 (('10010110010010010', 'f', '6'), 4),
 (('10001010010010010', 'f', '6'), 3),
 (('10010001010010000', 'f', '6'), 3),
 (('10000010010010010', 'f', '6'), 3),
 (('10010001011010010', 'f', '6'), 3)]

In [134]:
# Number of poems in which every line is labelled exactly "hexameter".
len(poem_idx_by_metres[frozenset({"hexameter"})])

106

In [135]:
# Show the fourth poem (list index 3) among the purely hexametric ones.
all_poems_data[poem_idx_by_metres[frozenset({"hexameter"})][3]]

{'p_author': {'born': 1769,
  'died': 1847,
  'name': 'Hek, František Vladislav',
  'identity': 'Hek, František Vladislav'},
 'biblio': {'motto_aut': None,
  'b_subtitle': None,
  'publisher': 'Novočeská knihovna; Jakubec, Jan; Česká akademie věd a umění; Leschinger, Edvard',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'Komu platí přízvuk.',
  'place': 'Praha',
  'dedication': None,
  'b_title': 'Práce veršované (in Sebrané spisy Františka Vladislava Heka, svazek 1)',
  'pages': 'XXVIII+148',
  'year': '1917',
  'signature': 'ÚČL AV ČR; 488 VIII 1/I'},
 'book_id': '0180',
 'poem_id': '0001-0001-0000-0069-0000',
 'b_author': {'born': 1769,
  'died': 1847,
  'name': 'Hek, František Vladislav',
  'identity': 'Hek, František Vladislav'},
 'body': [[{'text': 'Statečný Agamemnon jak rychle se do českých básní',
    'punct': {},
    'words': [{'token_lc': 'statečný',
      'xsampa': 'statEt_Sni:',
      'morph': 'AAMS1----1A-----',
      'phoebe': 'stateCnI',
      'token': 'Statečný',

### Pentameter

In [109]:
# First ten "pentameter" patterns, in order of first occurrence in the corpus.
list(metres["pentameter"])[:10]

['SVWSVWSWSWSVWS',
 'SVWSWSWSVWSVWS',
 'SVWSVWSSVWSVWS',
 'SVWSWSSVWSWS',
 'SWSVWSSVWSVWS',
 'SVWSWSSVWSVWS',
 'SWSWSSVWSVWS',
 'SVWSVWSWSVWSWS',
 'SVWSVWSVWSVWSWS',
 'SVWSVWSWSVWSVWS']

In [53]:
# Most frequent (stress, clause, foot) realizations of the first "pentameter" pattern.
first_pattern, realization_counts = list(metres["pentameter"].items())[0]
print(first_pattern)
realization_counts.most_common(20)

SVWSVWSWSWSVWS


[(('10010010101001', 'm', '6'), 1)]

In [111]:
# Number of poems in which every line is labelled exactly "pentameter".
len(poem_idx_by_metres[frozenset({"pentameter"})])

9

In [136]:
# Show the first poem among the purely pentametric ones.
all_poems_data[poem_idx_by_metres[frozenset({"pentameter"})][0]]

{'p_author': {'born': 1853,
  'died': 1912,
  'name': 'Vrchlický, Jaroslav',
  'identity': 'Vrchlický, Jaroslav'},
 'biblio': {'motto_aut': None,
  'b_subtitle': '1898-1899',
  'publisher': 'Souborné vydání básnických spisů Jaroslava Vrchlického; Otto, Jan',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'VIII. Nicotin.',
  'place': 'Praha',
  'dedication': None,
  'b_title': 'Překročen zenit...',
  'pages': '208',
  'year': '1899',
  'signature': 'ÚČL AV ČR; 212 VIII 140'},
 'book_id': '1059',
 'poem_id': '0001-0006-0000-0008-0000',
 'b_author': {'born': 1853,
  'died': 1912,
  'name': 'Vrchlický, Jaroslav',
  'identity': 'Vrchlický, Jaroslav'},
 'body': [[{'text': 'Moderních čivů ty’s v hnědé kápi zpovědník, vím,',
    'punct': {'8': ',', '7': ','},
    'words': [{'token_lc': 'moderních',
      'xsampa': 'modErJi:x',
      'morph': 'AAFP2----1A-----',
      'phoebe': 'moderNIx',
      'token': 'Moderních',
      'lemma': 'moderní'},
     {'token_lc': 'čivů',
      'xsampa': 't_SI

## Clause

In [37]:
# clauses[clause][pattern] -> number of metre annotations with that clause value
# and pattern.
clauses = defaultdict(Counter)

metre_annotations = (
    metre
    for poem in all_poems_data
    for stanza in poem["body"]
    for line in stanza
    for metre in line["metre"]
)

for annotation in metre_annotations:
    pattern_counts = clauses[annotation["clause"]]
    pattern_counts[annotation["pattern"]] += 1

### Masculine

In [42]:
# Twenty most frequent patterns annotated with clause "m".
clauses["m"].most_common(20)

[('WSWSWSWSWS', 228126),
 ('WSWSWSWS', 164591),
 ('SWSWSWS', 97379),
 ('SWSWSWSWS', 68108),
 ('WSWSWS', 42241),
 ('WSWSWSWSWSWS', 36505),
 ('SWSWSWSWSWS', 18662),
 ('SWSWS', 18637),
 ('SVWSVWSVWS', 8195),
 ('WSWS', 7815),
 ('SWSWSWSWSWSWSWS', 4807),
 ('SWS', 4011),
 ('WSWSWSWWSWSWS', 3894),
 ('SVWSVWS', 3320),
 ('SVWSWSVWS', 3170),
 ('WSWSWSWSWSWSWS', 3056),
 ('WSVWSVWSVWS', 2223),
 ('WSWSWSWSWSWSWSWS', 1857),
 ('SWSWSWSWSWSWS', 1736),
 ('SVWS', 1561)]

In [39]:
# Every pattern annotated with clause "m" is expected to end with "S".
for pattern in clauses["m"]:
    # endswith() also copes with an empty pattern, which pattern[-1] would turn into
    # an IndexError instead of a failed assertion; the message names the offender.
    assert pattern.endswith("S"), f"Masculine pattern does not end with 'S': {pattern!r}"

### Feminine

In [43]:
# Twenty most frequent patterns annotated with clause "f".
clauses["f"].most_common(20)

[('WSWSWSWSWSW', 401928),
 ('SWSWSWSW', 270774),
 ('SWSWSWSWSW', 202093),
 ('WSWSWSWSW', 102829),
 ('SWSWSW', 69679),
 ('WSWSWSWSWSWSW', 57528),
 ('WSWSWSW', 57043),
 ('SWSWSWSWSWSW', 53300),
 ('WSWSW', 14964),
 ('SVWSWSVWSW', 10285),
 ('SWSW', 9525),
 ('SWSWSWSWSWSWSWSW', 6084),
 ('WSWSWSWSWSWSWSW', 5483),
 ('SWSWSWSWSWSWSW', 4348),
 ('SVWSW', 3179),
 ('SWSWSVWSWSW', 2550),
 ('SWSWSWSSWSWSW', 1406),
 ('WSW', 1400),
 ('WSWSWSWWSWSWSW', 1214),
 ('SVWSVWSWSW', 1014)]

In [41]:
# Every pattern annotated with clause "f" must close with "SW".
assert all(pattern[-2:] == "SW" for pattern in clauses["f"])

### Acatalectic

In [44]:
# Twenty most frequent patterns annotated with clause "a".
clauses["a"].most_common(20)

[('XXXXXXXX', 31888),
 ('XXXXXXXXXX', 29919),
 ('XXXXXXXXX', 23832),
 ('XXXXXXXXXXX', 20240),
 ('SVWSVWSV', 16896),
 ('XXXXXXXXXXXX', 15136),
 ('XXXXXXXXXXXXX', 15086),
 ('XXXXXXX', 14455),
 ('XXXXXXXXXXXXXX', 12791),
 ('XXXXXXXXXXXXXXX', 11992),
 ('SVWSVWSVWSV', 11790),
 ('XXXXXX', 11674),
 ('XXXXXXXXXXXXXXXX', 10417),
 ('SVWSWSVW', 7833),
 ('SVWSVWSVW', 7783),
 ('XXXXX', 7386),
 ('SVWSVW', 6862),
 ('SVWSV', 5155),
 ('XXXXXXXXXXXXXXXXX', 4827),
 ('WSVWSVWSV', 4665)]

In [119]:
# Report every clause-"a" pattern that ends with none of "X", "SV" or "SVW".
for pattern in clauses["a"]:
    if pattern[-1] != "X" and pattern[-2:] != "SV" and pattern[-3:] != "SVW":
        print(f"Error: {pattern}")

Error: W


## Ghazal poems

In [138]:
# Collect indices of poems in which at least one metre pattern contains "-"
# (per the section heading these are treated as ghazals — TODO confirm the marker).
ghazal_idxs = set()

for idx, poem in enumerate(all_poems_data):
    poem_patterns = (
        metre["pattern"]
        for stanza in poem["body"]
        for line in stanza
        for metre in line["metre"]
    )

    if any("-" in pattern for pattern in poem_patterns):
        ghazal_idxs.add(idx)

# NOTE: the list inherits the set's iteration order, which is not guaranteed to be
# ascending.
ghazal_idxs = list(ghazal_idxs)

In [139]:
# Number of poems with at least one "-" position in a metre pattern.
len(ghazal_idxs)

93

In [140]:
# Show one of the collected poems (list index 5 of ghazal_idxs).
all_poems_data[ghazal_idxs[5]]

{'p_author': {'born': 1853,
  'died': 1912,
  'name': 'Vrchlický, Jaroslav',
  'identity': 'Vrchlický, Jaroslav'},
 'biblio': {'motto_aut': None,
  'b_subtitle': 'Básně, 1894-1897',
  'publisher': 'Nové souborné vydání básnických spisů Jaroslava Vrchlického; Voborník, Jan; Otto, Jan; Průmyslová tiskárna v Praze; Unie',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'XXVIII. Řekni sama, nebyli jsme děti včera',
  'place': 'Praha',
  'dedication': None,
  'b_title': 'Kvítí Perdity',
  'pages': '160',
  'year': '1930',
  'signature': 'ÚČL AV ČR; 212 VIII 125'},
 'book_id': '1032',
 'poem_id': '0001-0007-0000-0029-0000',
 'b_author': {'born': 1853,
  'died': 1912,
  'name': 'Vrchlický, Jaroslav',
  'identity': 'Vrchlický, Jaroslav'},
 'body': [[{'text': 'Řekni sama, nebyli jsme děti včera ',
    'punct': {'2': ','},
    'words': [{'token_lc': 'řekni',
      'xsampa': 'P\\EkJI',
      'morph': 'Vi-S---2--A----P',
      'phoebe': 'RekNi',
      'token': 'Řekni',
      'lemma': 'říci'},
 