# Create datasets

In [1]:
%load_ext dotenv
%dotenv
import os

# Working directory of the project, read from the WORKING_DIR environment variable
# (expected to be provided through the %dotenv magic above - TODO confirm the .env location)
base_dir = os.getenv("WORKING_DIR")
if not base_dir:
    # os.getenv returns None for an unset variable and os.chdir(None) fails with an opaque TypeError;
    # stop here with a message that says what has to be configured
    raise RuntimeError("Environment variable WORKING_DIR is not set; it must point to the working directory of the project.")

# Relative paths used in the rest of the notebook are resolved against this directory
os.chdir(base_dir)

In [2]:
# Default random seed of the shuffle and of the train/dev/test split
SEED = 88

# Fractions of the data that go to the train, dev and test splits
TRAIN_RATIO, DEV_RATIO, TEST_RATIO = 0.7, 0.15, 0.15

# Relative path of the Corpus of Czech Verse directory
CCV_DATA_PATH = "resources/corpusCzechVerse/ccv"

In [3]:
import glob
import json

In [4]:
from src.data_loader_and_saver import JSONDataLoaderAndSaver

# Shared JSON loader/saver used by all save calls below; input and output data directories are both "src/data"
data_loader_and_saver = JSONDataLoaderAndSaver(base_dir, input_data_dir="src/data", output_data_dir="src/data")

In [15]:
from src.kveta.sampa_syllable_parser import SampaSyllableParser
from src.util import Util


def contains_err(poem_data: dict) -> bool:
    """
    Check whether annotated poem data contain an annotation error.
    A line is erroneous when the length of one of its normalized metrical patterns differs
    from the number of syllables counted in the X-SAMPA transcriptions of its words.
    The first mismatch found is printed and the check stops there.
    :param poem_data: Data of one poem
    :return: True if the poem data contain an annotation error, False otherwise
    """
    syllable_parser = SampaSyllableParser()

    # Lines of all stanzas, visited as one flat sequence
    poem_lines = (line_data for stanza_data in poem_data["body"] for line_data in stanza_data)

    for line_data in poem_lines:
        # Syllable count of the whole line = sum over its words
        num_syllables = 0
        for word in line_data["words"]:
            num_syllables += syllable_parser.get_syllable_cnt(word["xsampa"])

        for metre in line_data["metre"]:
            pattern_length = len(Util.normalize_metrical_pattern(metre["pattern"]))
            if pattern_length == num_syllables:
                continue

            print(f"Annotation error, Pattern {metre['pattern']}, Num syllables: {num_syllables}, Text: {line_data['text']}")
            return True

    return False

In [6]:
import random


def shuffle_data(data: list, seed: int = SEED) -> list:
    """
    Shuffle data in place, reproducibly for a given seed.
    A dedicated random generator is used, so the state of the global generator
    of the `random` module is not changed by the call.
    :param data: Data to shuffle (the list itself is reordered)
    :param seed: Random seed
    :return: The same list object, shuffled
    """
    # random.Random(seed).shuffle gives the same permutation as random.seed(seed) followed by
    # random.shuffle, but without reseeding the module-wide generator as a hidden side effect
    random.Random(seed).shuffle(data)

    return data

In [7]:
from sklearn.model_selection import train_test_split
from typing import Tuple


def split_train_dev_test(
        data: list,
        train_ratio: float = TRAIN_RATIO,
        dev_ratio: float = DEV_RATIO,
        test_ratio: float = TEST_RATIO,
        seed: int = SEED
) -> Tuple[list, list, list]:
    """
    Shuffle data and split it into train, dev and test.
    The split is done in two steps: train is cut off first, then everything that is left is
    divided between dev and test in the proportion dev_ratio : test_ratio.
    The three ratios are expected to sum to 1; this is not checked, and when they do not,
    the dev and test parts still take the whole remainder after the train split.
    The resulting counts are printed.
    :param data: Data to split
    :param train_ratio: Ratio of train data
    :param dev_ratio: Ratio of dev data
    :param test_ratio: Ratio of test data
    :param seed: Random seed (used for both splitting steps)
    :return: Data splits in the order train, dev, test
    """
    # Step 1: train vs. the rest (dev and test together)
    train_data, rest_data = train_test_split(data, train_size=train_ratio, random_state=seed)

    # Step 2: the rest is divided between dev and test; the ratio is rescaled to the size of the rest
    dev_data, test_data = train_test_split(rest_data, train_size=dev_ratio / (dev_ratio + test_ratio), random_state=seed)

    print(f"Train data cnt: {len(train_data)}, Dev data cnt: {len(dev_data)}, Test data cnt: {len(test_data)}")

    return train_data, dev_data, test_data

In [8]:
def get_X(poems_data: list) -> list:
    """
    Build the X dataset from poems data.
    Each poem becomes a list of its lines (lines of all stanzas in order, stanza boundaries
    are dropped) and each line a list with one feature dict per word.
    :param poems_data: Poems data
    :return: X dataset, nested as poems -> lines -> word feature dicts
    """
    dataset = []

    for poem in poems_data:
        poem_lines = []

        for stanza in poem["body"]:
            for line in stanza:
                line_words = []

                for word in line["words"]:
                    # Word-level fields are copied from the word, author and year from the poem
                    line_words.append({
                        "token_lc": word["token_lc"],
                        "xsampa": word["xsampa"],
                        "morph": word["morph"],
                        "lemma": word["lemma"],
                        "author": poem["p_author"],
                        "year": poem["biblio"]["year"]
                    })

                poem_lines.append(line_words)

        dataset.append(poem_lines)

    return dataset

In [9]:
def get_y(poems_data: list, one_metre: bool = False) -> list:
    """
    Build the y dataset from poems data.
    Each poem becomes a list with one entry per line (lines of all stanzas in order).
    :param poems_data: Poems data
    :param one_metre: If set, the entry of a line is only its first metre, otherwise the list of all its metres
    :return: y dataset, nested as poems -> line entries
    """
    dataset = []

    for poem in poems_data:
        poem_targets = []

        for stanza in poem["body"]:
            for line in stanza:
                line_metres = line["metre"]
                poem_targets.append(line_metres[0] if one_metre else line_metres)

        dataset.append(poem_targets)

    return dataset

In [10]:
def get_tokens(poems_data: list) -> list:
    """
    Collect the lowercase tokens of all words in poems data.
    Each poem becomes a list of its lines (lines of all stanzas in order) and each line
    a list of the lowercase tokens of its words.
    :param poems_data: Poems data
    :return: Lowercase tokens, nested as poems -> lines -> tokens
    """
    poems_tokens = []

    for poem in poems_data:
        poem_lines = []

        for stanza in poem["body"]:
            for line in stanza:
                poem_lines.append([word["token_lc"] for word in line["words"]])

        poems_tokens.append(poem_lines)

    return poems_tokens

In [11]:
def get_sampa_tokens(poems_data: list) -> list:
    """
    Collect the X-SAMPA tokens of all words in poems data.
    Each poem becomes a list of its lines (lines of all stanzas in order) and each line
    a list of the X-SAMPA tokens of its words.
    :param poems_data: Poems data
    :return: X-SAMPA tokens, nested as poems -> lines -> tokens
    """
    poems_sampa_tokens = []

    for poem in poems_data:
        poem_lines = []

        for stanza in poem["body"]:
            for line in stanza:
                poem_lines.append([word["xsampa"] for word in line["words"]])

        poems_sampa_tokens.append(poem_lines)

    return poems_sampa_tokens

## All poems

In [12]:
# Load all data from the Corpus of Czech Verse
all_poems_data = []

# glob returns the paths in arbitrary order; they are sorted so that the order of the poems
# (and with it the result of the seeded shuffle applied later) does not depend on the file system
for file in sorted(glob.glob(os.path.join(CCV_DATA_PATH, "*.json"))):
    # The encoding is given explicitly, otherwise the platform default would be used to decode the Czech texts
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The content of every file is appended to one flat list of poems
    all_poems_data.extend(data)

In [13]:
len(all_poems_data)

66428

In [14]:
all_poems_data[0]

{'p_author': {'born': 1875,
  'died': 1947,
  'name': 'Bogner, Adolf',
  'identity': 'Bogner, Adolf'},
 'biblio': {'motto_aut': None,
  'b_subtitle': 'Verše',
  'publisher': 'Bečvan, Jiří; Zapletal, Prokop',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'Nuže, bratří, zapějte mi píseň,',
  'place': 'Hranice',
  'dedication': None,
  'b_title': 'Dělník zpívá...',
  'pages': '47',
  'year': '1898',
  'signature': 'Národní knihovna ČR, Praha; 54 K 9170'},
 'book_id': '0025',
 'poem_id': '0001-0000-0000-0001-0000',
 'b_author': {'born': 1875,
  'died': 1947,
  'name': 'Bogner, Adolf',
  'identity': 'Bogner, Adolf'},
 'body': [[{'text': 'Nuže, bratří, zapějte mi píseň, ',
    'punct': {'5': ',', '2': ',', '1': ','},
    'words': [{'token_lc': 'nuže',
      'xsampa': 'nuZE',
      'morph': 'TT--------------',
      'phoebe': 'nuZe',
      'token': 'Nuže',
      'lemma': 'nuže'},
     {'token_lc': 'bratří',
      'xsampa': 'bratP\\i:',
      'morph': 'NNMP1-----A---2-',
      'phoebe': '

In [15]:
# Shuffle the poems with the default seed (SEED); shuffle_data reorders the list in place and returns it
all_poems_data = shuffle_data(all_poems_data)

In [16]:
all_poems_data[0]

{'p_author': {'born': 1853,
  'died': 1912,
  'name': 'Vrchlický, Jaroslav',
  'identity': 'Vrchlický, Jaroslav'},
 'biblio': {'motto_aut': None,
  'b_subtitle': 'Padesát obrazů a kreseb Ferdinanda Engelmüllera',
  'publisher': 'Vrchlický, Jaroslav; Leschinger, Edvard',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'PODZIM V PARKU',
  'place': 'Praha',
  'dedication': None,
  'b_title': 'Nálady a pohádky',
  'pages': '107',
  'year': '1902',
  'signature': 'Národní knihovna ČR, Praha; 54 A 683'},
 'book_id': '1048',
 'poem_id': '0001-0001-0001-0003-0000',
 'b_author': {'born': 1853,
  'died': 1912,
  'name': 'Vrchlický, Jaroslav',
  'identity': 'Vrchlický, Jaroslav'},
 'body': [[{'text': 'Chmurná jeseň šerým stínem v starý park se vkrádá,',
    'punct': {'9': ','},
    'words': [{'token_lc': 'chmurná',
      'xsampa': 'xmurna:',
      'morph': 'AAFS1----1A-----',
      'phoebe': 'xmurnA',
      'token': 'Chmurná',
      'lemma': 'chmurný'},
     {'token_lc': 'jeseň',
      'xsampa

In [17]:
data_loader_and_saver.save_data(all_poems_data, "all_poems_data")

Data saved to all_poems_data.json


In [18]:
all_poems_data_X = get_X(all_poems_data)

In [19]:
data_loader_and_saver.save_data(all_poems_data_X, "all_poems_data_X")

Data saved to all_poems_data_X.json


In [20]:
all_poems_data_y = get_y(all_poems_data)

In [21]:
data_loader_and_saver.save_data(all_poems_data_y, "all_poems_data_y")

Data saved to all_poems_data_y.json


## Poems with all metres recognized and without annotation errors

In [16]:
all_poems_data_all_metres_recognized_without_err = []

# For all data from the Corpus of Czech Verse
# (paths are sorted because glob returns them in arbitrary order and the order of the poems
# influences the seeded splits made from this list)
for file in sorted(glob.glob(os.path.join(CCV_DATA_PATH, "*.json"))):
    # The encoding is given explicitly, otherwise the platform default would be used to decode the Czech texts
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for poem in data:
        # Include only poems without annotation errors and with no unknown metres (metre type "N").
        # contains_err is evaluated first, so an annotation error is reported for every poem that has one.
        if not contains_err(poem) and not any(metre["type"] == "N" for stanza in poem["body"] for line in stanza for metre in line["metre"]):
            all_poems_data_all_metres_recognized_without_err.append(poem)

Annotation error, Pattern WSWSWSWSWSW, Num syllables: 10, Text: Tak jako vždy. Lui, Monsieur et Madame.
Annotation error, Pattern WSWSWSWSWSW, Num syllables: 10, Text: Tak jako vždy. Lui, Monsieur et Madame.
Annotation error, Pattern WSWSWSWSWS, Num syllables: 9, Text: A teď ztich’ její šum, už nelze pít,


In [13]:
len(all_poems_data_all_metres_recognized_without_err)

60455

### Compare metrical positions
- '-': radif position
- 'X': unknown position

In [14]:
from collections import Counter


def get_all_metrical_positions(poems_data: list) -> Counter:
    """
    Count the occurrences of every metrical position symbol inside poems data.
    The patterns of all metres of all lines are walked symbol by symbol.
    :param poems_data: Poems data
    :return: Counter mapping each metrical position symbol to the number of its occurrences
    """
    position_counts = Counter()

    for poem in poems_data:
        for stanza in poem['body']:
            for line in stanza:
                for metre in line['metre']:
                    for position in metre['pattern']:
                        position_counts[position] += 1

    return position_counts

In [24]:
metrical_positions_all_poems = get_all_metrical_positions(all_poems_data)
metrical_positions_all_poems

Counter({'S': 9417830, 'W': 9796919, 'V': 353176, 'X': 2340974, '-': 653})

In [16]:
metrical_positions_all_metres_recognized_without_err = get_all_metrical_positions(all_poems_data_all_metres_recognized_without_err)
metrical_positions_all_metres_recognized_without_err

Counter({'S': 9320946, 'W': 9700539, 'V': 351895, '-': 615})

## Poems with just 1 metre, no unknown metres

In [17]:
extension = "_one_metre_all_metres_recognized"

In [18]:
# Subset of the poems without annotation errors and unknown metres in which only one metre type occurs
poems_data_one_metre_all_metres_recognized = []

for poem in all_poems_data_all_metres_recognized_without_err:
    # Distinct metre types found in the metre annotations of all lines of the poem
    poem_metres = {metre['type'] for stanza in poem['body'] for line in stanza for metre in line['metre']}

    # Only add poems with just one metre assigned to all its lines
    if len(poem_metres) == 1:
        poems_data_one_metre_all_metres_recognized.append(poem)

In [19]:
len(poems_data_one_metre_all_metres_recognized)

57339

In [20]:
poems_data_one_metre_all_metres_recognized[0]

{'p_author': {'born': 1875,
  'died': 1947,
  'name': 'Bogner, Adolf',
  'identity': 'Bogner, Adolf'},
 'biblio': {'motto_aut': None,
  'b_subtitle': 'Verše',
  'publisher': 'Bečvan, Jiří; Zapletal, Prokop',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'TUŽBA.',
  'place': 'Hranice',
  'dedication': None,
  'b_title': 'Dělník zpívá...',
  'pages': '47',
  'year': '1898',
  'signature': 'Národní knihovna ČR, Praha; 54 K 9170'},
 'book_id': '0025',
 'poem_id': '0001-0000-0000-0002-0000',
 'b_author': {'born': 1875,
  'died': 1947,
  'name': 'Bogner, Adolf',
  'identity': 'Bogner, Adolf'},
 'body': [[{'text': 'Když k práci chvátám z rána,',
    'punct': {'6': ','},
    'words': [{'token_lc': 'když',
      'xsampa': 'gdIS',
      'morph': 'J,--------------',
      'phoebe': 'gdiS',
      'token': 'Když',
      'lemma': 'když'},
     {'token_lc': 'k',
      'xsampa': 'k',
      'morph': 'RR--3-----------',
      'phoebe': 'k',
      'token': 'k',
      'lemma': 'k'},
     {'token_lc':

In [21]:
# Split the poems into train, dev and test with the default ratios and seed
train_data, dev_data, test_data = split_train_dev_test(poems_data_one_metre_all_metres_recognized)

Train data cnt: 40137, Dev data cnt: 8601, Test data cnt: 8601


In [24]:
data_loader_and_saver.save_all_data(train_data, dev_data, test_data, extension)

Data saved to train_one_metre_all_metres_recognized.json
Data saved to dev_one_metre_all_metres_recognized.json
Data saved to test_one_metre_all_metres_recognized.json


In [25]:
# X datasets (per-word feature dicts) of the three splits
train_X = get_X(train_data)
dev_X = get_X(dev_data)
test_X = get_X(test_data)

In [26]:
data_loader_and_saver.save_all_data(train_X, dev_X, test_X, f"_X{extension}")

Data saved to train_X_one_metre_all_metres_recognized.json
Data saved to dev_X_one_metre_all_metres_recognized.json
Data saved to test_X_one_metre_all_metres_recognized.json


In [27]:
# y datasets of the three splits; with one_metre=True only the first metre of every line is kept
train_y = get_y(train_data, one_metre=True)
dev_y = get_y(dev_data, one_metre=True)
test_y = get_y(test_data, one_metre=True)

In [28]:
data_loader_and_saver.save_all_data(train_y, dev_y, test_y, f"_y{extension}")

Data saved to train_y_one_metre_all_metres_recognized.json
Data saved to dev_y_one_metre_all_metres_recognized.json
Data saved to test_y_one_metre_all_metres_recognized.json


In [29]:
train_tokens = get_tokens(train_data)

In [30]:
data_loader_and_saver.save_data(train_tokens, f"train_tokens{extension}")

Data saved to train_tokens_one_metre_all_metres_recognized.json


In [31]:
train_sampa_tokens = get_sampa_tokens(train_data)

In [32]:
data_loader_and_saver.save_data(train_sampa_tokens, f"train_sampa_tokens{extension}")

Data saved to train_sampa_tokens_one_metre_all_metres_recognized.json


## Poems whose every line has just 1 metre, no unknown metres

In [14]:
extension = "_one_metre_line_all_metres_recognized"

In [15]:
# Subset of the poems without annotation errors and unknown metres in which no line mixes metre types
poems_data_one_metre_line_all_metres_recognized = []

for poem in all_poems_data_all_metres_recognized_without_err:
    # Only add poems that contain no lines with more metres assigned
    # (the metre annotations of each line must share exactly one metre type; the type may differ between lines)
    if all(len({metre['type'] for metre in line['metre']}) == 1 for stanza in poem['body'] for line in stanza):
        poems_data_one_metre_line_all_metres_recognized.append(poem)

In [16]:
len(poems_data_one_metre_line_all_metres_recognized)

59661

In [17]:
poems_data_one_metre_line_all_metres_recognized[0]

{'p_author': {'born': 1875,
  'died': 1947,
  'name': 'Bogner, Adolf',
  'identity': 'Bogner, Adolf'},
 'biblio': {'motto_aut': None,
  'b_subtitle': 'Verše',
  'publisher': 'Bečvan, Jiří; Zapletal, Prokop',
  'edition': '[1.]',
  'motto': None,
  'p_title': 'Nuže, bratří, zapějte mi píseň,',
  'place': 'Hranice',
  'dedication': None,
  'b_title': 'Dělník zpívá...',
  'pages': '47',
  'year': '1898',
  'signature': 'Národní knihovna ČR, Praha; 54 K 9170'},
 'book_id': '0025',
 'poem_id': '0001-0000-0000-0001-0000',
 'b_author': {'born': 1875,
  'died': 1947,
  'name': 'Bogner, Adolf',
  'identity': 'Bogner, Adolf'},
 'body': [[{'text': 'Nuže, bratří, zapějte mi píseň, ',
    'punct': {'5': ',', '2': ',', '1': ','},
    'words': [{'token_lc': 'nuže',
      'xsampa': 'nuZE',
      'morph': 'TT--------------',
      'phoebe': 'nuZe',
      'token': 'Nuže',
      'lemma': 'nuže'},
     {'token_lc': 'bratří',
      'xsampa': 'bratP\\i:',
      'morph': 'NNMP1-----A---2-',
      'phoebe': '

In [18]:
# Split the poems into train, dev and test with the default ratios and seed
train_data, dev_data, test_data = split_train_dev_test(poems_data_one_metre_line_all_metres_recognized)

Train data cnt: 41762, Dev data cnt: 8949, Test data cnt: 8950


In [19]:
data_loader_and_saver.save_all_data(train_data, dev_data, test_data, extension)

Data saved to train_one_metre_line_all_metres_recognized.json
Data saved to dev_one_metre_line_all_metres_recognized.json
Data saved to test_one_metre_line_all_metres_recognized.json


In [20]:
# X datasets (per-word feature dicts) of the three splits
train_X = get_X(train_data)
dev_X = get_X(dev_data)
test_X = get_X(test_data)

In [21]:
data_loader_and_saver.save_all_data(train_X, dev_X, test_X, f"_X{extension}")

Data saved to train_X_one_metre_line_all_metres_recognized.json
Data saved to dev_X_one_metre_line_all_metres_recognized.json
Data saved to test_X_one_metre_line_all_metres_recognized.json


In [22]:
# y datasets of the three splits; with one_metre=True only the first metre of every line is kept
train_y = get_y(train_data, one_metre=True)
dev_y = get_y(dev_data, one_metre=True)
test_y = get_y(test_data, one_metre=True)

In [23]:
data_loader_and_saver.save_all_data(train_y, dev_y, test_y, f"_y{extension}")

Data saved to train_y_one_metre_line_all_metres_recognized.json
Data saved to dev_y_one_metre_line_all_metres_recognized.json
Data saved to test_y_one_metre_line_all_metres_recognized.json


In [24]:
train_tokens = get_tokens(train_data)

In [25]:
data_loader_and_saver.save_data(train_tokens, f"train_tokens{extension}")

Data saved to train_tokens_one_metre_line_all_metres_recognized.json


In [26]:
train_sampa_tokens = get_sampa_tokens(train_data)

In [27]:
data_loader_and_saver.save_data(train_sampa_tokens, f"train_sampa_tokens{extension}")

Data saved to train_sampa_tokens_one_metre_line_all_metres_recognized.json
