# Create X-SAMPA syllable input dataframes

In [1]:
%load_ext dotenv
%dotenv
import os

base_dir = os.getenv("WORKING_DIR")
os.chdir(base_dir)

In [2]:
from src.data_loader_and_saver import JSONDataLoaderAndSaver, CSVDataLoaderAndSaver

# Loader for the JSON poem datasets; configured with the input directory "src/data".
json_data_loader = JSONDataLoaderAndSaver(base_dir, input_data_dir="src/data")
# Saver for the syllable dataframes; index_cols names the same four index levels
# that get_df sets on the frames it returns.
csv_data_saver = CSVDataLoaderAndSaver(base_dir, index_cols=["Poem idx", "Line idx", "Token idx", "Syllable idx"],
                                       output_data_dir="src/bi_lstm_crf/syllable_input/data")

In [3]:
import pandas as pd
import json

from src.kveta.sampa_syllable_parser import SampaSyllableParser
from src.kveta.syllable_class_parser import SyllableClassParser
from src.util import Util


def get_df(poems_X: list, poems_y: list) -> pd.DataFrame:
    """
    Create X-SAMPA syllable input dataframe from poems X and y dataset.

    One row is emitted per (syllable class, X-SAMPA syllable) pair of every token.
    Tokens for which either sequence is empty contribute no rows, so "Token idx"
    may contain gaps. Metrical positions are consumed from the line pattern in
    order, one per emitted syllable.

    :param poems_X: Poems X dataset (poems -> lines -> tokens; each token is read for
                    "xsampa", "author", "year", "morph" and "lemma")
    :param poems_y: Poems y dataset (poems -> lines; each line is read for "pattern")
    :return: Created X-SAMPA syllable input dataframe indexed by
             ("Poem idx", "Line idx", "Token idx", "Syllable idx"); empty (but with the
             full set of columns and index levels) when no syllable is produced
    :raises IndexError: if a line's normalized pattern has fewer positions than the
                        number of syllables emitted for that line
    """
    index_cols = ["Poem idx", "Line idx", "Token idx", "Syllable idx"]
    value_cols = ["Syllable class", "Sampa syllable", "Author", "Year", "POS", "Metre pos", "Lemma"]

    records = []
    sampa_parser = SampaSyllableParser()
    syllable_cls_parser = SyllableClassParser()

    for poem_idx, (poem_X, poem_y) in enumerate(zip(poems_X, poems_y)):
        for line_idx, (line_X, line_y) in enumerate(zip(poem_X, poem_y)):
            line_sonority_peaks = sampa_parser.parse_line(line_X)
            line_metre_pattern = Util.normalize_metrical_pattern(line_y["pattern"])
            line_syllable_classes = syllable_cls_parser.parse_line(line_sonority_peaks, line_X)
            # Position in the line's metrical pattern; advances once per emitted syllable.
            metre_idx = 0

            for token_idx, (token_syllable_classes, token_X) in enumerate(zip(line_syllable_classes, line_X)):
                sampa_syllables = sampa_parser.get_syllables(token_X["xsampa"])
                syllable_pairs = list(zip(token_syllable_classes, sampa_syllables))
                if not syllable_pairs:
                    # No syllables: emit nothing and do not touch the token's metadata,
                    # so tokens lacking e.g. a "morph" tag are still tolerated.
                    continue

                # Token-level values are identical for all syllables of the token;
                # compute them once instead of once per syllable.
                author = json.dumps(token_X["author"])
                year = Util.get_year(token_X["year"])
                pos = token_X["morph"][0]
                lemma = token_X["lemma"]

                for syll_idx, (syllable_class, sampa_syllable) in enumerate(syllable_pairs):
                    records.append({
                        "Poem idx": poem_idx,
                        "Line idx": line_idx,
                        "Token idx": token_idx,
                        "Syllable idx": syll_idx,
                        "Syllable class": syllable_class,
                        "Sampa syllable": sampa_syllable,
                        "Author": author,
                        "Year": year,
                        "POS": pos,
                        "Metre pos": line_metre_pattern[metre_idx],
                        "Lemma": lemma
                    })
                    metre_idx += 1

    # Declaring the columns explicitly keeps set_index working when `records` is empty.
    return pd.DataFrame(records, columns=index_cols + value_cols).set_index(index_cols)

## All poems have just one metre, no unknown metres

In [4]:
# Filename suffix shared by the train/dev/test JSON inputs and CSV outputs of this section.
extension = "_one_metre_all_metres_recognized"

In [5]:
# Load the training X and y poem datasets for the current `extension`.
train_X = json_data_loader.load_data(f"train_X{extension}")
train_y = json_data_loader.load_data(f"train_y{extension}")

train_X_one_metre_all_metres_recognized.json: loaded 40137 records.
train_y_one_metre_all_metres_recognized.json: loaded 40137 records.


In [6]:
# Build the per-syllable dataframe for the training split and display it.
train_df = get_df(train_X, train_y)
train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Syllable class,Sampa syllable,Author,Year,POS,Metre pos,Lemma
Poem idx,Line idx,Token idx,Syllable idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,"SyllableClass(initial=True, final=True, conten...",tEc,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,D,W,teď
0,0,1,0,"SyllableClass(initial=True, final=True, conten...",t_so,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,P,S,co
0,0,2,0,"SyllableClass(initial=True, final=True, conten...",sE,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,P,W,se
0,0,3,0,"SyllableClass(initial=True, final=False, conte...",h\ro,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,N,S,hrozen
0,0,3,1,"SyllableClass(initial=False, final=True, conte...",znI,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,N,W,hrozen
...,...,...,...,...,...,...,...,...,...,...
40136,71,1,1,"SyllableClass(initial=False, final=True, conte...",mku,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,N,W,domek
40136,71,2,0,"SyllableClass(initial=True, final=True, conten...",na,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,R,S,na
40136,71,3,0,"SyllableClass(initial=True, final=False, conte...",u:,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,N,W,údolí
40136,71,3,1,"SyllableClass(initial=False, final=False, cont...",do,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,N,S,údolí


In [7]:
# Number of missing values in each column.
train_df.isna().sum()

Syllable class      0
Sampa syllable      0
Author              0
Year              330
POS                 0
Metre pos           0
Lemma               0
dtype: int64

In [8]:
# Number of distinct POS values (get_df stores the first character of each token's "morph" tag).
train_df["POS"].nunique()

11

In [9]:
# The distinct POS values themselves.
train_df["POS"].unique()

array(['D', 'P', 'N', 'V', 'R', 'A', 'J', 'C', 'X', 'T', 'I'],
      dtype=object)

In [10]:
# Number of distinct authors (the Author column holds the JSON-serialized author record).
train_df["Author"].nunique()

279

In [11]:
# Distinct values of the Year column.
train_df["Year"].unique()

array([1886., 1880., 1899., 1884., 1892., 1908., 1823., 1821., 1965.,
       1927., 1934., 1902., 1921., 1844., 1905., 1872., 1883., 1915.,
       1935., 1891., 1827., 1906., 1834., 1936., 1881., 1858., 1926.,
       1904., 1893., 1853., 1903., 1907., 1932., 1984., 1885., 1900.,
       1897., 1962., 1882., 1896., 1888., 1901., 1933., 1938., 1930.,
       1832., 1913., 1894., 1843., 1870., 1873., 1911., 1887., 1875.,
       1944., 1865., 1857., 1928., 1898., 1912., 1874., 1859., 1918.,
       1895., 1862., 1920., 1910., 1931., 1824., 1833., 1950., 1959.,
       1914., 1863., 1919., 1890., 1812., 1889., 1868., 1957., 1946.,
       1925., 1854., 1846., 1852., 1851., 1878., 1917., 1958., 1923.,
       1864., 1922., 1956., 1909., 1822., 1939., 1951., 1948., 1937.,
       1954., 1871., 1879., 1840., 1856., 1940., 1841., 1929., 1942.,
       1916., 1830., 1876., 1941., 1820., 1924., 1836., 1963., 1869.,
       1861., 1807., 1974., 1860., 1866., 1815., 1816., 1806., 1867.,
       1847., 1850.,

In [12]:
# Summary statistics of the Year column (missing years are not included in the count).
train_df["Year"].describe()

count    1.240049e+07
mean     1.896801e+03
std      2.755570e+01
min      1.803000e+03
25%      1.883000e+03
50%      1.897000e+03
75%      1.914000e+03
max      1.984000e+03
Name: Year, dtype: float64

In [13]:
# Number of distinct X-SAMPA syllables.
train_df["Sampa syllable"].nunique()

17197

In [14]:
# Number of distinct lemmas.
train_df["Lemma"].nunique()

131910

In [15]:
# Persist the training dataframe through the CSV saver under the name "train" + extension.
csv_data_saver.save_data(train_df, f"train{extension}")

Data saved to train_one_metre_all_metres_recognized.csv


In [7]:
# Load the dev X and y poem datasets for the current `extension`.
dev_X = json_data_loader.load_data(f"dev_X{extension}")
dev_y = json_data_loader.load_data(f"dev_y{extension}")

dev_X_one_metre_all_metres_recognized.json: loaded 8601 records.
dev_y_one_metre_all_metres_recognized.json: loaded 8601 records.


In [8]:
# Build the per-syllable dataframe for the dev split and display it.
dev_df = get_df(dev_X, dev_y)
dev_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Syllable class,Sampa syllable,Author,Year,POS,Metre pos,Lemma
Poem idx,Line idx,Token idx,Syllable idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,"SyllableClass(initial=True, final=True, conten...",tI,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,P,S,ten
0,0,1,0,"SyllableClass(initial=True, final=True, conten...",ma:,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,V,W,mít
0,0,2,0,"SyllableClass(initial=True, final=False, conte...",ru:,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,N,S,růže
0,0,2,1,"SyllableClass(initial=False, final=True, conte...",ZE,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,N,W,růže
0,0,3,0,"SyllableClass(initial=True, final=False, conte...",tma,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,A,S,tmavý
...,...,...,...,...,...,...,...,...,...,...
8600,31,2,0,"SyllableClass(initial=True, final=True, conten...",sE,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,P,W,se
8600,31,4,0,"SyllableClass(initial=True, final=False, conte...",I,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,X,S,immortelou
8600,31,4,1,"SyllableClass(initial=False, final=False, cont...",mo,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,X,W,immortelou
8600,31,4,2,"SyllableClass(initial=False, final=False, cont...",rtE,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,X,S,immortelou


In [9]:
# Persist the dev dataframe through the CSV saver under the name "dev" + extension.
csv_data_saver.save_data(dev_df, f"dev{extension}")

Data saved to dev_one_metre_all_metres_recognized.csv


In [10]:
# Load the test X and y poem datasets for the current `extension`.
test_X = json_data_loader.load_data(f"test_X{extension}")
test_y = json_data_loader.load_data(f"test_y{extension}")

test_X_one_metre_all_metres_recognized.json: loaded 8601 records.
test_y_one_metre_all_metres_recognized.json: loaded 8601 records.


In [11]:
# Build the per-syllable dataframe for the test split and display it.
test_df = get_df(test_X, test_y)
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Syllable class,Sampa syllable,Author,Year,POS,Metre pos,Lemma
Poem idx,Line idx,Token idx,Syllable idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1,0,"SyllableClass(initial=True, final=False, conte...",ka,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,N,W,kasematy
0,0,1,1,"SyllableClass(initial=False, final=False, cont...",sE,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,N,S,kasematy
0,0,1,2,"SyllableClass(initial=False, final=False, cont...",ma,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,N,W,kasematy
0,0,1,3,"SyllableClass(initial=False, final=True, conte...",tI,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,N,S,kasematy
0,0,2,0,"SyllableClass(initial=True, final=True, conten...",va:s,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,P,W,vy
...,...,...,...,...,...,...,...,...,...,...
8600,7,1,1,"SyllableClass(initial=False, final=True, conte...",ZI,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,N,W,růže
8600,7,2,0,"SyllableClass(initial=True, final=False, conte...",ro,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,V,S,rozvít
8600,7,2,1,"SyllableClass(initial=False, final=False, cont...",zvI,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,V,W,rozvít
8600,7,2,2,"SyllableClass(initial=False, final=True, conte...",jE,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,V,S,rozvít


In [12]:
# Persist the test dataframe through the CSV saver under the name "test" + extension.
csv_data_saver.save_data(test_df, f"test{extension}")

Data saved to test_one_metre_all_metres_recognized.csv


## All lines have just one metre, no unknown metres

In [6]:
# Switch to the second dataset variant; every cell below reuses the same variable
# names (train_X, train_df, ...) and overwrites the values from the previous section.
extension = "_one_metre_line_all_metres_recognized"

In [5]:
# Load the training X and y poem datasets for the current `extension`.
train_X = json_data_loader.load_data(f"train_X{extension}")
train_y = json_data_loader.load_data(f"train_y{extension}")

train_X_one_metre_line_all_metres_recognized.json: loaded 41762 records.
train_y_one_metre_line_all_metres_recognized.json: loaded 41762 records.


In [6]:
# Build the per-syllable dataframe for the training split and display it.
train_df = get_df(train_X, train_y)
train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Syllable class,Sampa syllable,Author,Year,POS,Metre pos,Lemma
Poem idx,Line idx,Token idx,Syllable idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,"SyllableClass(initial=True, final=True, conten...",mu:j,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,P,S,můj
0,0,1,0,"SyllableClass(initial=True, final=False, conte...",ko,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,N,W,koníček
0,0,1,1,"SyllableClass(initial=False, final=False, cont...",Ji:,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,N,S,koníček
0,0,1,2,"SyllableClass(initial=False, final=True, conte...",t_SEk,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,N,W,koníček
0,0,2,0,"SyllableClass(initial=True, final=False, conte...",vra,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,A,S,vraný
...,...,...,...,...,...,...,...,...,...,...
41761,3,4,0,"SyllableClass(initial=True, final=True, conten...",t_SI,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,J,S,či
41761,3,5,0,"SyllableClass(initial=True, final=True, conten...",ZE,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,J,W,že
41761,3,6,0,"SyllableClass(initial=True, final=True, conten...",jsEm,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,V,S,být
41761,3,7,0,"SyllableClass(initial=True, final=True, conten...",P\Ek,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,V,W,říci


In [7]:
# Persist the training dataframe through the CSV saver under the name "train" + extension.
csv_data_saver.save_data(train_df, f"train{extension}")

Data saved to train_one_metre_line_all_metres_recognized.csv


In [7]:
# Load the dev X and y poem datasets for the current `extension`.
dev_X = json_data_loader.load_data(f"dev_X{extension}")
dev_y = json_data_loader.load_data(f"dev_y{extension}")

dev_X_one_metre_line_all_metres_recognized.json: loaded 8949 records.
dev_y_one_metre_line_all_metres_recognized.json: loaded 8949 records.


In [8]:
# Build the per-syllable dataframe for the dev split and display it.
dev_df = get_df(dev_X, dev_y)
dev_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Syllable class,Sampa syllable,Author,Year,POS,Metre pos,Lemma
Poem idx,Line idx,Token idx,Syllable idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,"SyllableClass(initial=True, final=True, conten...",t_sos,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,X,W,Co´s
0,0,1,0,"SyllableClass(initial=True, final=False, conte...",bo,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,N,S,bůh
0,0,1,1,"SyllableClass(initial=False, final=True, conte...",ZE,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,N,W,bůh
0,0,2,0,"SyllableClass(initial=True, final=False, conte...",kvjE,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,N,S,květ
0,0,2,1,"SyllableClass(initial=False, final=True, conte...",tu:,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,N,W,květ
...,...,...,...,...,...,...,...,...,...,...
8948,10,3,1,"SyllableClass(initial=False, final=False, cont...",ltu,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,N,W,kultura
8948,10,3,2,"SyllableClass(initial=False, final=True, conte...",rI,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,N,S,kultura
8948,10,4,0,"SyllableClass(initial=True, final=True, conten...",zdE,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,D,W,zde
8948,10,5,0,"SyllableClass(initial=True, final=False, conte...",fsto_u,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,V,S,vstoupit


In [9]:
# Persist the dev dataframe through the CSV saver under the name "dev" + extension.
csv_data_saver.save_data(dev_df, f"dev{extension}")

Data saved to dev_one_metre_line_all_metres_recognized.csv


In [10]:
# Load the test X and y poem datasets for the current `extension`.
test_X = json_data_loader.load_data(f"test_X{extension}")
test_y = json_data_loader.load_data(f"test_y{extension}")

test_X_one_metre_line_all_metres_recognized.json: loaded 8950 records.
test_y_one_metre_line_all_metres_recognized.json: loaded 8950 records.


In [11]:
# Build the per-syllable dataframe for the test split and display it.
test_df = get_df(test_X, test_y)
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Syllable class,Sampa syllable,Author,Year,POS,Metre pos,Lemma
Poem idx,Line idx,Token idx,Syllable idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,"SyllableClass(initial=True, final=True, conten...",t_so,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,P,S,co
0,0,1,0,"SyllableClass(initial=True, final=True, conten...",sE,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,P,W,se
0,0,2,0,"SyllableClass(initial=True, final=False, conte...",ma,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,N,S,matka
0,0,2,1,"SyllableClass(initial=False, final=True, conte...",tka,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,N,W,matka
0,0,3,0,"SyllableClass(initial=True, final=False, conte...",nE,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,V,S,starat
...,...,...,...,...,...,...,...,...,...,...
8949,33,3,0,"SyllableClass(initial=True, final=False, conte...",mo,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,V,S,moci
8949,33,3,1,"SyllableClass(initial=False, final=True, conte...",h\l=,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,V,W,moci
8949,33,4,0,"SyllableClass(initial=True, final=False, conte...",tva:,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,N,S,tvář
8949,33,4,1,"SyllableClass(initial=False, final=True, conte...",P\i:,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,N,W,tvář


In [12]:
# Persist the test dataframe through the CSV saver under the name "test" + extension.
csv_data_saver.save_data(test_df, f"test{extension}")

Data saved to test_one_metre_line_all_metres_recognized.csv
