# Create token input dataframes

In [1]:
%load_ext dotenv
%dotenv
import os

# WORKING_DIR is read from the process environment (presumably populated by the
# %dotenv magic above -- confirm that the .env file defines it).
base_dir = os.getenv("WORKING_DIR")
if not base_dir:
    # os.getenv returns None for an unset variable and os.chdir(None) raises a
    # TypeError that does not mention the variable; fail with a clear message instead.
    raise RuntimeError(
        "WORKING_DIR is not set; define it in .env or the environment before running this notebook."
    )
# All project-relative paths used later in the notebook are resolved from here.
os.chdir(base_dir)

In [2]:
from src.data_loader_and_saver import JSONDataLoaderAndSaver, CSVDataLoaderAndSaver

# Helper that loads the JSON inputs and helper that saves the token CSV outputs.
# NOTE(review): the directory arguments look relative to base_dir -- confirm in
# src.data_loader_and_saver.
json_data_loader = JSONDataLoaderAndSaver(
    base_dir,
    input_data_dir="src/data",
)
csv_data_saver = CSVDataLoaderAndSaver(
    base_dir,
    index_cols=["Poem idx", "Line idx", "Token idx"],
    output_data_dir="src/bi_lstm_crf/token_input/data",
)

In [3]:
import pandas as pd
import json

from src.kveta.sampa_syllable_parser import SampaSyllableParser
from src.util import Util

# Placeholder written to the "Metre pos" column by get_df for tokens whose
# syllable count is 0, i.e. tokens that consume no part of the line's pattern.
EMPTY_POS = "_"


def get_df(poems_X: list, poems_y: list) -> pd.DataFrame:
    """
    Create token input dataframe from poems X and y dataset.

    Each token of each line becomes one row. The line's normalized metrical
    pattern is consumed left to right: a token with ``n > 0`` syllables receives
    the next ``n`` characters of the pattern, a token with no syllables receives
    ``EMPTY_POS`` and consumes nothing.

    Poems and lines are paired with ``zip``, so if X and y differ in length the
    surplus items of the longer one are ignored.

    :param poems_X: Poems X dataset: a list of poems, each a list of lines, each a
        list of token dicts providing the keys ``xsampa``, ``token_lc``, ``author``,
        ``year``, ``morph`` and ``lemma``
    :param poems_y: Poems y dataset: a list of poems, each a list of line dicts
        providing the key ``pattern``
    :return: Created token input dataframe indexed by (Poem idx, Line idx, Token idx)
        with the columns Token, Author, Year, POS, Lemma and Metre pos; for empty
        input an empty dataframe with the same index levels and columns
    """
    index_cols = ["Poem idx", "Line idx", "Token idx"]
    value_cols = ["Token", "Author", "Year", "POS", "Lemma", "Metre pos"]

    rows = []
    sampa_parser = SampaSyllableParser()

    for poem_idx, (poem_X, poem_y) in enumerate(zip(poems_X, poems_y)):
        for line_idx, (line_X, line_y) in enumerate(zip(poem_X, poem_y)):
            # Part of the line's pattern not yet assigned to a token.
            remaining_pattern = Util.normalize_metrical_pattern(line_y["pattern"])

            for token_idx, token_X in enumerate(line_X):
                syllable_cnt = sampa_parser.get_syllable_cnt(token_X["xsampa"])

                if syllable_cnt > 0:
                    # NOTE(review): if the pattern is shorter than the line's total
                    # syllable count, this slice is silently truncated (possibly to
                    # an empty string) -- confirm the inputs always line up.
                    metre_pos = remaining_pattern[:syllable_cnt]
                    remaining_pattern = remaining_pattern[syllable_cnt:]
                else:
                    metre_pos = EMPTY_POS

                rows.append({
                    "Poem idx": poem_idx,
                    "Line idx": line_idx,
                    "Token idx": token_idx,
                    "Token": token_X["token_lc"],
                    # Author metadata is serialized so it fits into a single cell.
                    "Author": json.dumps(token_X["author"]),
                    "Year": Util.get_year(token_X["year"]),
                    # Only the first element of the morphological tag is kept.
                    "POS": token_X["morph"][0],
                    "Lemma": token_X["lemma"],
                    "Metre pos": metre_pos
                })

    # Passing the columns explicitly keeps the schema stable when there are no
    # rows; without it an empty frame has no columns and set_index raises KeyError.
    token_df = pd.DataFrame(rows, columns=index_cols + value_cols)

    return token_df.set_index(index_cols)

## Every poem has just one metre, no unknown metres

In [5]:
# File-name suffix of this dataset variant; it is appended to the train/dev/test
# names passed to the loader and the saver in the cells of this section.
extension = "_one_metre_all_metres_recognized"

In [6]:
train_X = json_data_loader.load_data(f"train_X{extension}")
train_y = json_data_loader.load_data(f"train_y{extension}")

train_X_one_metre_all_metres_recognized.json: loaded 40137 records.
train_y_one_metre_all_metres_recognized.json: loaded 40137 records.


In [7]:
train_df = get_df(train_X, train_y)
train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Token,Author,Year,POS,Lemma,Metre pos
Poem idx,Line idx,Token idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,teď,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,D,teď,W
0,0,1,co,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,P,co,S
0,0,2,se,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,P,se,W
0,0,3,hrozny,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,N,hrozen,SW
0,0,4,zlatem,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1886.0,N,zlato,SW
...,...,...,...,...,...,...,...,...
40136,70,5,přece,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,D,přece,SW
40136,71,0,chudákova,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,A,chudákův,SWSW
40136,71,1,domku,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,N,domek,SW
40136,71,2,na,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1899.0,R,na,S


In [8]:
train_df.isna().sum()

Token          0
Author         0
Year         191
POS            0
Lemma          0
Metre pos      0
dtype: int64

In [9]:
train_df["POS"].nunique()

11

In [10]:
train_df["POS"].unique()

array(['D', 'P', 'N', 'V', 'R', 'A', 'J', 'C', 'X', 'T', 'I'],
      dtype=object)

In [11]:
train_df["Author"].nunique()

279

In [12]:
train_df["Year"].unique()

array([1886., 1880., 1899., 1884., 1892., 1908., 1823., 1821., 1965.,
       1927., 1934., 1902., 1921., 1844., 1905., 1872., 1883., 1915.,
       1935., 1891., 1827., 1906., 1834., 1936., 1881., 1858., 1926.,
       1904., 1893., 1853., 1903., 1907., 1932., 1984., 1885., 1900.,
       1897., 1962., 1882., 1896., 1888., 1901., 1933., 1938., 1930.,
       1832., 1913., 1894., 1843., 1870., 1873., 1911., 1887., 1875.,
       1944., 1865., 1857., 1928., 1898., 1912., 1874., 1859., 1918.,
       1895., 1862., 1920., 1910., 1931., 1824., 1833., 1950., 1959.,
       1914., 1863., 1919., 1890., 1812., 1889., 1868., 1957., 1946.,
       1925., 1854., 1846., 1852., 1851., 1878., 1917., 1958., 1923.,
       1864., 1922., 1956., 1909., 1822., 1939., 1951., 1948., 1937.,
       1954., 1871., 1879., 1840., 1856., 1940., 1841., 1929., 1942.,
       1916., 1830., 1876., 1941., 1820., 1924., 1836., 1963., 1869.,
       1861., 1807., 1974., 1860., 1866., 1815., 1816., 1806., 1867.,
       1847., 1850.,

In [13]:
train_df["Year"].describe()

count    7.240664e+06
mean     1.897109e+03
std      2.706662e+01
min      1.803000e+03
25%      1.884000e+03
50%      1.897000e+03
75%      1.914000e+03
max      1.984000e+03
Name: Year, dtype: float64

In [14]:
train_df["Token"].nunique()

279392

In [15]:
train_df["Lemma"].nunique()

131925

In [16]:
csv_data_saver.save_data(train_df, f"train{extension}")

Data saved to train_one_metre_all_metres_recognized.csv


In [17]:
dev_X = json_data_loader.load_data(f"dev_X{extension}")
dev_y = json_data_loader.load_data(f"dev_y{extension}")

dev_X_one_metre_all_metres_recognized.json: loaded 8601 records.
dev_y_one_metre_all_metres_recognized.json: loaded 8601 records.


In [18]:
dev_df = get_df(dev_X, dev_y)
dev_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Token,Author,Year,POS,Lemma,Metre pos
Poem idx,Line idx,Token idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,ty,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,P,ten,S
0,0,1,má,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,V,mít,W
0,0,2,růže,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,N,růže,SW
0,0,3,tmavá,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,A,tmavý,SW
0,1,0,jak,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1859,D,jak,S
...,...,...,...,...,...,...,...,...
8600,31,0,já,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,P,já,W
8600,31,1,spokojím,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,V,spokojit,SWS
8600,31,2,se,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,P,se,W
8600,31,3,s,"{""born"": 1853, ""died"": 1912, ""name"": ""Vrchlick...",1875,R,s,_


In [19]:
csv_data_saver.save_data(dev_df, f"dev{extension}")

Data saved to dev_one_metre_all_metres_recognized.csv


In [20]:
test_X = json_data_loader.load_data(f"test_X{extension}")
test_y = json_data_loader.load_data(f"test_y{extension}")

test_X_one_metre_all_metres_recognized.json: loaded 8601 records.
test_y_one_metre_all_metres_recognized.json: loaded 8601 records.


In [21]:
test_df = get_df(test_X, test_y)
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Token,Author,Year,POS,Lemma,Metre pos
Poem idx,Line idx,Token idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,z,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,R,z,_
0,0,1,kasematy,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,N,kasematy,WSWS
0,0,2,vás,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,P,vy,W
0,0,3,věznící,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,A,věznící,SWS
0,1,0,v,"{""born"": 1881, ""died"": 1914, ""name"": ""Gellner,...",1901,R,v,_
...,...,...,...,...,...,...,...,...
8600,6,4,nenadání,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,N,nenadání,SWSW
8600,7,0,v,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,R,v,_
8600,7,1,růži,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,N,růže,SW
8600,7,2,rozvije,"{""born"": 1835, ""died"": 1923, ""name"": ""Heyduk, ...",1885,V,rozvít,SWS


In [22]:
csv_data_saver.save_data(test_df, f"test{extension}")

Data saved to test_one_metre_all_metres_recognized.csv


## Every line has just one metre, no unknown metres

In [4]:
# File-name suffix of this dataset variant; it is appended to the train/dev/test
# names passed to the loader and the saver in the cells of this section.
# NOTE: this rebinds the same `extension` name used by the previous section.
extension = "_one_metre_line_all_metres_recognized"

In [5]:
train_X = json_data_loader.load_data(f"train_X{extension}")
train_y = json_data_loader.load_data(f"train_y{extension}")

train_X_one_metre_line_all_metres_recognized.json: loaded 41762 records.
train_y_one_metre_line_all_metres_recognized.json: loaded 41762 records.


In [6]:
train_df = get_df(train_X, train_y)
train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Token,Author,Year,POS,Lemma,Metre pos
Poem idx,Line idx,Token idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,můj,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,P,můj,S
0,0,1,koníček,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,N,koníček,WSW
0,0,2,vraný,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,A,vraný,SW
0,1,0,jako,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,J,jako,SW
0,1,1,malovaný,"{""born"": 1845, ""died"": 1912, ""name"": ""Sl\u00e1...",1907,A,malovaný,SWSW
...,...,...,...,...,...,...,...,...
41761,3,4,či,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,J,či,S
41761,3,5,že,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,J,že,W
41761,3,6,jsem,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,V,být,S
41761,3,7,řek,"{""born"": 1864, ""died"": 1942, ""name"": ""Machar, ...",1935,V,říci,W


In [7]:
csv_data_saver.save_data(train_df, f"train{extension}")

Data saved to train_one_metre_line_all_metres_recognized.csv


In [8]:
dev_X = json_data_loader.load_data(f"dev_X{extension}")
dev_y = json_data_loader.load_data(f"dev_y{extension}")

dev_X_one_metre_line_all_metres_recognized.json: loaded 8949 records.
dev_y_one_metre_line_all_metres_recognized.json: loaded 8949 records.


In [9]:
dev_df = get_df(dev_X, dev_y)
dev_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Token,Author,Year,POS,Lemma,Metre pos
Poem idx,Line idx,Token idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,cos,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,X,Co´s,W
0,0,1,bože,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,N,bůh,SW
0,0,2,květů,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,N,květ,SW
0,0,3,na,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,R,na,S
0,0,4,té,"{""born"": 1860, ""died"": 1939, ""name"": ""Haasz, J...",1918.0,P,ten,W
...,...,...,...,...,...,...,...,...
8948,10,1,pod,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,R,pod,S
8948,10,2,praporec,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,N,praporec,WSW
8948,10,3,kultury,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,N,kultura,SWS
8948,10,4,zde,"{""born"": 1833, ""died"": 1875, ""name"": ""Pfleger ...",1859.0,D,zde,W


In [10]:
csv_data_saver.save_data(dev_df, f"dev{extension}")

Data saved to dev_one_metre_line_all_metres_recognized.csv


In [11]:
test_X = json_data_loader.load_data(f"test_X{extension}")
test_y = json_data_loader.load_data(f"test_y{extension}")

test_X_one_metre_line_all_metres_recognized.json: loaded 8950 records.
test_y_one_metre_line_all_metres_recognized.json: loaded 8950 records.


In [12]:
test_df = get_df(test_X, test_y)
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Token,Author,Year,POS,Lemma,Metre pos
Poem idx,Line idx,Token idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,co,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,P,co,S
0,0,1,se,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,P,se,W
0,0,2,matka,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,N,matka,SW
0,0,3,nestarala,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,V,starat,SWSW
0,1,0,dlouho,"{""born"": 1855, ""died"": 1934, ""name"": ""Kalus, J...",1882,D,dlouho,SW
...,...,...,...,...,...,...,...,...
8949,33,2,naň,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,P,naň,W
8949,33,3,mohl,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,V,moci,SW
8949,33,4,tváří,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,N,tvář,SW
8949,33,5,v,"{""born"": 1857, ""died"": 1937, ""name"": ""Kysel\u0...",1923,R,v,_


In [13]:
csv_data_saver.save_data(test_df, f"test{extension}")

Data saved to test_one_metre_line_all_metres_recognized.csv
