# Create token input CoNLL datasets for the BiLSTM-CRF model

In [1]:
%load_ext dotenv
%dotenv
import os

base_dir = os.getenv("WORKING_DIR")
os.chdir(base_dir)

In [2]:
from src.data_loader_and_saver import CSVDataLoaderAndSaver

# Shared loader used by every section below to read the train/dev/test CSVs.
# The three index columns identify a token by poem, line and position
# (presumably used as a MultiIndex - confirm in CSVDataLoaderAndSaver).
data_loader = CSVDataLoaderAndSaver(
    base_dir,
    input_data_dir="src/bi_lstm_crf/token_input/data",
    index_cols=["Poem idx", "Line idx", "Token idx"],
)

In [3]:
from src.bi_lstm_crf.conll_data_saver import CoNLLDataSaver

## All poems just 1 metre, no unknown metres

In [4]:
# Dataset-variant suffix: it is passed to the loader (the loaded files are
# named train/dev/test + this suffix + ".csv") and appended to the output path.
extension = "_one_metre_all_metres_recognized"

In [5]:
# Saver targeting the data folder of the emnlp2017-bilstm-cnn-crf resources
# for this dataset variant (path presumably relative to base_dir - confirm).
conll_saver = CoNLLDataSaver(
    base_dir,
    f"resources/emnlp2017-bilstm-cnn-crf/data/ccv_tokens{extension}",
)

In [6]:
# Load the train, dev and test splits for the selected variant; the loader
# prints the number of records read from each CSV.
train_df, dev_df, test_df = data_loader.load_all_data(extension)

train_one_metre_all_metres_recognized.csv: loaded 7240855 records.
dev_one_metre_all_metres_recognized.csv: loaded 1554844 records.
test_one_metre_all_metres_recognized.csv: loaded 1606108 records.


In [7]:
# Write all three splits through the saver; the cell output reports
# train.txt, dev.txt and test.txt being written.
conll_saver.save_all_data(train_df, dev_df, test_df)

CoNLL dataset saved to train.txt
CoNLL dataset saved to dev.txt
CoNLL dataset saved to test.txt


## All poems just 1 metre, no unknown metres, poem as line

In [4]:
# The input CSVs are the same variant as in the previous section; only the
# output location differs, marked by the extra "_poem_line" suffix.
input_extension = "_one_metre_all_metres_recognized"
output_extension = f"{input_extension}_poem_line"

In [5]:
# Saver for the "poem as line" variant. poem_line=True presumably makes the
# saver emit each poem as one sequence - confirm in CoNLLDataSaver.
conll_saver = CoNLLDataSaver(
    base_dir,
    f"resources/emnlp2017-bilstm-cnn-crf/data/ccv_tokens{output_extension}",
    poem_line=True,
)

In [6]:
# Load the train, dev and test splits for the input variant.
# NOTE(review): these are the same files the previous section loaded; the
# reload could be skipped if save_all_data leaves the frames untouched - confirm.
train_df, dev_df, test_df = data_loader.load_all_data(input_extension)

train_one_metre_all_metres_recognized.csv: loaded 7240855 records.
dev_one_metre_all_metres_recognized.csv: loaded 1554844 records.
test_one_metre_all_metres_recognized.csv: loaded 1606108 records.


In [7]:
# Write all three splits through the poem-as-line saver; the cell output
# reports train.txt, dev.txt and test.txt being written.
conll_saver.save_all_data(train_df, dev_df, test_df)

CoNLL dataset saved to train.txt
CoNLL dataset saved to dev.txt
CoNLL dataset saved to test.txt


## All lines just 1 metre, no unknown metres

In [12]:
# Dataset-variant suffix for the line-level filtering: it is passed to the
# loader (files are named train/dev/test + this suffix + ".csv") and appended
# to the output path.
extension = "_one_metre_line_all_metres_recognized"

In [13]:
# Saver targeting the data folder of the emnlp2017-bilstm-cnn-crf resources
# for the line-level variant (path presumably relative to base_dir - confirm).
conll_saver = CoNLLDataSaver(
    base_dir,
    f"resources/emnlp2017-bilstm-cnn-crf/data/ccv_tokens{extension}",
)

In [14]:
# Load the train, dev and test splits for the line-level variant; the loader
# prints the number of records read from each CSV.
train_df, dev_df, test_df = data_loader.load_all_data(extension)

train_one_metre_line_all_metres_recognized.csv: loaded 7823727 records.
dev_one_metre_line_all_metres_recognized.csv: loaded 1730053 records.
test_one_metre_line_all_metres_recognized.csv: loaded 1646197 records.


In [15]:
# Write all three splits through the saver; the cell output reports
# train.txt, dev.txt and test.txt being written.
conll_saver.save_all_data(train_df, dev_df, test_df)

CoNLL dataset saved to train.txt
CoNLL dataset saved to dev.txt
CoNLL dataset saved to test.txt


## All lines just 1 metre, no unknown metres, poem as line

In [16]:
# The input CSVs are the same line-level variant as in the previous section;
# only the output location differs, marked by the extra "_poem_line" suffix.
input_extension = "_one_metre_line_all_metres_recognized"
output_extension = f"{input_extension}_poem_line"

In [17]:
# Saver for the line-level "poem as line" variant. poem_line=True presumably
# makes the saver emit each poem as one sequence - confirm in CoNLLDataSaver.
conll_saver = CoNLLDataSaver(
    base_dir,
    f"resources/emnlp2017-bilstm-cnn-crf/data/ccv_tokens{output_extension}",
    poem_line=True,
)

In [18]:
# Load the train, dev and test splits for the input variant.
# NOTE(review): these are the same files the previous section loaded; the
# reload could be skipped if save_all_data leaves the frames untouched - confirm.
train_df, dev_df, test_df = data_loader.load_all_data(input_extension)

train_one_metre_line_all_metres_recognized.csv: loaded 7823727 records.
dev_one_metre_line_all_metres_recognized.csv: loaded 1730053 records.
test_one_metre_line_all_metres_recognized.csv: loaded 1646197 records.


In [19]:
# Write all three splits through the poem-as-line saver; the cell output
# reports train.txt, dev.txt and test.txt being written.
conll_saver.save_all_data(train_df, dev_df, test_df)

CoNLL dataset saved to train.txt
CoNLL dataset saved to dev.txt
CoNLL dataset saved to test.txt
