In [7]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the pre-extracted training inputs and targets from disk.
inputs = np.load("data/input_train.npy")
# TensorFlow variant of the targets, kept for reference:
# targets = np.load("data/target_train.npy") # tf
targets = np.load("data/target_train_int.npy") # pytorch: integer-format targets (see file name)
print("Done loading data.")

In [None]:
# Report the dimensions of the loaded inputs and targets, one per line.
print(inputs.shape, targets.shape, sep="\n")

In [None]:
# One-off conversion, kept commented out for reference: convert one-hot format
# targets to integer format and save them as data/target_train_int.npy
# targets = np.argmax(targets, axis=1)
# print(targets.shape)
# np.save("data/target_train_int.npy", targets)

In [None]:
# Mini-batch size for the DataLoader below; change it here rather than inline.
BATCH_SIZE = 32

# Copy the NumPy arrays into int64 (torch.long) tensors.
inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Pair each input row with its target and serve them in shuffled mini-batches.
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Sanity check: print the targets of a single (shuffled) batch.
# Batch-local names are used on purpose: unpacking into `inputs`/`targets`
# would overwrite the NumPy arrays loaded earlier, so re-running the tensor
# cell afterwards would build the dataset from one batch instead of the
# full training set.
for batch_inputs, batch_targets in dataloader:
    print(batch_targets)
    break  # only the first batch is needed

### read_dep_relations()

In [None]:
from dep_utils import conll_reader

def read_dep_relations(input_files=None):
    """Collect the distinct dependency relation labels used in CoNLL files.

    Args:
        input_files: Iterable of paths to CoNLL files. When omitted, the
            train/dev/test files under ``data/`` are read (the original
            hard-coded behaviour).

    Returns:
        A list of the distinct ``deprel`` values, in order of first
        appearance across the files.

    Raises:
        FileNotFoundError: If one of the input files does not exist.
    """
    if input_files is None:
        input_files = ['data/train.conll', 'data/dev.conll', 'data/test.conll']

    dep_relations = []
    # Companion set for O(1) membership tests; the list preserves
    # first-seen order. Assumes the labels are hashable (e.g. strings).
    seen = set()

    for input_file in input_files:
        with open(input_file, 'r') as f:
            # Consume the reader while the file is still open, one tree at a
            # time, instead of materialising every tree of the file first.
            for tree in conll_reader(f):
                for edge in tree.deprels.values():
                    if edge.deprel not in seen:
                        seen.add(edge.deprel)
                        dep_relations.append(edge.deprel)

    return dep_relations

# Collect the relation labels from the CoNLL files and report how many distinct ones were found.
dep_relations = read_dep_relations()
print(len(dep_relations))

In [None]:
# The original extract_training_data.py contains a constant `dep_relations`,
# a hard-coded list of dependency relations. Compare that list with the
# relations actually extracted from the CoNLL files.

dep_relations_new = read_dep_relations()

# Bound to `dep_relations_old` only. The pasted form
# `dep_relations_old = dep_relations = [...]` also rebound `dep_relations`,
# silently replacing the list extracted from the data in the previous cell.
dep_relations_old = [
    "tmod", "vmod", "csubjpass", "rcmod", "ccomp", "poss", "parataxis",
    "appos", "dep", "iobj", "pobj", "mwe", "quantmod", "acomp", "number",
    "csubj", "root", "auxpass", "prep", "mark", "expl", "cc", "npadvmod",
    "prt", "nsubj", "advmod", "conj", "advcl", "punct", "aux", "pcomp",
    "discourse", "nsubjpass", "predet", "cop", "possessive", "nn", "xcomp",
    "preconj", "num", "amod", "dobj", "neg", "dt", "det",
]

print(f'len(dep_relations_new) = {len(dep_relations_new)}')
print(f'len(dep_relations_old) = {len(dep_relations_old)}')

# Labels present in both the extracted and the hard-coded list.
intersection = set(dep_relations_new).intersection(dep_relations_old)
print(f'len(intersection) = {len(intersection)}')

In [None]:
# Relations extracted from the data files that the hard-coded list lacks.
print(set(dep_relations_new).difference(set(dep_relations_old)))

In [None]:
# Relations in the hard-coded list that never occur in the data files.
print(set(dep_relations_old).difference(set(dep_relations_new)))

## Test the TF version of the extractor

In [1]:
from extract_training_data import FeatureExtractor, get_training_matrices, get_training_instances

WORD_VOCAB_FILE = "data/words.vocab"
POS_VOCAB_FILE = "data/pos.vocab"

# Both handles are intentionally left open on success: the next cell passes
# them to FeatureExtractor.
try:
    word_vocab_f = open(WORD_VOCAB_FILE, "r")
    try:
        pos_vocab_f = open(POS_VOCAB_FILE, "r")
    except FileNotFoundError:
        # Only the second file is missing: do not leak the first handle.
        word_vocab_f.close()
        raise
except FileNotFoundError:
    print(
        "Could not find vocabulary files {} and {}".format(
            WORD_VOCAB_FILE, POS_VOCAB_FILE
        )
    )
    # Re-raise so execution stops here; swallowing the error would only
    # defer the failure to a NameError when the handles are first used.
    raise

In [2]:
# Build the feature extractor from the word and POS vocabulary file handles opened above.
extractor = FeatureExtractor(word_vocab_f, pos_vocab_f)

In [3]:
# Run the extractor in 'tf' output mode on the training file and keep only the
# second returned value (the outputs); the first one is discarded.
# n=10 is forwarded to get_training_matrices — presumably a cap on how many
# sentences are processed (TODO confirm against extract_training_data.py).
with open("data/train.conll", "r") as in_file:
    extractor.output_format = 'tf'
    _, outputs_tf = get_training_matrices(extractor, in_file, n=10)

 90%|█████████ | 9/10 [00:00<00:00, 720.11it/s]


In [5]:
# Inspect the extracted outputs: number of entries, type of the first entry,
# and the shape of that first entry (one value per line).
print(len(outputs_tf), type(outputs_tf[0]), outputs_tf[0].shape, sep="\n")

378
<class 'numpy.ndarray'>
(79,)


In [9]:
# np.shape() accepts an ndarray as well as a list of equally-shaped arrays, so
# this cell does not depend on a manual `np.stack(outputs_tf)` having been run
# beforehand (that step was previously left here commented out).
print(np.shape(outputs_tf))

(378, 79)


## Test whether the pt version from extract_training_data.py is in the correct format

In [11]:
# Targets under test (per the section heading, the pt output of extract_training_data.py).
outputs_pt = np.load('data/target_train.npy')
print(outputs_pt.shape)

# Integer-format targets, presumably the ground-truth ("gt") reference — TODO confirm.
outputs_pt_gt = np.load('data/target_train_int.npy')
print(outputs_pt_gt.shape)
# NOTE(review): the recorded output shows (1899270, 1) vs (1899270,), i.e. the two
# arrays differ by a trailing axis — confirm whether a squeeze/reshape is needed.

(1899270, 1)
(1899270,)


In [13]:
# Print the current local date and time (presumably a record of when the notebook was last run).
import datetime
print(datetime.datetime.now())

2025-04-15 17:04:55.210554
