In [7]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the training arrays from disk. `targets` comes from the file that stores
# the labels as integer class indices (see the conversion cell below), which is
# the variant used for the PyTorch pipeline.
inputs = np.load("data/input_train.npy")
# targets = np.load("data/target_train.npy") # tf
targets = np.load("data/target_train_int.npy") # pytorch
print("Done loading data.")

In [None]:
# Sanity check: dimensions of the loaded feature and label arrays, one per line.
print(inputs.shape, targets.shape, sep="\n")

In [None]:
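# One-off conversion of the one-hot targets to integer class indices.
# To regenerate data/target_train_int.npy, load data/target_train.npy into
# `targets` in the loading cell above, then uncomment and run the lines below.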
# targets = np.argmax(targets, axis=1)
# print(targets.shape)
# np.save("data/target_train_int.npy", targets)

In [None]:
# Convert the NumPy arrays to int64 (torch.long) tensors.
inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Pair every input row with its target and serve them in shuffled mini-batches of 32.
# NOTE(review): no random seed is set in the cells shown here, so the batch
# order (and anything printed from a batch) differs between runs.
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Peek at the targets of one mini-batch to verify what the DataLoader yields.
# Dedicated names are used on purpose: unpacking into `inputs` / `targets`
# would overwrite the NumPy arrays loaded earlier with batch-sized tensors and
# make the tensor-building cell non-re-runnable.
first_batch = next(iter(dataloader), None)  # None when the loader is empty
if first_batch is not None:
    batch_inputs, batch_targets = first_batch
    print(batch_targets)

### read_dep_relations()

In [None]:
from dep_utils import conll_reader

def read_dep_relations(input_files=None):
    """Collect the distinct dependency-relation labels found in CoNLL files.

    Args:
        input_files: Optional iterable of CoNLL file paths to scan. When
            omitted, the train/dev/test splits under ``data/`` are used.

    Returns:
        list: Every distinct ``deprel`` label exactly once, in order of first
        appearance across the files.
    """
    if input_files is None:
        input_files = ['data/train.conll', 'data/dev.conll', 'data/test.conll']

    dep_relations = []
    seen = set()  # constant-time membership test; the list keeps first-seen order
    for input_file in input_files:
        with open(input_file, 'r') as f:
            # Materialize inside the `with` so the reader is fully consumed
            # before the file is closed.
            trees = list(conll_reader(f))
        for tree in trees:
            for edge in tree.deprels.values():
                label = edge.deprel
                if label not in seen:
                    seen.add(label)
                    dep_relations.append(label)

    return dep_relations

# Extract the label inventory from the corpus files and report how many distinct labels it has.
dep_relations = read_dep_relations()
print(len(dep_relations))

In [None]:
# The original extract_training_data.py contains a constant `dep_relations`: a hard-coded
# list of dependency relations. Compare that list with the labels extracted from the CoNLL files.

dep_relations_new = read_dep_relations()

# Bind the legacy list to its own name only. A chained assignment of the form
# `dep_relations_old = dep_relations = [...]` would also replace the extracted
# `dep_relations` from the cell above with this hard-coded list.
dep_relations_old = [
    "tmod", "vmod","csubjpass","rcmod","ccomp","poss","parataxis","appos","dep","iobj","pobj","mwe","quantmod","acomp","number","csubj","root","auxpass","prep","mark","expl","cc","npadvmod","prt","nsubj","advmod","conj","advcl","punct","aux","pcomp","discourse","nsubjpass","predet","cop","possessive","nn","xcomp","preconj","num","amod","dobj","neg","dt","det"]

print(f'len(dep_relations_new) = {len(dep_relations_new)}')
print(f'len(dep_relations_old) = {len(dep_relations_old)}')

# Labels present in both inventories.
intersection = set(dep_relations_new).intersection(dep_relations_old)
print(f'len(intersection) = {len(intersection)}')

In [None]:
# labels that appear only in the extracted (new) list
print(set(dep_relations_new).difference(dep_relations_old))

In [None]:
# labels that appear only in the hard-coded (old) list
print(set(dep_relations_old).difference(dep_relations_new))

## Test the TF version of the extractor

In [1]:
from extract_training_data import FeatureExtractor, get_training_matrices, get_training_instances

WORD_VOCAB_FILE = "data/words.vocab"
POS_VOCAB_FILE = "data/pos.vocab"

# Open both vocabulary files; the handles are handed to FeatureExtractor in
# the next cell. The names are pre-bound so the error path can tell whether
# the first file was already opened.
word_vocab_f = None
pos_vocab_f = None
try:
    word_vocab_f = open(WORD_VOCAB_FILE, "r")
    pos_vocab_f = open(POS_VOCAB_FILE, "r")
except FileNotFoundError:
    # Do not leak the first handle when only the second file is missing.
    if word_vocab_f is not None:
        word_vocab_f.close()
        word_vocab_f = None
    print(
        "Could not find vocabulary files {} and {}".format(
            WORD_VOCAB_FILE, POS_VOCAB_FILE
        )
    )
    # Re-raise instead of swallowing: continuing would only fail later, and
    # less clearly, when the following cell tries to use the missing handles.
    raise

In [2]:
# Build the feature extractor from the two open vocabulary file handles.
# NOTE(review): the handles are never closed in this notebook. If
# FeatureExtractor reads them completely in its constructor they could be
# closed right after this call — confirm against extract_training_data.py.
extractor = FeatureExtractor(word_vocab_f, pos_vocab_f)

In [3]:
# Generate training matrices with the extractor switched to its 'tf' output
# format; only the second returned value (the outputs) is kept.
# NOTE(review): n=10 presumably caps how many sentences are processed (the
# progress bar below counts up to 10) — confirm against get_training_matrices.
with open("data/train.conll", "r") as in_file:
    extractor.output_format = 'tf'
    _, outputs_tf = get_training_matrices(extractor, in_file, n=10)

 90%|█████████ | 9/10 [00:00<00:00, 720.11it/s]


In [5]:
# Number of training instances, then the type and shape of the first one.
print(len(outputs_tf))
first_output_tf = outputs_tf[0]
print(type(first_output_tf))
print(first_output_tf.shape)

378
<class 'numpy.ndarray'>
(79,)


In [9]:
# `outputs_tf` may still be the plain sequence returned by
# get_training_matrices (the in-place np.stack below is disabled), and a list
# has no `.shape`. np.asarray gives the stacked shape without rebinding
# `outputs_tf`, so this cell behaves the same on a fresh kernel and on re-runs.
# outputs_tf = np.stack(outputs_tf)
print(np.asarray(outputs_tf).shape)

(378, 79)


## Test if the pt version from extract_training_data.py is in the correct format

In [11]:
# Targets under test — presumably written by the PyTorch ('pt') output path of
# extract_training_data.py; confirm.
outputs_pt = np.load('data/target_train.npy')
print(outputs_pt.shape)

# Integer-encoded targets saved earlier, used here as the reference to compare against.
outputs_pt_gt = np.load('data/target_train_int.npy')
print(outputs_pt_gt.shape)
# NOTE(review): the recorded output shows (1899270, 1) vs (1899270,): same
# number of rows, but the array under test carries an extra trailing axis.

(1899270, 1)
(1899270,)


In [13]:
# Print the current local date and time, leaving a timestamp in the cell output.
import datetime
print(datetime.datetime.now())

2025-04-15 17:04:55.210554


## Test lab8_solutions.py

Test if the `get_training_instances` function in `lab8_solutions.py` produces the same output as the one in `extract_training_data.py`.

In [3]:
from lab8_solutions import get_training_instances as get_training_instances_lab8
from extract_training_data import get_training_instances as get_training_instances_original
from dep_utils import conll_reader

# Parse the whole training corpus into a list of dependency trees.
with open("data/train.conll", "r") as in_file:
    train_trees = [*conll_reader(in_file)]

# Run both implementations on the same (first) tree so the results are comparable.
first_tree = train_trees[0]
seq_lab8 = get_training_instances_lab8(first_tree)
seq_original = get_training_instances_original(first_tree)

# print(seq_lab8)
# print(seq_original)

In [None]:
# Number of training instances produced by each implementation.
for label, seq in (('seq_lab8', seq_lab8), ('seq_original', seq_original)):
    print(label, len(seq))

seq_lab8 99
seq_original 98


In [5]:
# Show the final instance from each implementation.
for seq in (seq_lab8, seq_original):
    print(seq[-1])

(([0],[],{(20, 17, 'punct'), (5, 3, 'compound'), (5, 20, 'dep'), (15, 14, 'compound'), (20, 22, 'dobj'), (41, 40, 'compound'), (45, 34, 'nsubjpass'), (25, 23, 'case'), (9, 8, 'det'), (28, 30, 'conj'), (9, 10, 'punct'), (45, 48, 'nmod'), (34, 36, 'nmod'), (9, 15, 'nmod'), (34, 42, 'punct'), (20, 27, 'punct'), (15, 11, 'case'), (5, 9, 'nmod'), (45, 32, 'punct'), (48, 47, 'compound'), (9, 7, 'punct'), (41, 39, 'case'), (19, 18, 'amod'), (28, 29, 'cc'), (0, 45, 'root'), (5, 2, 'det'), (22, 21, 'det'), (20, 16, 'punct'), (20, 19, 'nsubj'), (5, 1, 'case'), (34, 33, 'det'), (20, 25, 'nmod'), (12, 13, 'case'), (20, 31, 'punct'), (45, 43, 'auxpass'), (34, 37, 'punct'), (38, 41, 'nmod'), (45, 5, 'nmod'), (45, 49, 'punct'), (34, 38, 'acl'), (36, 35, 'case'), (45, 44, 'advmod'), (5, 4, 'nummod'), (25, 24, 'compound'), (15, 12, 'nmod:poss'), (20, 28, 'dep'), (48, 46, 'case'), (20, 26, 'punct'), (9, 6, 'case')}), ('done', None))
([],[0],{(20, 17, 'punct'), (5, 3, 'compound'), (5, 20, 'dep'), (15, 14

In [8]:
# Size of the first instance from each implementation.
print(len(seq_lab8[0]))
print(len(seq_original[0]))

# Compare instance sizes position by position. Positions beyond the end of
# the original sequence have no counterpart and are skipped.
n_original = len(seq_original)
for i in range(len(seq_lab8)):
    if i >= n_original:
        continue
    assert len(seq_lab8[i]) == len(seq_original[i]), f"len(seq_lab8[{i}]) = {len(seq_lab8[i])}, len(seq_original[{i}]) = {len(seq_original[i])}"

2
2


In [12]:
# For each implementation, show the types of the two components of the first instance.
for seq in (seq_lab8, seq_original):
    first_instance = seq[0]
    print(type(first_instance[0]))
    print(type(first_instance[1]))

<class 'lab8_solutions.State'>
<class 'tuple'>
<class 'extract_training_data.State'>
<class 'tuple'>
