In [36]:
import torch
import sklearn
import itertools
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

In [103]:
class Dataset(torch.utils.data.Dataset):

    # The var segmented_word will be a list of segmented words ex. un@bound@ed
    def __init__(self, segmented_words):
        self._segmented_words = segmented_words
        self._items = {}

        # Loops over each segmented word.
        for w in segmented_words:
            # Splits the word by @ and returns a list of morphemes: "un@bound@ed" ---> ['un', 'bound', 'ed']
            segments = w.split("@")

            # Var word will represent the unsegmented word.
            word = "".join(segments)

            # in between characters, there are len-1 posibilities.
            # Creates a list of char lengs for each morpheme.
            segment_lens = [len(s) for s in segments]

            # This keeps track of where the gold segmentation indexes are in the string.
            # ['un', 'bound', 'ed'] ---> {2, 7}. A word bound could be inserted at either idx
            # 2 or 7 for a correct label.
            hyphen_pos = set(itertools.accumulate(segment_lens[:-1]))


            # For instance: word == "unbounded", range is 1, 9.
            for i in range(1, len(word)):
                text = f"{word} {word_separator_token} {word[:i]}{morph_boundary_token}{word[i:]}"

                # Checks if current value of i(1-len(word)) (representing current segmentation in text)
                # is a valid segmentation in comparison to the gold standard.
                # Returns 0 if False.
                # Returns 1 if True.
                label = int(i in hyphen_pos)
                
                value_dict = {"text": text, "label": label}
                value_dict.update(tokenizer(text))
                # Each item will be one word2example segmentation, label, attention mask 
                self._items[len(self._items)] = value_dict
        super()

    def get_class_weights(self):
        weights = sklearn.utils.class_weight.compute_class_weight(
            "balanced", classes=np.array([0, 1]), y=[i["label"] for i in self._items.values()]
        )
        return weights

    def __len__(self):
        return len(self._items)

    def __getitem__(self, idx):
        return self._items[idx]

In [104]:
path2data_dir = "/home/mathias/Desktop/HI/hpc/inuktitut/llm_segm/reimplementation/data/"
#path2out_dir = "/home/mathias/Desktop/HI/hpc/inuktitut/llm_segm/reimplementation/out/"
lang = "iu"
model_name = "cis-lmu/glot500-base"

In [52]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
word_separator_token = "<__word-separator>"
morph_boundary_token = "<__morph-boundary>"
assert tokenizer.add_tokens([morph_boundary_token, word_separator_token])

In [105]:
data_train = pd.read_csv(path2data_dir + "train.csv", sep="\t", header=None)
#data_train = pd.read_csv(path2data_dir + "train_small.csv", sep="\t", header=None)

In [106]:
dataset_train = Dataset(data_train[1].to_list())

In [107]:
len(dataset_train)

423773

In [108]:
y = [i["label"] for i in dataset_train._items.values()]

In [111]:
class_weights = dataset_train.get_class_weights()

In [112]:
class_weights

array([0.84367186, 1.22743809])

In [102]:
dataset_train._items

{0: {'text': 'ᑕᕝᕙᐅᖏᒻᒪᑦ <__word-separator> ᑕ<__morph-boundary>ᕝᕙᐅᖏᒻᒪᑦ',
  'label': 0,
  'input_ids': [0,
   293436,
   251953,
   252233,
   254839,
   6,
   401146,
   255525,
   401145,
   6,
   338055,
   251953,
   252233,
   254839,
   2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 1: {'text': 'ᑕᕝᕙᐅᖏᒻᒪᑦ <__word-separator> ᑕᕝ<__morph-boundary>ᕙᐅᖏᒻᒪᑦ',
  'label': 0,
  'input_ids': [0,
   293436,
   251953,
   252233,
   254839,
   6,
   401146,
   312444,
   401145,
   6,
   252602,
   251953,
   252233,
   254839,
   2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 2: {'text': 'ᑕᕝᕙᐅᖏᒻᒪᑦ <__word-separator> ᑕᕝᕙ<__morph-boundary>ᐅᖏᒻᒪᑦ',
  'label': 1,
  'input_ids': [0,
   293436,
   251953,
   252233,
   254839,
   6,
   401146,
   293436,
   401145,
   251593,
   252233,
   254839,
   2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 3: {'text': 'ᑕᕝᕙᐅᖏᒻᒪᑦ <__word-separator> ᑕᕝᕙᐅ<__morph-boundary>ᖏᒻᒪᑦ',
  'label': 1,
  

In [61]:
items = {}
w = "un@bound@ed"
ws = ["un@bound@ed", "in@justice"]

for w in ws:
    segments = w.split("@")
    word = "".join(segments)
    segment_lens = [len(s) for s in segments] 
    hyphen_pos = set(itertools.accumulate(segment_lens[:-1]))
    for i in range(1, len(word)):
        text = f"{word} {word_separator_token} {word[:i]}{morph_boundary_token}{word[i:]}"
        label = int(i in hyphen_pos)
        value_dict = {"text": text, "label": label}
        value_dict.update(tokenizer(text))
        items[len(items)] = value_dict

print(w)
print(segments)
print(word)
print(segment_lens)
print(hyphen_pos)
print()

in@justice
['in', 'justice']
injustice
[2, 7]
{2}



In [62]:
items

{0: {'text': 'unbounded <__word-separator> u<__morph-boundary>nbounded',
  'label': 0,
  'input_ids': [0, 51, 99091, 297, 6, 401146, 75, 401145, 287048, 167457, 2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 1: {'text': 'unbounded <__word-separator> un<__morph-boundary>bounded',
  'label': 1,
  'input_ids': [0, 51, 99091, 297, 6, 401146, 51, 401145, 340640, 297, 2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 2: {'text': 'unbounded <__word-separator> unb<__morph-boundary>ounded',
  'label': 0,
  'input_ids': [0, 51, 99091, 297, 6, 401146, 51, 275, 401145, 6, 167457, 2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 3: {'text': 'unbounded <__word-separator> unbo<__morph-boundary>unded',
  'label': 0,
  'input_ids': [0, 51, 99091, 297, 6, 401146, 51, 837, 401145, 165, 297, 2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 4: {'text': 'unbounded <__word-separator> unbou<__morph-boundary>nded',
  'label': 0,
  'input_ids': [0,
   51,
 

In [57]:
s1 = set([0, 1])
s2 = set([0, 1, 2])

In [59]:
s1 - s2 == True

False

In [72]:
x = [i["label"] for i in items.values()]


In [73]:
x

[0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]

In [67]:
y = np.ndarray([i["label"] for i in items.values()])

In [68]:
y

array([], shape=(0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0),
      dtype=float64)

In [69]:
s1 - set(y)

{0, 1}

In [71]:
y.shape

(0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)

In [91]:
a = np.array([0, 1])

In [92]:
a

array([0, 1])