### Imports

In [None]:
!pip install transformers

In [25]:
from transformers import BertModel, BertTokenizer

In [None]:
model = BertModel.from_pretrained("dbmdz/bert-base-turkish-cased")

In [32]:
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

In [107]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [108]:
TEST_NAME = "upsampling"

In [109]:
from google.colab import drive
prefix = "/content/drive"
drive.mount(prefix)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [110]:
prefix += "/My Drive/Thesis"

In [111]:
train_file = prefix+"/"+TEST_NAME+"/train.txt"
val_file = prefix+"/"+TEST_NAME+"/dev.txt"
test_file = prefix+"/"+TEST_NAME+"/test.txt"
dataset_sentences = prefix+"/"+"parse_sentences.txt"

In [112]:
with open(dataset_sentences) as f:
  sent_dict = {}
  lines = f.read().split("\n")  
  for line in lines[:-1]:
      sentence, id = line.split("\t")
      sent_dict[id] = sentence

In [113]:
len(sent_dict)

4849

In [114]:
def get_context(raw, id):

  sentence = sent_dict[id]

  input_ids = tokenizer(sentence)["input_ids"]

  encoded_input = tokenizer(sentence, return_tensors='pt')
  output = model(**encoded_input)

  idx = 0
  for id_cand,id in enumerate(input_ids):
    if tokenizer.decode([id]) == raw[:len(tokenizer.decode([id]))]:
      idx = id_cand

  return output['last_hidden_state'][0][idx]

In [115]:
def process_dataset(data):
  with open(data) as f:
    lines = f.read().split("\n")
  text_pairs = []
  for line in lines[:-1]:
      token, morphs, id = line.split("\t")
      raw = " ".join(token) # Char split
      morphs = f"{' '.join(morphs.split('+')[0])} {' '.join(morphs.split('+')[1:])}" # Char split
      morphs = "[start] " + morphs + " [end]"
      #context = get_context(token, id)
      #text_pairs.append((raw, morphs, context))
      text_pairs.append((raw, morphs))
  return text_pairs

In [116]:
train_pairs = process_dataset(train_file)
val_pairs = process_dataset(val_file)
test_pairs = process_dataset(test_file)

In [117]:
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

62981 training pairs
9966 validation pairs
9966 test pairs


In [118]:
char_set = set()
for pair in train_pairs+val_pairs+test_pairs:
  for c in pair[0].split():
    char_set.add(c)
  for c in pair[1].split():
    char_set.add(c)

print(len(char_set))

355


## Dataset Steps


In [119]:
vocab_size = 350 #
sequence_length = 25 # This would be larger --> 20 was default
batch_size = 64

def custom_standardization_raw(input_string):
    return input_string

def custom_standardization_morph(input_string):
    return input_string

raw_vectorization = TextVectorization(
    max_tokens=vocab_size, 
    output_mode="int", 
    output_sequence_length=sequence_length, 
    standardize=custom_standardization_raw
)
morph_vectorization = TextVectorization(
    standardize=custom_standardization_morph,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
)

train_raw_texts = [pair[0] for pair in train_pairs]
train_morph_texts = [pair[1] for pair in train_pairs]

raw_vectorization.adapt(train_raw_texts)
morph_vectorization.adapt(train_morph_texts)

In [120]:
import pickle
pickle.dump({'config': raw_vectorization.get_config(),
             'weights': raw_vectorization.get_weights()}
            , open(f"{TEST_NAME}_raw_vectorizator.pkl", "wb"))

pickle.dump({'config': morph_vectorization.get_config(),
             'weights': morph_vectorization.get_weights()}
            , open(f"{TEST_NAME}_morph_vectorizator.pkl", "wb"))

In [121]:
def format_dataset(raw, morph):
    raw = raw_vectorization(raw)
    morph = morph_vectorization(morph)
    return ({"encoder_inputs": raw, "decoder_inputs": morph[:, :-1]}, morph[:, 1:])

In [122]:
def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset
    #return dataset.shuffle(2048).prefetch(16).cache()

In [123]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [124]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 25)
inputs["decoder_inputs"].shape: (64, 25)
targets.shape: (64, 25)


In [125]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'token_embeddings': self.token_embeddings,
            'position_embeddings': self.position_embeddings,
            'sequence_length': self.sequence_length,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim,
              })
        return config

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

## Training Step

In [128]:
vocab_size = 350
sequence_length = 25
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [None]:
epochs = 5  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_10 (Posit  (None, None, 256)   96000       ['encoder_inputs[0][0]']         
 ionalEmbedding)                                                                                  
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder_5 (Transfo  (None, None, 256)   3155456     ['positional_embedding_

In [None]:
from google.colab import files
transformer.save_weights(f"{TEST_NAME}")
files.download(f"{TEST_NAME}_raw_vectorizator.pkl")
files.download(f"{TEST_NAME}_morph_vectorizator.pkl")
files.download(f"{TEST_NAME}.index")
files.download(f"{TEST_NAME}.data-00000-of-00001")

In [None]:
morph_vocab = morph_vectorization.get_vocabulary()
morph_index_lookup = dict(zip(range(len(morph_vocab)), morph_vocab))
max_decoded_sentence_length = 25

In [None]:
def decode_sequence(input_sentence):
    tokenized_input_sentence = raw_vectorization([input_sentence])
    decoded_sentence = "[start]+"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = morph_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = morph_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
          break
    return decoded_sentence

In [None]:
pos_unique = ['NOUN', 'VERB', 'PUNCT', 'ADJ', 'ADP', 'CONJ', 'DET', 'ADV', 'NUM', 'PRON', 'PRT', 'X', 'ONOM', 'AFFIX']

def normalize(input):
  pre = ""
  feats = ""
  tokens = input.split()[1:-1]
  match = False
  for idx,token in enumerate(tokens):
    if token.split("^")[0] in pos_unique and not match:
      pre += "".join(tokens[0:idx])+"+"+token+"+"
      match=True
    elif match:
      feats += token+"+"
  return (pre+feats).strip("+")

In [None]:
print(normalize(decode_sequence(test_raw_texts[0])))
print(normalize(test_morph_texts[0]))

In [None]:
import sys
test_raw_texts = [pair[0] for pair in test_pairs]
test_morph_texts = [pair[1] for pair in test_pairs]

y_true = []
y_pred = []

with open(f'{TEST_NAME}.txt', 'w+', encoding='utf-8') as f_out:
  for input_sentence, analyzed_sentence in zip(test_raw_texts,test_morph_texts):
    try:
      pred = normalize(decode_sequence(input_sentence))
      true = normalize(analyzed_sentence)

      y_true.append(true)
      y_pred.append(pred)

      f_out.write(f"{pred}\t{true}\n")
    except:
      print("Error")
      sys.exit()

files.download(f'{TEST_NAME}.txt')

In [None]:
y_true = [true.split("+") for true in y_true]
y_pred = [pred.split("+") for pred in y_pred]

In [None]:
y_true[0]

In [None]:
y_pred = [x if len(x)>1 else ["."]+x for x in y_pred]
y_true = [x if len(x)>1 else ["."]+x for x in y_true]

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_BLEU():
  total_score = 0
  for pred, true in zip(y_pred,y_true):
    reference = [pred][:]
    candidate = true[:]
    score = sentence_bleu(reference, candidate)
    
    total_score += score

  return total_score/len(y_true)

In [None]:
from sklearn.metrics import accuracy_score

def calculate_lemma():

  true = [token[0] for token in y_true]
  pred = [token[0] for token in y_pred]

  return accuracy_score(true,pred)

def calculate_POS():
  true = [token[1] for token in y_true]
  pred = [token[1] for token in y_pred]

  return accuracy_score(true,pred)

In [None]:
def calculate_precision_and_recall(y_true,y_pred):

    precisions = []
    recalls = []

    for true,pred in zip(y_true,y_pred):
        try:
            max_precision = 0
            max_recall = 0

            x = true
            y = pred

            if len(x) > 0 and len(y) > 0:
                shared_items = dict()
                for k in x:
                    if (k in y) and (x.index(k) == y.index(k)):
                        shared_items[k] = x.index(k)

                recall = len(shared_items)/len(x)
                precision = len(shared_items)/len(y)

                if precision > max_precision:
                    max_precision = precision

                if recall > max_recall:
                    max_recall = recall

            precisions.append(max_precision)
            recalls.append(max_recall)

        except KeyError:
            precisions.append(0)
            recalls.append(0)


    sum = 0
    for item in precisions:
        sum += item

    average_precision = sum/len(precisions)

    sum = 0

    for item in recalls:
        sum += item

    average_recall = sum/len(precisions)

    
    return average_precision, average_recall

In [None]:
bleu_score = calculate_BLEU()
precision, recall = calculate_precision_and_recall(y_true,y_pred)
pos_score = calculate_POS()
lemma_score = calculate_lemma()

## Baseline

In [106]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)

Precision: 0.7837730515107862
Recall: 0.7844898273845623
BLEU Score: 0.81305067886416
POS Accuracy: 0.8554083885209713
Lemma Accuracy: 0.8404575556893438


## Upsampling

In [None]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)