### Imports

In [None]:
!pip install transformers

In [None]:
from transformers import BertModel, BertTokenizer

In [None]:
model = BertModel.from_pretrained("dbmdz/bert-base-turkish-cased")

In [None]:
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

In [1]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [2]:
with tf.device("gpu:0"):
   print("tf.keras code in this scope will run on GPU")

tf.keras code in this scope will run on GPU


In [3]:
TEST_NAME = "upsampling"

In [4]:
from google.colab import drive
prefix = "/content/drive"
drive.mount(prefix)

Mounted at /content/drive


In [5]:
prefix += "/My Drive/Thesis"

In [6]:
train_file = prefix+"/"+TEST_NAME+"/train.txt"
val_file = prefix+"/"+TEST_NAME+"/dev.txt"
test_file = prefix+"/"+TEST_NAME+"/test.txt"
dataset_sentences = prefix+"/"+"parse_sentences.txt"

In [7]:
with open(dataset_sentences) as f:
  sent_dict = {}
  lines = f.read().split("\n")  
  for line in lines[:-1]:
      sentence, id = line.split("\t")
      sent_dict[id] = sentence

In [8]:
len(sent_dict)

4849

In [9]:
def get_context(raw, id):

  sentence = sent_dict[id]

  input_ids = tokenizer(sentence)["input_ids"]

  encoded_input = tokenizer(sentence, return_tensors='pt')
  output = model(**encoded_input)

  idx = 0
  for id_cand,id in enumerate(input_ids):
    if tokenizer.decode([id]) == raw[:len(tokenizer.decode([id]))]:
      idx = id_cand

  return output['last_hidden_state'][0][idx]

In [10]:
def process_dataset(data):
  with open(data) as f:
    lines = f.read().split("\n")
  text_pairs = []
  for line in lines:
    if not(line):
      continue
    try:
      token, morphs, id = line.split("\t")
    except:
      token, morphs = line.split("\t")
    raw = " ".join(token) # Char split
    morphs = f"{' '.join(morphs.split('+')[0])} {' '.join(morphs.split('+')[1:])}" # Char split
    morphs = "[start] " + morphs + " [end]"
    #context = get_context(token, id)
    #text_pairs.append((raw, morphs, context))
    text_pairs.append((raw, morphs))
  return text_pairs

In [11]:
train_pairs = process_dataset(train_file)
val_pairs = process_dataset(val_file)
test_pairs = process_dataset(test_file)

In [12]:
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

56771 training pairs
9966 validation pairs
9966 test pairs


In [13]:
char_set = set()
for pair in train_pairs+val_pairs+test_pairs:
  for c in pair[0].split():
    char_set.add(c)
  for c in pair[1].split():
    char_set.add(c)

print(len(char_set))

315


In [14]:
def get_max_seq_length(pairs):
    return max(max([len(token[0].split()[1:-1]) for token in pairs]),
               max([len(token[0].split()[1:-1]) for token in pairs]))

In [15]:
max_seq_length = max(get_max_seq_length(train_pairs),
                     get_max_seq_length(val_pairs),
                     get_max_seq_length(test_pairs))

## Dataset Steps


In [16]:
vocab_size = len(char_set)
sequence_length = max_seq_length
batch_size = 64

def custom_standardization_raw(input_string):
    return input_string

def custom_standardization_morph(input_string):
    return input_string

raw_vectorization = TextVectorization(
    max_tokens=vocab_size, 
    output_mode="int", 
    output_sequence_length=sequence_length, 
    standardize=custom_standardization_raw
)
morph_vectorization = TextVectorization(
    standardize=custom_standardization_morph,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
)

train_raw_texts = [pair[0] for pair in train_pairs]
train_morph_texts = [pair[1] for pair in train_pairs]

raw_vectorization.adapt(train_raw_texts)
morph_vectorization.adapt(train_morph_texts)

In [20]:
import pickle
pickle.dump({'config': raw_vectorization.get_config(),
             'weights': raw_vectorization.get_weights()}
            , open(f"{TEST_NAME}_raw_vectorizator.pkl", "wb"))

pickle.dump({'config': morph_vectorization.get_config(),
             'weights': morph_vectorization.get_weights()}
            , open(f"{TEST_NAME}_morph_vectorizator.pkl", "wb"))

In [21]:
def format_dataset(raw, morph):
    raw = raw_vectorization(raw)
    morph = morph_vectorization(morph)
    return ({"encoder_inputs": raw, "decoder_inputs": morph[:, :-1]}, morph[:, 1:])

In [22]:
def make_dataset(pairs):
    raw_texts, morph_texts = zip(*pairs)
    raw_texts = list(raw_texts)
    morph_texts = list(morph_texts)
    dataset = tf.data.Dataset.from_tensor_slices((raw_texts, morph_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    if TEST_NAME == "feat_dist":
      return dataset.shuffle(2048).prefetch(16).cache()
    return dataset

In [23]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [24]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 26)
inputs["decoder_inputs"].shape: (64, 26)
targets.shape: (64, 26)


In [25]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'token_embeddings': self.token_embeddings,
            'position_embeddings': self.position_embeddings,
            'sequence_length': self.sequence_length,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim,
              })
        return config

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

## Training Step

In [26]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [28]:
epochs = 5  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_2 (Positi  (None, None, 256)   87296       ['encoder_inputs[0][0]']         
 onalEmbedding)                                                                                   
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder_1 (Transfo  (None, None, 256)   3155456     ['positional_embedding_

<keras.callbacks.History at 0x7f9252adeed0>

In [29]:
from google.colab import files
transformer.save_weights(f"{TEST_NAME}")
files.download(f"{TEST_NAME}_raw_vectorizator.pkl")
files.download(f"{TEST_NAME}_morph_vectorizator.pkl")
files.download(f"{TEST_NAME}.index")
files.download(f"{TEST_NAME}.data-00000-of-00001")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
morph_vocab = morph_vectorization.get_vocabulary()
morph_index_lookup = dict(zip(range(len(morph_vocab)), morph_vocab))
max_decoded_sentence_length = 25

In [31]:
def decode_sequence(input_sentence):
    tokenized_input_sentence = raw_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = morph_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = morph_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
          break
    return decoded_sentence

In [32]:
pos_unique = ['NOUN', 'VERB', 'PUNCT', 'ADJ', 'CONJ^Coor', 'ADP^CBare', 'NUM', 'ADV',
       'DET^Ind', 'PRT', 'PRON', 'DET^Dem', 'NOUN^Temporal', 'NOUN^PROP', '^PRON',
       'ADP^CDat', 'X', 'CONJ^Adv', 'DET^Def', 'DET', 'ADP^CAbl', 'ADP^CGen',
       'ADP^CIns', 'CONJ^Par', 'ADV^Temporal', 'ADP^CNum', 'ADP^CFin',
       'CONJ^Sub', 'ONOM']

In [33]:
def normalize(input):
  pre = ""
  feats = ""
  tokens = input.split()[1:-1]
  match = False
  for idx,token in enumerate(tokens):
    if token in pos_unique and not match:
      pre += "".join(tokens[0:idx])+"+"+token+"+"
      match=True
    elif match:
      feats += token+"+"
  return (pre+feats).strip("+")

In [34]:
import sys
test_raw_texts = [pair[0] for pair in test_pairs]
test_morph_texts = [pair[1] for pair in test_pairs]

In [35]:
test_morph_texts[0]

'[start] t h y n NOUN Apostrophe=True PersonNumber=A3pl Possessive=Pnon Case=Nom Proper=True [end]'

In [36]:
idx = 20
print(normalize(test_morph_texts[idx]))
print(test_pairs[idx])

elçi+NOUN+PersonNumber=A3sg+Possessive=Pnon+Case=Bare+Proper=True+DB^NOUN+Derivation=Ness+PersonNumber=A3sg+Possessive=P3sg+Case=Nom+Proper=True
('e l ç i l i ğ i', '[start] e l ç i NOUN PersonNumber=A3sg Possessive=Pnon Case=Bare Proper=True DB^NOUN Derivation=Ness PersonNumber=A3sg Possessive=P3sg Case=Nom Proper=True [end]')


In [37]:
from tqdm.notebook import tqdm_notebook

y_true = []
y_pred = []

with open(f'{TEST_NAME}.txt', 'w+', encoding='utf-8') as f_out:
  for i in tqdm_notebook(range(max(len(test_raw_texts),len(test_morph_texts)))):
    try:
      pred = normalize(decode_sequence(test_raw_texts[i]))
      true = normalize(test_morph_texts[i])

      y_true.append(true)
      y_pred.append(pred)

      f_out.write(f"{pred}\t{true}\n")
    except:
      print("Error")
      sys.exit()

files.download(f'{TEST_NAME}.txt')

  0%|          | 0/9966 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [38]:
y_true = [true.split("+") for true in y_true]
y_pred = [pred.split("+") for pred in y_pred]

In [None]:
y_true[0]

In [40]:
y_pred = [x if len(x)>1 else ["."]+x for x in y_pred]
y_true = [x if len(x)>1 else ["."]+x for x in y_true]

In [41]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_BLEU():
  total_score = 0
  for pred, true in zip(y_pred,y_true):
    reference = [pred][:]
    candidate = true[:]
    score = sentence_bleu(reference, candidate)
    
    total_score += score

  return total_score/len(y_true)

In [42]:
from sklearn.metrics import accuracy_score

def calculate_lemma():

  true = [token[0] for token in y_true]
  pred = [token[0] for token in y_pred]

  return accuracy_score(true,pred)

def calculate_POS():
  true = [token[1] for token in y_true]
  pred = [token[1] for token in y_pred]

  return accuracy_score(true,pred)

In [43]:
def calculate_precision_and_recall(y_true,y_pred):

    precisions = []
    recalls = []

    for true,pred in zip(y_true,y_pred):
        try:
            max_precision = 0
            max_recall = 0

            x = true
            y = pred

            if len(x) > 0 and len(y) > 0:
                shared_items = dict()
                for k in x:
                    if (k in y) and (x.index(k) == y.index(k)):
                        shared_items[k] = x.index(k)

                recall = len(shared_items)/len(x)
                precision = len(shared_items)/len(y)

                if precision > max_precision:
                    max_precision = precision

                if recall > max_recall:
                    max_recall = recall

            precisions.append(max_precision)
            recalls.append(max_recall)

        except KeyError:
            precisions.append(0)
            recalls.append(0)


    sum = 0
    for item in precisions:
        sum += item

    average_precision = sum/len(precisions)

    sum = 0

    for item in recalls:
        sum += item

    average_recall = sum/len(precisions)

    
    return average_precision, average_recall

In [44]:
bleu_score = calculate_BLEU()
precision, recall = calculate_precision_and_recall(y_true,y_pred)
pos_score = calculate_POS()
lemma_score = calculate_lemma()

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


## Baseline

In [46]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)

Precision: 0.7837730515107862
Recall: 0.7844898273845623
BLEU Score: 0.81305067886416
POS Accuracy: 0.8554083885209713
Lemma Accuracy: 0.8404575556893438


## Phonological Normalization

In [None]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)

Precision: 0.839238831598399
Recall: 0.8384282312829477
BLEU Score: 0.8603016822291351
POS Accuracy: 0.8866144892634958
Lemma Accuracy: 0.9408990567930965


## Feature Normalization

In [None]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)

Precision: 0.7846768618141273
Recall: 0.7855815146679123
BLEU Score: 0.8338185955282936
POS Accuracy: 0.8535019064820389
Lemma Accuracy: 0.8520971302428256


## Feature Distribution

In [None]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)

Precision: 0.5920176086009074
Recall: 0.583141563153986
BLEU Score: 0.6326301739790313
POS Accuracy: 0.7238611278346377
Lemma Accuracy: 0.6385711418824002


## Upsampling

In [45]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)

Precision: 0.40824612655102005
Recall: 0.3722607563853567
BLEU Score: 0.6904352492158221
POS Accuracy: 0.31015452538631344
Lemma Accuracy: 0.5749548464780253


## All Process

In [None]:
print("Precision:", precision)
print("Recall:", recall)
print("BLEU Score:", bleu_score)
print("POS Accuracy:", pos_score)
print("Lemma Accuracy:", lemma_score)

Precision: 0.8556059411621291
Recall: 0.8584968944917806
BLEU Score: 0.891825792284436
POS Accuracy: 0.8385510736504114
Lemma Accuracy: 0.9435079269516355
