# Imports, dataset

In [42]:
#General data packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
#Deep learning
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras

In [17]:
full_fems = pd.read_csv("path/to/data")
# Shuffle before the train/dev/test split. The test set is later taken as a
# positional slice of this frame, so a fixed seed keeps that slice identical
# across kernel restarts.
full_fems = shuffle(full_fems, random_state=42)
full_fems  # Example words in dataset

Unnamed: 0,forma_zenska,forma_meska
5186,kochanka,kochanek
782,egzorcystka,egzorcysta
607,apiterapeutka,apiterapeuta
3692,zawadczanka,zawadczanin
641,azjatka,azjata
...,...,...
2354,bialanka,bialanin
159,modniarka,modniarz
4250,realizatorka,realizator
21,bursztyniarka,bursztyniarz


# Preparing data

In [18]:
# The masculine forms are used as-is; every feminine (target) form is wrapped
# in a start marker "\t" and an end marker "\n".
input_forms = list(full_fems.forma_meska)
target_forms = ["".join(("\t", feminine, "\n")) for feminine in full_fems.forma_zenska]

In [19]:
# Character-level tokenizers map every character to an integer index.
# The one-hot encoding itself is built by hand in the cells further below.
tokenizer_input = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_input.fit_on_texts(input_forms)

data_input = tokenizer_input.texts_to_sequences(input_forms)
data_input = keras.preprocessing.sequence.pad_sequences(data_input, padding="post")

tokenizer_output = keras.preprocessing.text.Tokenizer(char_level=True, lower=True)
tokenizer_output.fit_on_texts(target_forms)

# Targets have to be encoded with the *output* tokenizer: the input tokenizer
# was never fitted on the "\t" / "\n" boundary markers, and its character
# indices do not match the output vocabulary.
data_output = tokenizer_output.texts_to_sequences(target_forms)
data_output = keras.preprocessing.sequence.pad_sequences(data_output, padding="post")

In [20]:
# Characters known to the input tokenizer (the keys of its word_index mapping)
tokenizer_input.word_index.keys()

dict_keys(['a', 'i', 'n', 'o', 'r', 't', 'e', 'z', 's', 'k', 'c', 'y', 'l', 'w', 'p', 'm', 'd', 'u', 'b', 'g', 'j', 'h', 'ł', 'f', 'ń', 'ż', 'ó', 'ś', 'ę', 'ą', 'ź', 'ć'])

In [21]:
# Longest masculine form and the input alphabet
input_seq_len = max(map(len, input_forms))
input_characters = tokenizer_input.word_index.keys()

# Longest feminine form (boundary markers included) and the output alphabet
output_seq_len = max(map(len, target_forms))
output_characters = tokenizer_output.word_index.keys()

I have only ~6000 pairs, so I hold out about 10% of them (600 pairs) as the test set.

In [22]:
# Hold out the last 600 pairs for testing and keep the rest for training.
# NOTE: this rebinds input_forms / target_forms, so run this cell only once per kernel.
input_forms, input_test = input_forms[:-600], input_forms[-600:]
target_forms, target_forms_test = target_forms[:-600], target_forms[-600:]

Scheme for representing data

In [23]:
# Per-word shape is (time steps, features). The feature axis has one slot more
# than the vocabulary size (presumably because tokenizer indices start at 1,
# leaving slot 0 for padding).
encoder_word_shape = (input_seq_len, len(input_characters) + 1)
decoder_word_shape = (output_seq_len, len(output_characters) + 1)

encoder_input_data = np.zeros((len(input_forms),) + encoder_word_shape)
encoder_input_data_test = np.zeros((len(input_test),) + encoder_word_shape)

decoder_input_data = np.zeros((len(target_forms),) + decoder_word_shape)
decoder_target_data = np.zeros((len(target_forms),) + decoder_word_shape)

In [24]:
input_char_ids = tokenizer_input.word_index
output_char_ids = tokenizer_output.word_index

for row, (masculine, feminine) in enumerate(zip(input_forms, target_forms)):
  # Encoder input: one-hot of every character of the masculine form
  for pos, ch in enumerate(masculine):
    encoder_input_data[row, pos, input_char_ids[ch]] = 1.
  for pos, ch in enumerate(feminine):
    char_id = output_char_ids[ch]
    decoder_input_data[row, pos, char_id] = 1.
    # The decoder target is the decoder input shifted one step to the left,
    # so the first character (the start marker) never appears in the target.
    if pos >= 1:
      decoder_target_data[row, pos - 1, char_id] = 1.

In [25]:
# One-hot encode the held-out masculine forms with the input tokenizer's indices
test_char_ids = tokenizer_input.word_index
for row, masculine in enumerate(input_test):
  for pos, ch in enumerate(masculine):
    encoder_input_data_test[row, pos, test_char_ids[ch]] = 1.

# Building model

In [26]:
class WordAccuracy(keras.metrics.Metric):
  '''
  Fraction of words that are predicted correctly from beginning to end.

  Categorical accuracy only tells how many single characters are predicted
  correctly, and it also counts padding positions that the model never learns
  to predict, so the default "accuracy" can be misleading. Here a word counts
  as correct only if every non-padding position matches.

  Expects one-hot `y_true` and per-class scores `y_pred`, both indexed as
  (word, position, class). `sample_weight` is accepted for API compatibility
  but ignored.
  '''

  def __init__(self, name='word_accuracy', **kwargs):
    super(WordAccuracy, self).__init__(name=name, **kwargs)
    # Number of fully correct words / number of words seen so far.
    # Each weight gets its own name (both used to be called "total").
    self.total = self.add_weight(name="total", initializer="zeros")
    self.count = self.add_weight(name="count", initializer="zeros")

  def update_state(self, y_true, y_pred, sample_weight=None):
    '''Accumulate the number of correct words and the batch size.'''
    y_true = tf.convert_to_tensor(y_true)

    # A position belongs to the word (not padding) when its one-hot row has a 1.
    is_real = tf.equal(tf.reduce_max(y_true, axis=-1), tf.cast(1, y_true.dtype))
    true_ids = tf.argmax(y_true, axis=-1)
    pred_ids = tf.argmax(y_pred, axis=-1)

    # Padding positions never count against the word.
    position_ok = tf.logical_or(tf.equal(true_ids, pred_ids), tf.logical_not(is_real))
    word_ok = tf.reduce_all(position_ok, axis=-1)

    self.total.assign_add(tf.reduce_sum(tf.cast(word_ok, self.total.dtype)))
    self.count.assign_add(tf.cast(tf.shape(y_true)[0], self.count.dtype))

  def result(self):
    '''Return correct words / seen words (0 when nothing has been seen yet).'''
    return tf.math.divide_no_nan(self.total, self.count)

  def reset_state(self):
    '''Reset both counters to zero.'''
    self.total.assign(0.)
    self.count.assign(0.)

  def reset_states(self):
    '''Older spelling of `reset_state`, kept because the training loop calls it.'''
    self.reset_state()

In [27]:
batch_size = 32
epochs = 100

# Split all three aligned arrays with one call so the rows stay paired.
# A fixed random_state keeps the validation set stable between runs.
(encoder_input_train, encoder_input_val,
 decoder_input_train, decoder_input_val,
 decoder_target_train, decoder_target_val) = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data,
    test_size=.12, random_state=42)

# Training batches are reshuffled over the whole training set
train_dataset = tf.data.Dataset.from_tensor_slices((encoder_input_train, decoder_input_train, decoder_target_train))
train_dataset = train_dataset.shuffle(buffer_size=len(encoder_input_train)).batch(batch_size)

# Validation batches keep their order
val_dataset = tf.data.Dataset.from_tensor_slices((encoder_input_val, decoder_input_val, decoder_target_val))
val_dataset = val_dataset.batch(batch_size)

In [28]:
# Units of the first and the second LSTM layer (shared by encoder and decoder)
dimensions = [1024, 256]

# NOTE: the prediction cells later in this notebook pick layers out of the
# saved model by position (layers[4] ... layers[9]), so changing the order in
# which layers are created or connected here needs a matching change there.

#Encoder architecture
# Variable-length sequence of one-hot characters (vocabulary size + 1 features)
encoder_inputs = keras.layers.Input(shape=(None, len(input_characters) + 1))
# Masking with default arguments; presumably skips all-zero (padding) steps - see Keras docs
masked_input = keras.layers.Masking()(encoder_inputs)

# First encoder LSTM returns the whole sequence (fed to the second LSTM) and its final states
encoder_lstm1 = keras.layers.LSTM(dimensions[0], return_state=True, return_sequences=True)
encoder_outputs, h1, c1 = encoder_lstm1(masked_input)

# Second encoder LSTM: only its final states are kept
encoder_lstm2 = keras.layers.LSTM(dimensions[1], return_state=True)
_, h2, c2 = encoder_lstm2(encoder_outputs)

# All four encoder states; not referenced again in this cell
encoder_states = [h1, c1, h2, c2]

#Decoder architecture
decoder_inputs = keras.layers.Input(shape=(None, len(output_characters) +1))
masked_inputs = keras.layers.Masking()(decoder_inputs)

# Each decoder LSTM starts from the final states of the encoder LSTM with the same width
decoder_lstm1 = keras.layers.LSTM(dimensions[0], return_sequences=True, return_state=True)
decoder_outputs, dh1, dc1 = decoder_lstm1(masked_inputs, initial_state = [h1, c1])

decoder_lstm2 = keras.layers.LSTM(dimensions[1], return_sequences=True, return_state=True)
decoder_outputs_2, dh2, dc2 = decoder_lstm2(decoder_outputs, initial_state = [h2, c2])

# tanh Dense layer applied to the second decoder LSTM's output sequence
deep_dense = keras.layers.Dense(dimensions[1], activation = "tanh")
deep_out = deep_dense(decoder_outputs_2)

# Softmax over the output alphabet (+1 slot) at every time step
out_dense = keras.layers.Dense(len(output_characters) + 1, activation='softmax')
outputs = out_dense(deep_out)

#Full model
# Takes [encoder input, decoder input] and returns per-step character probabilities
feminitizer = keras.models.Model(inputs=[encoder_inputs, decoder_inputs], 
              outputs=outputs)

feminitizer.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 33)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 35)]   0                                            
__________________________________________________________________________________________________
masking (Masking)               (None, None, 33)     0           input_1[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 35)     0           input_2[0][0]                    
______________________________________________________________________________________________

In [53]:
@tf.function
def train_on_batch(X1, X2, y):
  '''
  Run one optimisation step on a single batch.

  X1 is the encoder input batch, X2 the decoder input batch and y the decoder
  target batch. Relies on the module-level `feminitizer`, `loss_fn`,
  `optimizer`, `mean_loss` and `metrics` defined in the training cell.
  Updates the running mean loss and every metric in `metrics`; returns nothing.
  '''
  with tf.GradientTape() as tape:
    y_pred = feminitizer([X1, X2], training=True)
    main_loss = tf.reduce_mean(loss_fn(y, y_pred))
    # Add any regularisation losses registered on the model
    loss = tf.add_n([main_loss] + feminitizer.losses)

  # The loss is tracked through `mean_loss` only. Appending it to a Python
  # list inside a tf.function would run just once, at trace time, and store a
  # symbolic tensor instead of per-batch values.
  gradients = tape.gradient(loss, feminitizer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, feminitizer.trainable_variables))
  mean_loss(loss)

  # Update metrics
  for metric in metrics:
    metric(y, y_pred)

@tf.function
def valid_on_batch(X1, X2, y):
  '''
  Evaluate the model on a single validation batch (no weight update).

  X1 is the encoder input batch, X2 the decoder input batch and y the decoder
  target batch. Relies on the module-level `feminitizer`, `loss_fn`,
  `mean_loss_val` and `val_metrics` defined in the training cell.
  Updates the running validation loss and every metric in `val_metrics`.
  '''
  y_val_pred = feminitizer([X1, X2], training=False)
  main_loss_val = tf.reduce_mean(loss_fn(y, y_val_pred))
  # The loss is tracked through `mean_loss_val` only. A Python list append
  # inside a tf.function would run just once, at trace time.
  mean_loss_val(main_loss_val)

  # Update metrics
  for metric in val_metrics:
    metric(y, y_val_pred)


In [54]:
#Custom progress bar for monitoring metrics
def print_status_bar(iteration, total, loss, metrics=None):
  '''
  Print a one-line progress report that overwrites itself.

  iteration / total are shown as "iteration/total". `loss` and every item of
  `metrics` must expose `.name` and `.result()`; each is rendered as
  "name: value" with four decimals. The line ends with a newline only once
  `iteration` has reached `total`, otherwise the cursor stays on the line so
  the next call can overwrite it.
  '''
  parts = ["{}: {:.4f}".format(m.name, m.result())
           for m in [loss, *(metrics or [])]]
  line_end = "" if iteration < total else "\n"
  print("\r{}/{} - ".format(iteration, total) + " - ".join(parts), end=line_end)


def print_status_bar_val(iteration, total, loss, metrics=None):
  '''Validation variant of `print_status_bar`; the output format is identical.'''
  print_status_bar(iteration, total, loss, metrics)

Training loop

In [None]:
# A model is checkpointed only when its validation word accuracy beats this
# value; the bar is raised every time a better model is saved.
best_word_acc = 0.8

# Loss function and optimizer kept after experimenting with different options
loss_fn = keras.losses.categorical_crossentropy
optimizer = tfa.optimizers.LazyAdam()

mean_loss = keras.metrics.Mean()
mean_loss_val = keras.metrics.Mean()

metrics = [keras.metrics.CategoricalAccuracy(), WordAccuracy()]
val_metrics = [keras.metrics.CategoricalAccuracy(), WordAccuracy()]

n_train = len(encoder_input_train)
n_val = len(encoder_input_val)

for epoch in range(epochs):
  # Epochs are reported 1-based so the last one reads "epochs/epochs"
  print(f"\nEpoch {epoch + 1}/{epochs}")
  print("\nTraining set")

  # Training
  for step, (encoder_batch, decoder_batch, y_batch) in enumerate(train_dataset):
    train_on_batch(encoder_batch, decoder_batch, y_batch)

    # Samples processed so far, including the current (possibly short) batch
    samples_seen = min((step + 1) * batch_size, n_train)
    print_status_bar(samples_seen, n_train, mean_loss, metrics)

  print("\nValidation set:")

  # Validation
  for encoder_batch_val, decoder_batch_val, target_batch_val in val_dataset:
    valid_on_batch(encoder_batch_val, decoder_batch_val, target_batch_val)

  # Save the best model (model with highest validation word accuracy)
  val_word_acc = float(val_metrics[1].result())
  if val_word_acc > best_word_acc:
    feminitizer.save('femz.h5')
    best_word_acc = val_word_acc

  print_status_bar_val(n_val, n_val, mean_loss_val, val_metrics)

  # Reset metrics so every epoch is reported on its own
  for metric in [mean_loss] + metrics:
    metric.reset_states()
  for metric in [mean_loss_val] + val_metrics:
    metric.reset_states()

# Building model and functions for new predictions

Let's use the best saved model to generate new predictions, i.e. feminatives.

In [43]:
# Load the checkpoint from the same relative path the training loop saves it
# to ('femz.h5'), instead of a hard-coded absolute directory.
feminitizer_pred = keras.models.load_model("femz.h5")

# We need to take out the encoder's internal states: layers 4 and 6 each
# return (output, h, c). Presumably these are the two encoder LSTMs - verify
# against feminitizer_pred.summary() if the architecture changes.
_, h1, c1 = feminitizer_pred.layers[4].output
_, h2, c2 = feminitizer_pred.layers[6].output



In [44]:
# Encoder model for prediction: masculine word in, the four encoder states out
encoder_predictor = keras.models.Model(feminitizer_pred.input[0], [h1, c1, h2, c2])

# Decoder model for prediction: the previous states arrive as explicit inputs,
# created in the order h1, c1 (first LSTM width) then h2, c2 (second LSTM width)
state_widths = (dimensions[0], dimensions[0], dimensions[1], dimensions[1])
decoder_states_inputs = [keras.layers.Input(shape=(width,)) for width in state_widths]
(decoder_state_input_h1, decoder_state_input_c1,
 decoder_state_input_h2, decoder_state_input_c2) = decoder_states_inputs

# Reuse the trained layers by position (presumably the two decoder LSTMs
# followed by the two Dense layers - verify against the model summary)
decoder_token_input = feminitizer_pred.input[1]
first_recurrent, second_recurrent, hidden_projection, output_projection = (
    feminitizer_pred.layers[position] for position in (5, 7, 8, 9))

d_o, state_h1, state_c1 = first_recurrent(
    decoder_token_input,
    initial_state=[decoder_state_input_h1, decoder_state_input_c1])

d_o, state_h2, state_c2 = second_recurrent(
    d_o,
    initial_state=[decoder_state_input_h2, decoder_state_input_c2])

decoder_states = [state_h1, state_c1, state_h2, state_c2]

deep_out = hidden_projection(d_o)
decoder_outputs = output_projection(deep_out)

# Inputs: current character + four states; outputs: probabilities + four new states
decoder_predictor = keras.models.Model(
    [decoder_token_input] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

decoder_predictor.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None, 35)]   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 1024), 4341760     input_2[0][0]                    
                                                                 input_3[0][0]              

In [45]:
# Persist the inference encoder and decoder so they can be reloaded later
for predictor, filename in (
    (encoder_predictor, "encoder_predict.h5"),
    (decoder_predictor, "decoder_predict.h5"),
):
  predictor.save(filename)

In [None]:
# Reload the inference encoder and decoder from disk (encoder first)
encoder_pred, decoder_pred = (
    keras.models.load_model(filename)
    for filename in ('encoder_predict.h5', 'decoder_predict.h5')
)

In [47]:
def decode_sequence(input_seq):
  '''
  Greedily decode the feminine form for one one-hot encoded masculine word.

  input_seq: array holding a single encoded word (a slice of length 1 along
    the first axis of the encoder input array).
  Returns the predicted characters as a string. The end marker "\\n" is part
  of the string when the model produced it, so callers usually `.strip()`.

  Relies on the module-level `encoder_pred`, `decoder_pred`,
  `tokenizer_output`, `output_characters` and `output_seq_len`.
  '''
  # Encode the input word; the four encoder states seed the decoder.
  # verbose=0 keeps Keras from printing a progress bar for every call.
  states_value = list(encoder_pred.predict(input_seq, verbose=0))
  num_tokens = len(output_characters) + 1

  # Start with a target sequence of length 1 holding only the start character
  target_seq = np.zeros((1, 1, num_tokens))
  target_seq[0, 0, tokenizer_output.word_index['\t']] = 1.

  predicted_feminative = ''
  while True:
    output_tokens, h1, c1, h2, c2 = decoder_pred.predict(
            [target_seq] + states_value, verbose=0)

    # Greedy choice: most probable character at the last time step
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
    sampled_char = tokenizer_output.index_word.get(sampled_token_index)

    # Slot 0 has no character attached (index_word has no key 0); treat it
    # as "nothing more to emit" instead of failing with a KeyError.
    if sampled_char is None:
      break

    predicted_feminative += sampled_char

    # Stop on the end marker or when the word grows beyond the longest
    # target seen during training.
    if sampled_char == '\n' or len(predicted_feminative) > output_seq_len:
      break

    # Feed the sampled character and the new states into the next step
    target_seq = np.zeros((1, 1, num_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    states_value = [h1, c1, h2, c2]

  return predicted_feminative

# Enjoying results

In [48]:
# Word-level error rate on the held-out test set. The size is taken from the
# data instead of repeating the literal used when the test set was split off.
fails = 0
n_test = len(target_forms_test)

for seq_index in range(n_test):
    input_seq = encoder_input_data_test[seq_index: seq_index + 1]
    decoded_word = decode_sequence(input_seq)
    # strip() removes the "\t" / "\n" boundary markers before comparing
    if decoded_word.strip() != target_forms_test[seq_index].strip():
      fails += 1

print(fails / n_test)

0.145


The error rate on the test set is very similar to the error rate on the validation set.

# Custom feminization

Function to generate custom feminatives

In [50]:
def feminatize(words):
  '''
  Generate feminine forms for a whitespace-separated string of masculine words.

  words: string such as "informatyk poseł"; it is split on whitespace and
    lower-cased before encoding.
  Returns the predicted feminine forms joined with ", " (an empty string for
  empty input).
  Raises KeyError when a word contains a character the input tokenizer has
  never seen.

  Relies on the module-level `tokenizer_input`, `input_seq_len`,
  `input_characters` and `decode_sequence`.
  '''
  tokens = [word.lower() for word in words.split()]
  if not tokens:
    return ""

  char_to_id = tokenizer_input.word_index
  for token in tokens:
    unknown = sorted(set(token) - set(char_to_id))
    if unknown:
      raise KeyError(
          "word {!r} contains characters unknown to the model: {}".format(
              token, ", ".join(unknown)))

  # The encoder accepts any sequence length, so make room for words longer
  # than the longest training word instead of failing with an IndexError.
  seq_len = max(input_seq_len, max(len(token) for token in tokens))
  user_array = np.zeros((len(tokens), seq_len, len(input_characters) + 1))
  for i, token in enumerate(tokens):
    for t, char in enumerate(token):
      user_array[i, t, char_to_id[char]] = 1.

  # Decode word by word; strip() drops the end marker added by the decoder
  return ", ".join(
      decode_sequence(user_array[seq_index: seq_index + 1]).strip()
      for seq_index in range(len(tokens)))

In [52]:
# Example: feminize four masculine nouns passed as one whitespace-separated string
feminatize("informatyk poseł wariat niedźwiedź")

'informatyczka, posłanka, wariatka, niedźwiedzica'