In [293]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pandas as pd

import align
import argparse
import codecs
import os, sys
from random import random, choice
import re
import glob
import matplotlib.pyplot as plt

from IPython.utils import io

In [364]:
def read_data(filename):
    """Read a tab-separated file into parallel character and tag lists.

    Every non-blank line is expected to hold at least three tab-separated
    fields. The first two fields are split into lists of characters; the
    third is split on runs of non-word characters.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A tuple ``(inputs, outputs, tags)`` of three lists with one entry
        per non-blank line.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    with codecs.open(filename, 'r', 'utf-8') as inp:
        lines = inp.readlines()
    inputs = []
    outputs = []
    tags = []
    for line in lines:
        stripped = line.strip()
        # str.split never returns an empty list, so blank lines must be
        # detected before splitting; otherwise fields[1] raises IndexError.
        if not stripped:
            continue
        fields = stripped.split('\t')
        inputs.append(list(fields[0].strip()))
        outputs.append(list(fields[1].strip()))
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        tags.append(re.split(r'\W+', fields[2].strip()))
    return inputs, outputs, tags

def find_good_range(a, b):
    """Find runs of positions where two aligned sequences hold the same character.

    A position counts as a match when ``a`` and ``b`` hold equal elements
    there and that element is not a single space. If no position matches,
    ``b`` is shifted right by one (a space is prepended) and the comparison
    is repeated once.

    Args:
        a: Sequence of characters (a string or a list of characters).
        b: Sequence of characters compared position-by-position with ``a``.

    Returns:
        A list of ``(start, end)`` tuples (``end`` exclusive) covering each
        run of consecutive matches that is strictly longer than two
        positions.
    """
    def match_mask(other):
        # zip stops at the shorter sequence, so a shorter ``b`` no longer
        # raises IndexError.
        return [x == y and x != u" " for x, y in zip(a, other)]

    mask = match_mask(b)
    if sum(mask) == 0:
        # Some times the alignment is off-by-one.
        # list(b) lets the shift work for strings as well as lists.
        mask = match_mask([' '] + list(b))

    ranges = []
    start = None  # index where the current run of matches began, if any
    for idx, hit in enumerate(mask):
        if hit and start is None:
            start = idx
        elif not hit and start is not None:
            ranges.append((start, idx))
            start = None
    if start is not None:
        # The last run extends to the end of the compared region.
        ranges.append((start, len(mask)))
    return [r for r in ranges if r[1] - r[0] > 2]
def generate_stem():
    """Return the fixed placeholder stem, a string of three underscores."""
    placeholder = "___"
    return placeholder

def get_chars(l):
    """Collect the distinct characters that occur in any word of ``l``.

    ``l`` is an iterable of words, each of which is itself iterable. The
    result is a list without duplicates, in set-iteration order.
    """
    seen = set()
    for word in l:
        for char in word:
            seen.add(char)
    return list(seen)
def best_range(ranges):
    """Return the longest ``(start, end)`` range.

    Length is ``end - start``. When several ranges share the maximum
    length, the first of them is returned.

    Args:
        ranges: Non-empty sequence of ``(start, end)`` pairs.

    Returns:
        The element of ``ranges`` with the greatest length.

    Raises:
        ValueError: If ``ranges`` is empty.
    """
    if len(ranges) == 0:
        raise ValueError("best_range() requires at least one range")
    # max() returns the first maximal element, matching the strict '>'
    # comparison of a manual longest-so-far scan.
    return max(ranges, key=lambda r: r[1] - r[0])

def augment(input_path):
    """Build augmented examples by overwriting each pair's shared span with a placeholder stem.

    The file is read with ``read_data`` and every (input, output) word pair
    is aligned with ``align.Aligner``. For each aligned pair in which
    ``find_good_range`` finds a shared span, the span picked by
    ``best_range`` is replaced in both words by ``generate_stem()``. Spans
    longer than five characters are first shrunk by one character at each
    end.

    Args:
        input_path: Path of the data file passed to ``read_data``.

    Returns:
        A tuple ``(new_inputs, new_outputs, new_tags)`` of three lists with
        one entry per aligned pair. A pair without a usable shared span
        contributes an empty list to each of the three.
    """
    # Unpack the tuple directly: wrapping ragged nested lists in a NumPy
    # object array is unnecessary and changes shape when all words happen
    # to have equal length.
    inputs, outputs, tags = read_data(input_path)
    temp = [(''.join(inp), ''.join(out)) for inp, out in zip(inputs, outputs)]
    aligned = align.Aligner(temp).alignedpairs

    new_inputs = []
    new_outputs = []
    new_tags = []
    for k, item in enumerate(aligned):
        # Character lists, so find_good_range can prepend to the output side
        # regardless of whether the aligner yields strings or lists.
        i, o = list(item[0]), list(item[1])
        good_range = find_good_range(i, o)
        if good_range:
            new_i, new_o = list(i), list(o)
            r = best_range(good_range)
            s = r[0]
            e = r[1]
            if (e-s>5): #arbitrary value
                # Keep the first and last character of long shared spans.
                s += 1
                e -= 1
            new_stem = generate_stem()
            # Same slice and same replacement on both sides keeps positions
            # in the two lists in step.
            new_i[s:e] = new_stem
            new_o[s:e] = new_stem
            # Drop whitespace characters (presumably alignment padding --
            # confirm against align.Aligner) unless both words hold a space
            # at that position.
            new_i1 = [c for l,c in enumerate(new_i) if (c.strip() or (new_o[l]==' ' and new_i[l] == ' '))]
            new_o1 = [c for l,c in enumerate(new_o) if (c.strip() or (new_i[l]==' ' and new_o[l] == ' '))]
            new_inputs.append(new_i1)
            new_outputs.append(new_o1)
            new_tags.append(tags[k])
        else:
            new_inputs.append([])
            new_outputs.append([])
            new_tags.append([])
    return new_inputs, new_outputs, new_tags

def find_stems(input_path):
    """Extract the shared span ("stem") from each aligned word pair in a data file.

    The file is read with ``read_data`` and every (input, output) word pair
    is aligned with ``align.Aligner`` (anything the aligner prints is
    captured and discarded). For each aligned pair in which
    ``find_good_range`` finds a shared span, the span picked by
    ``best_range`` is sliced out of the aligned output word. Spans longer
    than five characters are first shrunk by one character at each end.

    Args:
        input_path: Path of the data file passed to ``read_data``.

    Returns:
        A list of stems, one per aligned pair that had a usable shared
        span. Each stem is a slice of the aligned output word, so it has
        the same type as that word.
    """
    # Unpack the tuple directly: wrapping ragged nested lists in a NumPy
    # object array is unnecessary and changes shape when all words happen
    # to have equal length.
    inputs, outputs, tags = read_data(input_path)
    temp = [(''.join(inp), ''.join(out)) for inp, out in zip(inputs, outputs)]

    with io.capture_output():
        aligned = align.Aligner(temp).alignedpairs

    stems = []
    for item in aligned:
        i, o = item[0], item[1]
        # Pass character lists so find_good_range can prepend to the output
        # side regardless of whether the aligner yields strings or lists.
        good_range = find_good_range(list(i), list(o))
        if good_range:
            r = best_range(good_range)
            s = r[0]
            e = r[1]
            if (e-s>5): #arbitrary value
                # Keep the first and last character of long shared spans out
                # of the stem.
                s += 1
                e -= 1
            stems.append(o[s:e])
    return stems

def get_vocab(strings):
    """Return the distinct characters of ``strings`` as a sorted list."""
    vocab = list(get_chars(strings))
    vocab.sort()
    return vocab

def enumerate_sequence_characters(sequences, vocab):
    """Map every character of every sequence to an integer id.

    The character "0" always gets id 0. Every other vocab entry gets the
    next id, starting at 1, in vocab order.

    Args:
        sequences: Iterable of character sequences.
        vocab: Iterable of characters defining the id assignment.

    Returns:
        A NumPy array built from the per-sequence id lists.

    Raises:
        KeyError: If a sequence holds a character that is neither "0" nor
            in ``vocab``.
    """
    lut = {"0": 0}
    non_zero_chars = (ch for ch in vocab if ch != "0")
    for idx, ch in enumerate(non_zero_chars, start=1):
        lut[ch] = idx
    encoded = [[lut[ch] for ch in seq] for seq in sequences]
    return np.array(encoded)

def one_hot_encode_sequence(sequences, vocab):
    """One-hot encode every character of every sequence.

    The character "0" always gets id 0. Every other vocab entry gets the
    next id, starting at 1, in vocab order. Each character becomes a
    vector that is 1 at its id and 0 elsewhere.

    The vector width is the number of ids assigned, not ``len(vocab)``.
    The two are equal when "0" is in ``vocab``. When it is not, ids run up
    to ``len(vocab)``, so sizing by ``len(vocab)`` would make the last
    character's vector one element longer than the others.

    Args:
        sequences: Iterable of character sequences.
        vocab: Iterable of characters defining the id assignment.

    Returns:
        A NumPy array built from the nested one-hot lists.

    Raises:
        KeyError: If a sequence holds a character that is neither "0" nor
            in ``vocab``.
    """
    lut = {"0": 0}
    next_id = 1
    for character in vocab:
        if character != "0":
            lut[character] = next_id
            next_id += 1
    # One slot per assigned id, including the reserved id 0.
    width = next_id

    new_sequences = []
    for seq in sequences:
        encoded = []
        for char in seq:
            vector = [0] * width
            vector[lut[char]] = 1
            encoded.append(vector)
        new_sequences.append(encoded)
    return np.array(new_sequences)

def get_stem_data(language, set_type="train", hilo=None, data_dir="sigmorphon_data", max_stem_length=20):
    """Load the stems of one language's data file as fixed-length character rows.

    The data file is ``{data_dir}/{language}-{set_type}-{hilo}``. Stems are
    extracted with ``find_stems`` and passed to ``pad_sequences``, which
    pads or truncates at the end to ``max_stem_length``.

    Args:
        language: Language name used in the file name.
        set_type: Data split used in the file name.
        hilo: Resource-size suffix. When ``None``, "high" is used if that
            file exists and "low" otherwise.
        data_dir: Directory holding the data files.
        max_stem_length: Length every stem is padded or truncated to.

    Returns:
        The padded stems returned by ``pad_sequences``.
    """
    if hilo is None:
        # A direct existence check avoids comparing a '/'-joined string with
        # glob results, which use the OS separator and treat glob
        # metacharacters in the names specially.
        if os.path.exists(f'{data_dir}/{language}-{set_type}-high'):
            hilo = "high"
        else:
            hilo = "low"
    dpath = f'{data_dir}/{language}-{set_type}-{hilo}'
    padded_stems = sequence.pad_sequences(find_stems(dpath), dtype=str, maxlen=max_stem_length, padding="post", truncating="post")
    return padded_stems

def clean_stems(affirmative_stems, negative_stems):
    """Filter out stems with unwanted characters and lowercase the rest.

    An affirmative stem is dropped if it holds any excluded character. A
    negative stem is dropped if it holds any excluded character or any
    character that never occurs in the (unfiltered) affirmative stems.
    Filtering looks at the original characters; lowercasing is applied
    only to the stems that are kept.

    Args:
        affirmative_stems: Iterable of character sequences.
        negative_stems: Iterable of character sequences.

    Returns:
        A tuple ``(new_affirmative_stems, new_negative_stems)`` of lists of
        lowercase character lists.
    """
    excluded_chars = {",","'","/","*","-","1","2","3","4","5","6","7","8","9"}
    # Set membership keeps the per-character checks constant-time.
    affirmative_vocab = set(get_vocab(affirmative_stems))

    new_negative_stems = [
        [character.lower() for character in stem]
        for stem in negative_stems
        if all(char in affirmative_vocab and char not in excluded_chars for char in stem)
    ]
    new_affirmative_stems = [
        [character.lower() for character in stem]
        for stem in affirmative_stems
        if not any(char in excluded_chars for char in stem)
    ]
    return new_affirmative_stems, new_negative_stems

def create_stem_dataset(reference_language, other_languages, test_size=0.33, random_state=None):
    """Build a one-hot encoded train/test split for telling one language's stems from others'.

    Stems of ``reference_language`` are labelled 1 and stems of all
    ``other_languages`` are labelled 0. Both groups are filtered with
    ``clean_stems`` before they are encoded over their combined character
    vocabulary.

    Args:
        reference_language: Language whose stems form the positive class.
        other_languages: Iterable of languages whose stems form the
            negative class.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``; set it to make the
            split reproducible.

    Returns:
        ``(X_train, X_test, y_train, y_test, combined_vocab)``.
    """
    reference_stems = get_stem_data(reference_language)
    other_stems = np.concatenate([get_stem_data(language) for language in other_languages])
    reference_stems, other_stems = clean_stems(reference_stems, other_stems)
    combined_stems = np.concatenate([reference_stems, other_stems])
    combined_vocab = get_vocab(combined_stems)
    X = one_hot_encode_sequence(combined_stems, combined_vocab)
    # Label order mirrors the concatenation order above.
    labels = np.array([1] * len(reference_stems) + [0] * len(other_stems))
    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, combined_vocab


In [365]:
# Build the stem dataset: English stems are the positive class (label 1),
# Basque and French stems the negative class (label 0). X_vocab is the
# combined character vocabulary used for the one-hot encoding.
X_train, X_test, y_train, y_test, X_vocab = create_stem_dataset("english", ["basque","french"])

In [319]:
class StemGenerator(tf.keras.Model):
  """Character-level sequence model: Embedding -> GRU -> Dense over the vocabulary.

  The embedding has 8 dimensions, the GRU has 100 units and returns both the
  full output sequence and its final state, and the final Dense layer has
  one output per vocabulary entry with no activation.
  """

  def __init__(self, vocab):
    """Create the layers, sized from ``len(vocab)``; ``vocab`` is kept on the instance."""
    # No positional arguments: the base initializer does not take the
    # instance as an explicit argument.
    super().__init__()
    self.vocab = vocab
    vocab_len = len(vocab)
    self.embedding = tf.keras.layers.Embedding(vocab_len, 8)
    self.gru = tf.keras.layers.GRU(100,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_len)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on ``inputs``.

    Args:
      inputs: Input passed to the embedding layer.
      states: Optional initial GRU state; when ``None`` the GRU's own
        initial state is used.
      return_state: When true, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The Dense layer output, or ``(output, states)`` when ``return_state``
      is true.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

def StemDiscriminator():
    """Build the discriminator: Dense(65) -> LSTM(100) -> Dense(1, sigmoid).

    Returns:
        An unbuilt ``keras.Sequential`` model holding those three layers.
    """
    stack = [
        layers.Dense(65),
        layers.LSTM(100),
        layers.Dense(1, activation='sigmoid'),
    ]
    return keras.Sequential(stack)

In [328]:
# Smoke-test the untrained generator on one random input sequence.
generator = StemGenerator(X_vocab)

# The generator's first layer is an Embedding, which looks rows up by
# integer id, so feed random ids in [0, len(X_vocab)) instead of Gaussian
# floats.
noise = tf.random.uniform([1, 100], minval=0, maxval=len(X_vocab), dtype=tf.int32)
generated = generator(noise, training=False)

# Highest-scoring vocabulary index at the first time step.
# NOTE(review): the encoders reserve id 0 for "0", so an index may not line
# up with X_vocab's position for characters sorting before "0" -- confirm.
print(X_vocab[np.argmax(generated[0,0,:])])

/


In [331]:
# Display the character vocabulary returned by create_stem_dataset.
X_vocab

["'é'",
 "'o'",
 "['y'",
 "'a'",
 "'u'",
 "'x'",
 "'m'",
 'G',
 "['b'",
 "'w'",
 '/',
 "['x'",
 "['v'",
 "['g'",
 'r',
 't',
 "['é'",
 '*',
 'i',
 "'t'",
 'L',
 'o',
 'U',
 "['d'",
 'w',
 'M',
 "['q'",
 "'d'",
 's',
 "['p'",
 "['æ'",
 "['a'",
 'm',
 "'ë'",
 'u',
 "'h'",
 'e',
 "['h'",
 "['n'",
 'x',
 'l',
 'j',
 "'k'",
 'ë',
 'z',
 'ö',
 "['k'",
 "['m'",
 '0',
 "['f'",
 'y',
 "'s'",
 "['l'",
 "'v'",
 "'æ'",
 'd',
 'n',
 "['o'",
 'q',
 "'",
 "'r'",
 "['w'",
 "'0']",
 "'g'",
 'v',
 "'q'",
 'œ',
 "'j'",
 'f',
 "['-'",
 "'0'",
 "['z'",
 "'b'",
 'D',
 'B',
 'a',
 "'œ'",
 'c',
 'g',
 'é',
 '1',
 "['i'",
 'k',
 "['j'",
 '["\'"',
 'b',
 "['c'",
 'P',
 "'e'",
 'R',
 "'i'",
 'S',
 "['s'",
 'I',
 "'n'",
 'V',
 "'y'",
 "'l'",
 'F',
 "'f'",
 'C',
 "['r'",
 '-',
 "['u'",
 'Q',
 'A',
 'X',
 "['t'",
 'O',
 "'p'",
 'æ',
 "'-'",
 '9',
 'p',
 "'c'",
 "['e'",
 "'z'",
 'h']