In [26]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from read_tab_files import TabFileReader

In [33]:
barb_forms = TabFileReader.tab_reader(
    "chl2024_barbacoandata/chl2023_barbacoan_forms.tab"
)

word_list = TabFileReader.get_word_list(barb_forms)

# convert words to character sequences
tokens = [list(word) for word in word_list]

# flatten the list of chars to fit LabelEncoder
flat_tokens = [item for sublist in tokens for item in sublist]

# encode characters as integers
encoder = LabelEncoder()
encoder.fit(flat_tokens)
encoded_tokens = [encoder.transform(word) for word in tokens]

# specify sequence length (how long each seq should be interpreted as)
sequence_length = 4

X = []
y = []

# go through each word and get every sequence with the seq len (hello > hel, ell, llo if seq_len = 3)
for seq in encoded_tokens:
    for i in range(sequence_length, len(seq)):
        X.append(seq[i - sequence_length : i])
        y.append(seq[i])


# prep data for training
X = np.array(X)
# reshape X to fit the LSTM input [samples, time steps, features]
X = np.reshape(X, (X.shape[0], X.shape[1], 1))
y = to_categorical(y)

In [34]:
model = Sequential(
    [
        # memory units (50 is good starting num), seq_len, step size (1)
        LSTM(100, input_shape=(X.shape[1], X.shape[2])),
        Dropout(0.2),
        # num unique chars
        Dense(y.shape[1], activation="softmax"),
    ]
)

model.compile(loss="categorical_crossentropy", optimizer="adam")
model.fit(X, y, epochs=200, verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x16bb2e550>

In [35]:
def generate_word(model, input_text, extend_length, encoder):
    # pad text if its length is less than seq_len
    if len(input_text) < sequence_length:
        input_text = " " * (sequence_length - len(input_text)) + input_text

    for _ in range(extend_length):
        encoded = encoder.transform(list(input_text[-sequence_length:]))
        encoded = np.reshape(encoded, (1, sequence_length, 1))

        # predict next character
        pred = model.predict(encoded, verbose=0)
        next_char = encoder.inverse_transform([np.argmax(pred)])
        input_text += next_char[0]

    return input_text

In [38]:
cognates = ["pi", "chu", "mɨ", "lul"]
lengths = list(range(1, 6))

for cognate in cognates:
    print("Cognate:", cognate)
    for length in lengths:
        new_word = generate_word(model, cognate, length, encoder)
        print("  Extra Length:", length)
        print("  Generated Word:", new_word)
    print()

Cognate: pi
  Extra Length: 1
  Generated Word:   pin
  Extra Length: 2
  Generated Word:   pina
  Extra Length: 3
  Generated Word:   pinan
  Extra Length: 4
  Generated Word:   pinanp
  Extra Length: 5
  Generated Word:   pinanpa

Cognate: chu
  Extra Length: 1
  Generated Word:  chun
  Extra Length: 2
  Generated Word:  chunk
  Extra Length: 3
  Generated Word:  chunka
  Extra Length: 4
  Generated Word:  chunka 
  Extra Length: 5
  Generated Word:  chunka m

Cognate: mɨ
  Extra Length: 1
  Generated Word:   mɨp
  Extra Length: 2
  Generated Word:   mɨpɨ
  Extra Length: 3
  Generated Word:   mɨpɨp
  Extra Length: 4
  Generated Word:   mɨpɨpɨ
  Extra Length: 5
  Generated Word:   mɨpɨpɨp

Cognate: lul
  Extra Length: 1
  Generated Word:  lulɨ
  Extra Length: 2
  Generated Word:  lulɨp
  Extra Length: 3
  Generated Word:  lulɨpɨ
  Extra Length: 4
  Generated Word:  lulɨpɨp
  Extra Length: 5
  Generated Word:  lulɨpɨpɨ



In [31]:
new_word = generate_word(model, "pi", 2, encoder)
print("Generated word:", new_word)

Generated word: pika
Generated word: chuhka
Generated word: mɨm
Generated word: lulan
