In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from transformers import AutoTokenizer

In [50]:
writ_data = []
phon_data = []
filepath = '/content/drive/MyDrive/phonemized_list_shuffled_no_cedilla'
with open(filepath, 'r') as f0:
  for line in f0:
    parts = line.split()
    #written form of words
    writ_singular = parts[0]
    writ_plural = parts[1]
    writ_data.append((writ_singular, writ_plural))
    #phonemized form of words
    phon_singular = parts[2]
    phon_plural = parts[3]
    phon_data.append((phon_singular, phon_plural))

In [51]:
#randomly selecting writ_data to create a subset
writ_data = random.sample(writ_data, 8500)

spellings = set(' '.join([writ_plural for _, writ_plural in writ_data]))
spellings.add('post')
spelling_to_idx = {spelling: idx for idx, spelling in enumerate(spellings)}

def writ_encode_sequences(writ_sequences, spelling_to_idx):
    writ_encoded_seqs = []
    for writ_sequence in writ_sequences:
        writ_encoded_seq = [spelling_to_idx[spelling] for spelling in writ_sequence]
        writ_encoded_seqs.append(writ_encoded_seq)
    return writ_encoded_seqs

In [52]:
#randomly selecting phon_data to create a subset
phon_data = random.sample(phon_data, 8500)

phonemes = set(' '.join([phon_plural for _, phon_plural in phon_data]))
phonemes.add('post')
phoneme_to_idx = {phoneme: idx for idx, phoneme in enumerate(phonemes)}

def phon_encode_sequences(phon_sequences, phoneme_to_idx):
    phon_encoded_seqs = []
    for phon_sequence in phon_sequences:
        phon_encoded_seq = [phoneme_to_idx[phoneme] for phoneme in phon_sequence]
        phon_encoded_seqs.append(phon_encoded_seq)
    return phon_encoded_seqs

In [53]:
#making writ_train and writ_test datasets
writ_train_data, writ_test_data = train_test_split(writ_data, train_size=0.8, test_size=0.2, shuffle=True)

#and seperate for singular/plural
writ_trainData_singular = [writ_singular for writ_singular, _ in writ_train_data]
writ_trainData_plural = [writ_plural for _, writ_plural in writ_train_data]
writ_testData_singular = [writ_singular for writ_singular, _ in writ_test_data]
writ_testData_plural = [writ_plural for _, writ_plural in writ_test_data]

writ_num_class = len(np.unique(writ_testData_plural)) + 2
writ_train_max_seq_len = max(len(seq) for seq in writ_trainData_singular + writ_trainData_plural)
writ_test_max_seq_len = max(len(seq) for seq in writ_testData_singular + writ_testData_plural)

In [54]:
#making phon_train and phon_test datasets
phon_train_data, phon_test_data = train_test_split(phon_data, train_size=0.8, test_size=0.2, shuffle=True)

#and seperate for singular/plural
phon_trainData_singular = [phon_singular for phon_singular, _ in phon_train_data]
phon_trainData_plural = [phon_plural for _, phon_plural in phon_train_data]
phon_testData_singular = [phon_singular for phon_singular, _ in phon_test_data]
phon_testData_plural = [phon_plural for _, phon_plural in phon_test_data]

phon_num_class = len(np.unique(phon_testData_plural)) + 2
phon_train_max_seq_len = max(len(seq) for seq in phon_trainData_singular + phon_trainData_plural)
phon_test_max_seq_len = max(len(seq) for seq in phon_testData_singular + phon_testData_plural)

In [55]:
#padding process for writ_datasets
writ_trainData_singular = pad_sequences(writ_encode_sequences(writ_trainData_singular, spelling_to_idx),
                                   maxlen = writ_train_max_seq_len, padding = 'post')
writ_trainData_plural = pad_sequences(writ_encode_sequences(writ_trainData_plural, spelling_to_idx),
                                   maxlen = writ_train_max_seq_len, padding = 'post')
writ_trainData_plural_encoded = to_categorical(writ_trainData_plural, writ_num_class)

writ_testData_singular = pad_sequences(writ_encode_sequences(writ_testData_singular, spelling_to_idx),
                                   maxlen = writ_test_max_seq_len, padding = 'post')
writ_testData_plural = pad_sequences(writ_encode_sequences(writ_testData_plural, spelling_to_idx),
                                   maxlen = writ_test_max_seq_len, padding = 'post')
writ_testData_plural_encoded = to_categorical(writ_testData_plural, writ_num_class)

In [56]:
#padding process for phon_datasets
phon_trainData_singular = pad_sequences(phon_encode_sequences(phon_trainData_singular, phoneme_to_idx),
                                   maxlen = phon_train_max_seq_len, padding = 'post')
phon_trainData_plural = pad_sequences(phon_encode_sequences(phon_trainData_plural, phoneme_to_idx),
                                   maxlen = phon_train_max_seq_len, padding = 'post')
phon_trainData_plural_encoded = to_categorical(phon_trainData_plural, phon_num_class)

phon_testData_singular = pad_sequences(phon_encode_sequences(phon_testData_singular, phoneme_to_idx),
                                   maxlen = phon_test_max_seq_len, padding = 'post')
phon_testData_plural = pad_sequences(phon_encode_sequences(phon_testData_plural, phoneme_to_idx),
                                   maxlen = phon_test_max_seq_len, padding = 'post')
phon_testData_plural_encoded = to_categorical(phon_testData_plural, phon_num_class)

In [57]:
#creating model for writ_datasets
writ_model = Sequential()
writ_embedding_dim = 50
writ_input_dim = len(spellings)
writ_model.add(Embedding(writ_input_dim, writ_embedding_dim))
writ_model.add(Dense(128, activation='relu'))
writ_model.add(Dense(writ_num_class, activation='softmax'))
writ_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [58]:
#creating model for phon_datasets
phon_model = Sequential()
phon_embedding_dim = 50
phon_input_dim = len(phonemes)
phon_model.add(Embedding(phon_input_dim, phon_embedding_dim))
phon_model.add(Dense(128, activation='relu'))
phon_model.add(Dense(phon_num_class, activation='softmax'))
phon_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
writ_model.fit(writ_trainData_singular, writ_trainData_plural_encoded, epochs=10, batch_size=32, validation_split=0.2)

#writ_trainData
writ_train_loss, writ_trainData_accuracy = writ_model.evaluate(writ_trainData_singular, writ_trainData_plural_encoded)
print(f"writ_trainData Accuracy: {writ_trainData_accuracy}")

Epoch 1/10
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.8714 - loss: 3.6009

In [None]:
#writ_testData
writ_test_loss, writ_testData_accuracy = writ_model.evaluate(writ_testData_singular, writ_testData_plural_encoded)
print(f"writ_testData Accuracy: {writ_testData_accuracy}")

In [None]:
phon_model.fit(phon_trainData_singular, phon_trainData_plural_encoded, epochs=10, batch_size=32, validation_split=0.2)

#phon_trainData
phon_train_loss, phon_trainData_accuracy = phon_model.evaluate(phon_trainData_singular, phon_trainData_plural_encoded)
print(f"phon_trainData Accuracy: {phon_trainData_accuracy}")

In [None]:
#phon_testData
phon_test_loss, phon_testData_accuracy = phon_model.evaluate(phon_testData_singular, phon_testData_plural_encoded)
print(f"phon_testData Accuracy: {phon_testData_accuracy}")