# Morpheme Analysis in NLP

In [247]:
import csv
import re
import gc
import string
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
import gensim
import gensim.downloader
import math
import matplotlib.pyplot as plot
from collections import defaultdict, Counter
from morphemes import Morphemes
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU') # idk why m1 needs this (https://stackoverflow.com/q/72441453)
import keras
from keras.utils import Sequence
from keras.models import Sequential, Model
from keras.layers import SimpleRNN, Dense, Activation, Input, LSTM, Embedding

## Encoder-Decoder for Mapping Definitions to Word

This is the structure we will be following:

**Encoder**:
* Input: a definition word encoded as an integer from 0-len(vocabulary), the vocabulary will not contain any stopwords and it will be all lowercased

**Decoder**:
* Output: a sequence of morphemes (from MorphoLex) encoded as an integer from 0-len(morpheme_lexicon), the morphology will NOT include inflectional morphemes

In [248]:
# fetching nltk data and setting up lemmatizer
nltk.download('stopwords')
stops = set(stopwords.words('english'))
# we will also add "'s" to the stops as it appears quite often
stops.add("'s")
lemmatizer = nltk.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/siraire/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [249]:
# for later, let's reserve a start & end character to our morpheme (with embedding 0, 1)
START_MORPHEME = '^'
END_MORPHEME = '$'

In [250]:
# getting the morpheme data
morpholex_words = [] 
morphemes = set()
morphemes_in = {}
with open('./morphemes_files/morphemes.csv') as file:
    reader = csv.reader(file)
    for word, morphemes_of_word in reader:
        morpholex_words.append(word)
        morphemes_of_word = morphemes_of_word.split()
        if morphemes_of_word:
            morphemes.update(morphemes_of_word)
            morphemes_in[word] = [START_MORPHEME] + morphemes_of_word + [END_MORPHEME]

int_to_morpheme = [START_MORPHEME, END_MORPHEME] + [m for m in morphemes] # our morpheme_lexicon!
morpheme_to_int = {m: i for i, m in enumerate(int_to_morpheme)}
NUMBER_MORPHEMES = len(int_to_morpheme)

In [251]:
morphemes_in['biology']

['^', 'bio', 'log', 'y', '$']

In [252]:
# getting the definitions of a word

# we can get definitions of a word like this:
def get_definitions(word: str) -> list[list[str]]:
    '''
    Returns a list of definitions where each definition is tokenized into words.
    '''
    syns = wordnet.synsets(word)
    return [syn.definition() for syn in syns]

def convert_tag(tag: str):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # default https://stackoverflow.com/a/46564234
    
def simplify_definition(definition: str) -> list[str]: # will remove stop words and lemmatize the words of the definition
    '''
    Takes a definition and returns a lemmatized, lowercase, destopped definition.
    '''
    # tokenize the definition and get pos of each word
    dws = nltk.pos_tag(nltk.word_tokenize(definition))
    # filter out stopwords/punctuation/lower-case:
    dws = [(w, t) for w, t in dws if w.lower() not in stops and w.lower() not in string.punctuation]
    # lemmatize with the pos
    dws = [lemmatizer.lemmatize(w, convert_tag(t)) for w, t in dws]
    return dws


morpholex_simplified_definitions = {w: [simplify_definition(d) for d in get_definitions(w)] for w in morpholex_words}

In [253]:
morpholex_simplified_definitions['settlement']

[['body',
  'people',
  'settle',
  'far',
  'home',
  'maintain',
  'tie',
  'homeland',
  'inhabitant',
  'remain',
  'national',
  'home',
  'state',
  'literally',
  'home',
  'state',
  'system',
  'government'],
 ['community', 'people', 'small', 'town'],
 ['conclusive', 'resolution', 'matter', 'disposition'],
 ['act', 'colonize', 'establishment', 'colony'],
 ['something', 'settle', 'resolve', 'outcome', 'decision', 'making'],
 ['area', 'group', 'family', 'live', 'together'],
 ['termination',
  'business',
  'operation',
  'use',
  'asset',
  'discharge',
  'liability']]

In [254]:
# to set up the encoding scheme for the input, let's select the most common words used in definitions
VOCAB_SIZE = 20000
common_words = Counter()
for word_dfs in morpholex_simplified_definitions.values():
    for df in word_dfs:
        common_words.update(df)

vocabulary = [w for w, _ in common_words.most_common()[:VOCAB_SIZE]]
vocabulary_to_int = {w: i for i, w in enumerate(vocabulary)}

In [255]:
morpholex_simplified_definitions["scram"]

[['leave', 'immediately', 'use', 'usually', 'imperative', 'form']]

Instead of encoding words as numbers, we could alternatively use a word2vec or some other form of embedding, however, my earlier tests indicated this to be infeasible to be trained on most machines, leading RAM usage to spike up to the 100GB+. However, there should be no reason the RAM usage to be this high, but I just cannot figure out how to set the model up such that used up numpy arrays/tensors get freed such that the space can be used by others.

In [256]:
# embedding the input and output
def embed_definition(definition: list[str]) -> list[int]:
    return [vocabulary_to_int[dw] for dw in definition if dw in vocabulary_to_int]
def embed_morphology(morphemes: list[str]) -> list[int]:
    return [morpheme_to_int[m] for m in morphemes]
def unembed_definition(embedded_def):
    return [vocabulary[i] for i in embedded_def]
def unembed_morphology(embedded_m):
    return [int_to_morpheme[m] for m in embedded_m]

x_data = [] # all definitions
y_data = [] # list of morphemes
row_labels = [] # parallel to the rows
for word in morpholex_words:
    if word in morpholex_simplified_definitions and morpholex_simplified_definitions[word]:
        for definition in morpholex_simplified_definitions[word]:
            x_data.append(embed_definition(definition))
            y_data.append(embed_morphology(morphemes_in[word]))
            row_labels.append(word)


In [257]:
print(morphemes_in['interdenominational'])
print(embed_morphology(morphemes_in['interdenominational']))


['^', 'inter', 'de', 'nomin', 'ate', 'ion', 'al', '$']
[0, 10758, 3091, 6948, 5843, 1994, 6876, 1]


In [268]:
print(embed_definition('''this is a definition 
see how stop words are removed'''.split(' ')))

[6297, 459, 301, 17389]


In [269]:
vocabulary_to_int['definition']

6297

In [262]:
vocabulary[10692]

'n'

In [146]:
print(len(unembed_morphology(max(y_data, key=len))))
print(len(unembed_definition(max(x_data, key=len)))) # actually the definition of "father"

8
42


In [147]:
# now we can encode our input and output into their integer representations
MAX_MORPHEMES_IN_WORD = 12 # output sequence length
MAX_DEFINITION_LENGTH = 50 # input sequence length
number_samples = len(x_data) # == len(y_data)

In [153]:
NUMBER_MORPHEMES

15705

In [149]:
# let's numpy-ify our data
def copy_2d_list_to_nparray(lst, nparr): 
    '''you must ensure that the np array is large enough to hold 
    all values in the list'''
    for x, row in enumerate(lst):
        for y, val in enumerate(row):
            nparr[x,y] = val
    return nparr


x_data_np = copy_2d_list_to_nparray(
    x_data, 
    np.zeros((number_samples, MAX_DEFINITION_LENGTH)))
y_data_np = copy_2d_list_to_nparray(
    y_data,
    np.zeros((number_samples, MAX_MORPHEMES_IN_WORD)))
y_target_np = np.roll(y_data_np, -1)
y_target_np[:, -1] = 0

In [159]:
input_dim = VOCAB_SIZE
output_dim = NUMBER_MORPHEMES
input_seq_len = MAX_DEFINITION_LENGTH
output_seq_len = MAX_MORPHEMES_IN_WORD

latent_dim = 256
embedding_dim = 64

encoder_inputs = Input(shape=(input_seq_len,))
decoder_inputs = Input(shape=(output_seq_len,))

encoder_embedding = Embedding(input_dim, embedding_dim)
decoder_embedding = Embedding(output_dim, embedding_dim)

encoder_lstm = LSTM(latent_dim, return_state=True)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [state_h, state_c] # since the decoder gets the states from the encoder and ignores `encoder_outputs`
decoder_outputs, _, _ = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=encoder_states) 

decoder_dense = Dense(output_dim, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# model set up
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 50)]         0           []                               
                                                                                                  
 input_14 (InputLayer)          [(None, 12)]         0           []                               
                                                                                                  
 embedding_12 (Embedding)       (None, 50, 64)       1280000     ['input_13[0][0]']               
                                                                                                  
 embedding_13 (Embedding)       (None, 12, 64)       1005120     ['input_14[0][0]']               
                                                                                            

In [162]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit([x_data_np, y_data_np], y_target_np,
          batch_size=32,
          epochs=20,
          validation_split=.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x34636db10>

In [163]:
model.save(f"./models/DefintionsSeq2Seq-batch32")



INFO:tensorflow:Assets written to: DefintionsSeq2Seq-batch32/assets


INFO:tensorflow:Assets written to: DefintionsSeq2Seq-batch32/assets


In [None]:
# loading model in
model = keras.models.load_model('./models/DefinitionsSeq2Seq-batch32')

In [171]:
model.layers[4].output

[<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm_12')>,
 <KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm_12')>,
 <KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm_12')>]

In [172]:
# extracting separate encoder/decoder models from loaded in model (make sure to run the model definer first)

# # fetching encoder
# encoder_model = keras.Model(encoder_inputs, encoder_states)

# # fetching decoder
# decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Define the inference encoder model, which takes in definition input and returns encoder states
encoder_model = Model(encoder_inputs, encoder_states)

# Define the inference decoder model, which takes in decoder input and encoder states and returns predicted morphemes
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)



In [272]:
def predict_word(definition: string):
    dws = simplify_definition(definition)
    dws = embed_definition(dws)
    dws = copy_2d_list_to_nparray([dws], np.zeros((1, MAX_DEFINITION_LENGTH)))
    states_value = encoder_model.predict(dws, verbose=0)
    target_seq = np.zeros((1,1)) # 0 should be the start morpheme, but just in case:
    target_seq[0,0] = morpheme_to_int[START_MORPHEME]

    decoded_morphemes = []
    while True:
        # Generate output probabilities and updated decoder states for the current target sequence and encoder states
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample an output token based on the probabilities
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token_prob = output_tokens[0, -1, sampled_token_index]
        
        # Stop if we have reached the end token or have generated the maximum number of allowed morphemes
        if (sampled_token_index == morpheme_to_int[END_MORPHEME]) or (len(decoded_morphemes) >= MAX_MORPHEMES_IN_WORD):
            break

        # Append the decoded morpheme to the list and update the target sequence and decoder states
        decoded_morphemes.append((int_to_morpheme[sampled_token_index], sampled_token_prob))
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_morphemes

print(predict_word('a form of entertainment broadcast on television'))
print(predict_word('instructions for solving a problem'))
print(predict_word('life sized computer that is sentient'))
print(predict_word('a life threatening medical procedure on the heart performed by doctors'))
print(predict_word('relationships between countries on different planets or in space'))
print(predict_word('this is a definition see how stop words are removed'))
    

[('video', 0.7771693)]
[('solut', 0.33024442), ('ion', 0.9999695)]
[('esteem', 0.17441608), ('ate', 0.85529035)]
[('surg', 0.3888536), ('ory', 0.9905764)]
[('differ', 0.19271624), ('ant', 0.93125033), ('ate', 0.9501478)]
[('plank', 0.4771453)]
