In [None]:
import nltk
import spacy
import Levenshtein

"""
This program uses a "length method" in an attempt to lemmatize derivational changes of words, such as "apply" and "application".
Traditional NLP lemmatizers, such as those in NLTK and SpaCy, only focus on inflectional changes like "-s" or "-ed",
which are only temporary grammatical changes.
However, if we want to treat "apply" and the derived "application" as the same word, since they share the same origin, are related in meaning,
and differ only in part of speech, we need to explore alternative methods.
For general users, derivational lemmatization can help them acquire a wider range of related lexical information simultaneously,
which can be convenient when searching for information.
In this program, an alternative method that considers word-part length parameters and word-formation structures is introduced.
Prefixes usually carry meaning, like the negation expressed by "un-" or "dis-".
Hence, it is not desirable to over-lemmatize "unhappy" to "happy", because this would cause inconvenience in real applications.
"""

# User-defined set of desired derivational lemmas (personalized).
# The lemmatizer functions in this file stop as soon as they reach a word in this set.
target = {"apply"}
# Word frequency distribution built from the Brown corpus; its entries are the
# candidate lemmas scanned by my_lemmatizer (the Brown corpus data must be available to NLTK).
# Using a specific, large corpus can give more accurate results when looking for the most suitable candidate.
freqlist = nltk.FreqDist(nltk.corpus.brown.words())

# spaCy pipeline used for inflectional lemmatization in inf_lemmatize.
nlp = spacy.load('en_core_web_sm')

# The basic SpaCy lemmatizer handling inflectional changes
def inf_lemmatize(word: str) -> str:
    """Return the inflectional lemma of ``word`` as produced by spaCy.

    The input is stripped and lowercased, run through the module-level
    ``nlp`` pipeline, and the lemma of the first resulting token is returned.

    Args:
        word: The word to lemmatize. If the pipeline splits it into several
            tokens, only the first token is used.

    Returns:
        The lemma of the first token, or the normalized input itself when the
        pipeline yields no tokens (e.g. for an empty string), so the function
        always returns a string instead of implicitly returning ``None``.
    """
    word = word.strip().lower()
    doc = nlp(word)

    # Take the first token's lemma; fall back to the normalized word when the
    # document contains no tokens at all.
    return next((token.lemma_ for token in doc), word)

def my_lemmatizer(word: str) -> str:
    """Perform one derivational lemmatization step with the "length method".

    The last whitespace-separated token of ``word`` is lowercased and first
    lemmatized inflectionally with ``inf_lemmatize``. If that lemma is in
    ``target`` it is returned directly. Otherwise the entries of ``freqlist``
    are scanned in iteration order, and the first entry that is shorter than
    the token, shares its leading "root" characters and lies within the
    allowed Levenshtein distance is returned.

    Args:
        word: A word or phrase; only its last token is considered.

    Returns:
        The lemma or candidate that was found, or ``None`` when ``word`` is
        empty, contains only whitespace, or has no matching candidate.
    """
    # Check word string validity.
    # split() without arguments discards all whitespace, so both an empty and
    # a whitespace-only string yield no tokens; indexing [-1] on that empty
    # list would raise IndexError, hence the explicit guard.
    tokens = word.split() if word else []
    if not tokens:
        return None

    # Take the last word for lemmatization (tokens carry no surrounding
    # whitespace, so no further strip() is needed).
    word = tokens[-1].lower()

    # Use SpaCy to inflectionally lemmatize first
    spacy_lemma = inf_lemmatize(word)
    if spacy_lemma in target:
        return spacy_lemma

    # According to the BNC retrieved from Sketch Engine, the average length of
    # the top 1000 most frequent nouns, verbs and adjectives is 6.403.
    # Those 3000 words already include affixes, so the "lemma root" should be
    # shorter than 6 characters.
    # Therefore an average root length of 3 (6/2) is used for short words, and
    # len(word)//2 is used for long words to avoid over-lemmatizing them.
    average_root_length = max(3, len(word) // 2)

    # Number of characters treated as the suffix; also the maximum edit
    # distance allowed between the word and a candidate.
    suffix_part = len(word) - average_root_length

    # NOTE(review): the first matching entry wins, so the result depends on
    # the iteration order of freqlist - confirm whether scanning by descending
    # frequency is what is intended.
    for candidate in freqlist:
        candidate = candidate.lower()

        # An "underived" lemma should usually be shorter than its derivative.
        if len(candidate) >= len(word):
            continue

        # Keep the "non-suffix" part to check how similar the word and the
        # lemma candidate are. When the word starts with this part, the two
        # may form a derivation pair (e.g. "apply" and "application" both
        # start with "app").
        sliced = candidate[:average_root_length]
        if not word.startswith(sliced):
            continue

        # A small Levenshtein distance ensures the pair differ only in their
        # suffixes. Levenshtein distance is chosen for its flexibility
        # (+1 for each insertion, deletion or substitution).
        if Levenshtein.distance(word, candidate) <= suffix_part:
            return candidate

    # No suitable candidate was found.
    return None

# Repeatedly lemmatize until a lemma in the target set is found.
# application -> applied -> apply
# A personalized target list is preferred so that the strictness can be adjusted (apply -> app may not be a desired result)
def my_lemmatizer_recursion(word: str) -> str:
    """Apply ``my_lemmatizer`` repeatedly until a lemma in ``target`` appears.

    Each pass feeds the previous result back into ``my_lemmatizer``
    (e.g. application -> applied -> apply).

    Args:
        word: The word to reduce.

    Returns:
        The first result that is a member of ``target``, or ``None`` as soon
        as ``my_lemmatizer`` returns a falsy value.
    """
    current = word
    while True:
        current = my_lemmatizer(current)

        # Dead end: nothing further can be derived from the previous word.
        if not current:
            return None

        # Stop once a desired lemma has been reached.
        if current in target:
            return current

# Demo: reduce "application" to a lemma from the target set and print the result.
print(my_lemmatizer_recursion("application"))



apply
