In [33]:
import unidecode

In [34]:
def clean_sample(sample, language):
    """Reduce a text sample to a list of per-word consonant skeletons.

    The sample is split on whitespace; tokens found in the NLTK stop-word
    list for ``language`` are dropped, the remaining tokens are stemmed with
    the NLTK Snowball stemmer, transliterated to ASCII with ``unidecode``,
    lowercased, and finally stripped of vowels, digits, spaces and the
    punctuation characters listed in the regex below.

    Parameters
    ----------
    sample : str
        Text to clean.
    language : str
        Language name passed to both ``SnowballStemmer`` and
        ``stopwords.words`` (e.g. ``"english"``, ``"spanish"``).

    Returns
    -------
    list of str
        One entry per non-stop-word token, in input order. An entry may be
        the empty string when every character of the token was removed
        (e.g. a purely numeric token).
    """
    import re  # regular expressions for stripping characters
    from nltk.stem.snowball import SnowballStemmer  # to stem words
    from nltk.corpus import stopwords  # to remove common words

    # stemmer, stop-word lookup and the pattern of characters to strip
    stemmer = SnowballStemmer(str(language))
    # a set makes the per-token membership test O(1) instead of a list scan
    common_words = set(stopwords.words(language))
    toRemove = re.compile(r"[aeiou0-9,@\?\.$%_/:() ]")

    # Drop stop words, then stem what is left. The comparison is done on the
    # lowercased token so that capitalised stop words ("The") are dropped as
    # well as their lowercase forms.
    # NOTE(review): tokens still carry attached punctuation here ("Nations,"),
    # because punctuation is only stripped after stemming -- confirm whether
    # it should be removed before the stop-word check and stemming instead.
    stems = [
        stemmer.stem(elem)
        for elem in sample.split()
        if elem.lower() not in common_words
    ]

    wordList = []
    for stem in stems:
        # Transliterate BEFORE stripping vowels: otherwise an accented vowel
        # is not matched by the regex and comes back from unidecode as a
        # plain vowel in the result.
        ascii_stem = unidecode.unidecode(stem).lower()
        wordList.append(toRemove.sub("", ascii_stem))

    return wordList

In [35]:
def get_c_dict(sample,language):
    """Count the characters left in ``sample`` after cleaning.

    The sample is passed through ``clean_sample``; the cleaned words are
    concatenated and every remaining character is tallied.

    Returns a dictionary of the form {character: count}, with keys in order
    of first appearance.
    """
    # clean_sample returns a list of words; glue them into a single string
    consonants = "".join(clean_sample(sample, language))

    tally = {}
    for letter in consonants:
        # start from 0 the first time a letter is seen, then add one
        tally[letter] = tally.get(letter, 0) + 1

    return tally

In [36]:
def get_diff_dict(sampleA,sampleB,langA,langB):
    """Compare the character counts of two samples.

    Each sample is counted with ``get_c_dict`` using its own language. For
    every character found in either sample the result holds the absolute
    difference between the two counts; a character missing from one sample
    counts as 0 there, so its difference is simply its count in the other.

    Parameters
    ----------
    sampleA, sampleB : str
        The two texts to compare.
    langA, langB : str
        Language names forwarded to ``get_c_dict`` for the matching sample.

    Returns
    -------
    dict
        {character: absolute count difference}. Keys are emitted in sorted
        order so the result prints identically on every run (iterating a
        bare set of strings would make the order depend on hash
        randomisation).
    """
    c_dictA = get_c_dict(sampleA, langA)
    c_dictB = get_c_dict(sampleB, langB)

    # union of the characters seen in either sample, in a stable order
    all_chars = sorted(set(c_dictA) | set(c_dictB))

    # absolute value so the differences can be summed / normalised later
    return {
        c: abs(c_dictA.get(c, 0) - c_dictB.get(c, 0))
        for c in all_chars
    }

In [37]:
# Current issues: accented characters in the input strings; verify that the regex works (probably should switch to re.sub).
# Not sure a dictionary is the best structure. I used it for simplicity, but if we ever want to iterate over the
# results, a list of lists could be better. I think performance would be similar, but the benefit of a dictionary
# is that you can simply call d["s"] to get the difference for the letter "s".

# Ideas:
# To normalize, we could loop through the dictionary keys to add up the differences, then use some
# factor to get the difference between 0-1, similar to LD normalization
# Demo inputs: the same sentence in English and in Spanish.
str1 = "Guided by the purposes and principles of the Charter of the United Nations, and expressing in particular the need to achieve international cooperation in promoting and encouraging respect for human rights and fundamental freedoms for all without distinction"
str2 = "Guiado por los propósitos y principios de la Carta de las Naciones Unidas, y expresando en particular la necesidad de lograr la cooperación internacional para promover y alentar el respeto de los derechos humanos y las libertades fundamentales para todos sin distinción"

# Per-sample character counts, then the per-character differences;
# sep="\n" puts each heading on its own line above the value.
print("C-dictionary str1:", get_c_dict(str1, "english"), sep="\n")
print("C-dictionary str2", get_c_dict(str2, "spanish"), sep="\n")
print("Differences:", get_diff_dict(str1, str2, "english", "spanish"), sep="\n")

C-dictionary str1:
{'g': 3, 'd': 5, 'p': 9, 'r': 14, 's': 6, 'n': 12, 'c': 8, 'l': 2, 'h': 5, 't': 13, 'x': 1, 'v': 1, 'm': 4, 'f': 2, 'w': 1}
C-dictionary str2
{'g': 2, 'p': 9, 'r': 12, 's': 6, 't': 9, 'n': 14, 'c': 9, 'd': 5, 'x': 1, 'l': 6, 'm': 3, 'v': 1, 'h': 2, 'b': 1, 'f': 1}
Differences:
{'r': 2, 'c': 1, 't': 4, 'x': 0, 'n': 2, 'w': 1, 'm': 1, 'g': 1, 'v': 0, 'd': 0, 'h': 3, 'l': 4, 'b': 1, 'f': 1, 's': 0, 'p': 0}
