In [16]:
# function to calculate l-distance between two strings
# accepts two string parameters: 
def ldistance(strA, strB):
    import numpy as np # to create matrix
    
    #setting up matrix
    size_x = len(strA) + 1
    size_y = len(strB) + 1
    matrix = np.zeros((size_x, size_y))
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y
    
    # prepopulation
    print(matrix)
    
    #populating matrix
    
    # start at 1 so don't hit zero
    for x in range(1, size_x):
        for y in range(1, size_y):
            # then must decrease by 1 because of that
            if strA[x-1] == strB[y-1]: #elements are equal
                matrix[x][y] = matrix[x-1][y-1]
            else: 
                # the three Levenshtein conditions for min
                # since all are adding one at end, can check prior to addition
                a = matrix[x][y-1]
                b = matrix[x-1][y]
                c = matrix[x-1][y-1]
                # a is min
                if (a <= b and a <= c):
                    matrix[x][y] = a + 1
                # b is min
                elif (b <= a and b <= c):
                    matrix[x][y] = b + 1
                # c is min
                else:
                    matrix[x][y] = c + 1
    # postpopulation
    print(matrix)
                    
    # bottom-right element is l-distance
    distance = matrix[-1][-1]
    
    # distance
    print(distance)
    
    # taking into consideration the length of the words
    # normalize based on longest word
    length = 0
    if ((len(strA) > len(strB)) or (len(strA) == len(strB))):
        length = len(strA)
    else:
        length = len(strB)
        
    lscore = ((length - distance) / length) 
    
    return lscore

In [17]:
ldistance("hello","smellow")

[[0. 1. 2. 3. 4. 5. 6. 7.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [2. 0. 0. 0. 0. 0. 0. 0.]
 [3. 0. 0. 0. 0. 0. 0. 0.]
 [4. 0. 0. 0. 0. 0. 0. 0.]
 [5. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 1. 2. 3. 4. 5. 6. 7.]
 [1. 1. 2. 3. 4. 5. 6. 7.]
 [2. 2. 2. 2. 3. 4. 5. 6.]
 [3. 3. 3. 3. 2. 3. 4. 5.]
 [4. 4. 4. 4. 3. 2. 3. 4.]
 [5. 5. 5. 5. 4. 3. 2. 3.]]
3.0


0.5714285714285714

In [2]:
# function to remove vowels, common words, numbers, and punctuation, as well as stem the words
# accepts 2 parameters: string for sample and string for language
def clean_sample(sample, language):
    import nltk 
    import re # to use in regular expressions
    from nltk.stem.snowball import SnowballStemmer # to stem words, doesn't work well
    from nltk.corpus import stopwords # to remove common words
    
    # creating objects for stemmer, common words and key for punctuation/numbers to be removed
    stemmer = SnowballStemmer(str(language))
    common_words = stopwords.words(language)
    punct = re.compile(r"[,@\'?\.$%_/:()]")
    numbers = re.compile(r"[0-9]")

    # separates sentence into elements and stores in elemList
    sentences=[]
    elemList=sample.split()

    #removes punctuation, numbers, common words and stores in wordList
    wordList=[]
    for i in range(len(elemList)):
        s = elemList[i]
        find = re.search(punct, s)
        findNum = re.search(numbers, s)
        if(find == None and findNum == None):
            if s not in common_words:
                wordList.append(s)

    # stems words and appends them to list to be returned
    stems=[]
    for word in wordList:
        w = stemmer.stem(word)
        stems.append(w)
        
    return stems

In [3]:
# removes vowels
# parameter: list of words from sample
def remove_vowels(wordList):
    import re # to use in regular expressions
    vowels = re.compile(r"a|e|i|o|u") # key to find vowels - differs between languages
    
    # omitting vowels from words
    for x in range(len(wordList)):
        w = ""
        for i in range(len(wordList[x])):
            find = re.search(vowels, wordList[x][i:i+1])
            if find == None:
                w += wordList[x][i:i+1]
        wordList[x] = w
        
    return wordList

In [4]:
# function that lines up pairs of words between the two samples, and calculates their l-distance
# parameters: 2 lists of words from each sample, 1 list of manually-created alignment between the two lists
def measure_samples(list1, list2, alignment):
    distances=[]
    
    # aligning pairs
    for i in range(len(alignment)):
        idx1 = alignment[i][0]
        idx2 = alignment[i][1]
        str1 = list1[idx1]
        str2 = list2[idx2]
        print(str1 + "-" + str2)
        l = ldistance(str(str1), str(str2))
        distances.append(l)
        
    return distances

In [5]:
# function to concatenate all consonants in the sample
# parameters: a list of all words in a sample
def concat_all(wordList):
    fullString=""
    for word in wordList:
        fullString += word
        
    return fullString

In [6]:
import nltk
from nltk.translate import Alignment
engSample = "Guided by the purposes and principles of the Charter of the United Nations, and expressing in particular the need to achieve international cooperation in promoting and encouraging respect for human rights and fundamental freedoms for all without distinction"
spaSample = "Guiado por los propósitos y principios de la Carta de las Naciones Unidas, y expresando en particular la necesidad de lograr la cooperación internacional para promover y alentar el respeto de los derechos humanos y las libertades fundamentales para todos sin distinción"

#Removing vowels, punctuation, stemming
engWords = clean_sample(engSample, "english")
spaWords = clean_sample(spaSample, "spanish")
engWords = remove_vowels(engWords)
spaWords = remove_vowels(spaWords)

#Manually transcribing sentence alignment to compare word-to-word
print("Spa-Eng Pairs\n")
eng_spa_align = [(0,0), (1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8), (9, 10), (10, 9), (11,11), (12, 12), (13, 13), (14, 15), (15, 14), (16, 17), (17, 16), (19, 18)]


#Passing lists of consonant-only words in both languages and their alignment to Ldistnance method
measures = measure_samples(engWords, spaWords, eng_spa_align)
print("\n")
print("English consonants\n")
print(engWords)
print("\n")
print("Spanish consonants\n")
print(spaWords)
print("\n")
print("L-Distance measures per pair of aligned words\n")
print(measures)
print("\n")

print("Average")
avg = 0
for i in range(len(measures)):
    avg = avg + measures[i]
avg/len(measures)

Spa-Eng Pairs

gd-g
prps-prpst
prncpl-prncp
chrtr-crt
nt-ncn
xprss-xprs
prtclr-prtcl
nd-ncs
chv-lgr
ntrn-ntrncnl
cpr-cpr
prmt-prmv
ncrg-lnt
rspct-rspt
hmn-hmn
rght-drch
fndmnt-fndmntl
frdm-lbrtd
dstnct-dstncn


English consonants

['gd', 'prps', 'prncpl', 'chrtr', 'nt', 'xprss', 'prtclr', 'nd', 'chv', 'ntrn', 'cpr', 'prmt', 'ncrg', 'rspct', 'hmn', 'rght', 'fndmnt', 'frdm', 'wtht', 'dstnct']


Spanish consonants

['g', 'prpst', 'prncp', 'crt', 'ncn', 'xprs', 'prtcl', 'ncs', 'lgr', 'cpr', 'ntrncnl', 'prmv', 'lnt', 'rspt', 'drch', 'hmn', 'lbrtd', 'fndmntl', 'dstncn']


L-Distance measures per pair of aligned words

[0.5, 0.8, 0.8333333333333334, 0.6, 0.3333333333333333, 0.8, 0.8333333333333334, 0.3333333333333333, 0.0, 0.5714285714285714, 1.0, 0.75, 0.0, 0.8, 1.0, 0.25, 0.8571428571428571, 0.2, 0.8333333333333334]


Average


0.5944862155388472

In [7]:
import nltk
from nltk.translate import Alignment
engSample = "Guided by the purposes and principles of the Charter of the United Nations, and expressing in particular the need to achieve international cooperation in promoting and encouraging respect for human rights and fundamental freedoms for all without distinction"
spaSample = "Guiado por los propósitos y principios de la Carta de las Naciones Unidas, y expresando en particular la necesidad de lograr la cooperación internacional para promover y alentar el respeto de los derechos humanos y las libertades fundamentales para todos sin distinción"
engWords = clean_sample(engSample, "english")
spaWords = clean_sample(spaSample, "spanish")
engWords = remove_vowels(engWords)
spaWords = remove_vowels(spaWords)

print("Concatenation of all English consonants")

print(concat_all(engWords))
print("\n")
print("Concatenation of all Spanish consonants")
print(concat_all(spaWords))
print("\n")
print("Distance between the two concatenations")
ldistance(engWords, spaWords)

Concatenation of all English consonants
gdprpsprncplchrtrntxprssprtclrndchvntrncprprmtncrgrspcthmnrghtfndmntfrdmwthtdstnct


Concatenation of all Spanish consonants
gprpstprncpcrtncnxprsprtclncslgrcprntrncnlprmvlntrsptdrchhmnlbrtdfndmntldstncn


Distance between the two concatenations


0.05