In [14]:
import unidecode

In [15]:
def clean_sample(sample, language):
    """Reduce a text sample to a list of stemmed, consonant-only tokens.

    The sample is split on whitespace, stop words for ``language`` are
    dropped, and every remaining token is stemmed, transliterated to ASCII
    with ``unidecode`` and finally stripped of vowels, digits, spaces and the
    punctuation listed in ``toRemove``.

    Args:
        sample: Text to clean.
        language: Language name passed to both ``SnowballStemmer`` and
            ``stopwords.words`` (for example ``"english"`` or ``"french"``).

    Returns:
        A list with one string per kept token, in input order. An entry is
        the empty string when nothing of the token survives the stripping.
    """
    import re # to use in regular expressions
    from nltk.stem.snowball import SnowballStemmer # to stem words, doesn't work well
    from nltk.corpus import stopwords # to remove common words

    # creating objects for stemmer, common words and key for punctuation/numbers to be removed
    stemmer = SnowballStemmer(str(language))
    # A set makes the per-token membership test constant-time.
    common_words = set(stopwords.words(language))
    toRemove = re.compile(r"[aeiou0-9,@\?\.$%_/:() ]")

    wordList = []
    for elem in sample.split():
        # Compare in lower case so capitalised stop words ("The") are dropped as well.
        if elem.lower() in common_words:
            continue

        # Stem first, while the token still has its original accents.
        stem = stemmer.stem(elem)

        # Transliterate BEFORE stripping: otherwise an accented vowel such as
        # "é" slips past the vowel filter and comes back as a plain "e".
        ascii_stem = unidecode.unidecode(stem).lower()
        wordList.append(toRemove.sub("", ascii_stem))

    return wordList

In [16]:
def only_consonants(aStr):
    """Strip every whitespace-separated word of ``aStr`` down to its consonants.

    The text is transliterated to ASCII with ``unidecode``, lower-cased and
    split on whitespace. Every character that is not an ASCII letter other
    than a, e, i, o, u is then removed from each word, so vowels, digits and
    all punctuation disappear ("y" is kept).

    Args:
        aStr: Text to process.

    Returns:
        A list with one string per word, in input order. An entry is the
        empty string when the word contains no consonants.
    """
    import re # to use in regular expressions

    # Keep-list instead of a remove-list: enumerating unwanted characters let
    # anything not listed (hyphens, semicolons, quotes, brackets, ...) leak
    # through and be counted as if it were a consonant.
    notConsonant = re.compile(r"[^b-df-hj-np-tv-z]")

    # Lower-case after transliterating, since unidecode may produce
    # upper-case letters for some inputs.
    words = unidecode.unidecode(aStr).lower().split()

    return [notConsonant.sub("", word) for word in words]

In [17]:
def get_c_dict(sample):
    """Tally how often each character left by ``only_consonants`` occurs in ``sample``.

    No stemming is involved; the sample is only stripped, not reduced to stems.

    Returns:
        A dictionary of the form {character: count}, with keys in the order
        in which the characters are first encountered.
    """
    # only_consonants yields one stripped string per word; glue them together
    # so the tally runs over a single sequence of characters.
    letters = "".join(only_consonants(sample))

    tally = {}
    for letter in letters:
        tally[letter] = tally.get(letter, 0) + 1

    return tally

In [18]:
def get_diff_dict(sampleA,sampleB):
    """Compare the consonant tallies of two samples letter by letter.

    Returns:
        A dictionary mapping every letter found in either sample to the
        absolute difference between its two counts.
    """
    countsA = get_c_dict(sampleA)
    countsB = get_c_dict(sampleB)

    # A letter missing from one sample counts as zero there, so its
    # difference is simply its count in the other sample.
    return {
        letter: abs(countsA.get(letter, 0) - countsB.get(letter, 0))
        for letter in set(countsA) | set(countsB)
    }

In [19]:
def alpha_sort_dict(aDict):
    """Return a new dictionary holding the items of ``aDict`` in sorted key order."""
    return {key: aDict[key] for key in sorted(aDict)}

In [20]:
# English sample text used to exercise the consonant-counting helpers.
# It reads like the preamble of a UN General Assembly resolution (source not recorded here).
eng_doc = """The General Assembly,

Recalling that, in accordance with Article 56 of the Charter of the United Nations, all Member States have pledged themselves to take joint and separate action in cooperation with the Organization for the achievement of the purposes set forth in Article 55, including universal respect for and observance of, human rights and fundamental freedoms for all without distinction as to race, sex, language or religion,

Recalling also the Preamble to the Charter, in particular the determination to reaffirm faith in fundamental human rights, in the dignity and worth of the human person and in the equal rights of men and women and of nations large and small,

Reaffirming that the promotion and protection of all human rights and fundamental freedoms must be considered a priority objective of the United Nations in accordance with its purposes and principles, in particular the purpose of international cooperation, and that, within the framework of these purposes and principles, the promotion and protection of all human rights are a legitimate concern of the international community,

Considering the major changes taking place on the international scene and the aspirations of all peoples to an international order based on the principles enshrined in the Charter, including promoting and encouraging respect for human rights and fundamental freedoms for all and respect for the principle of equal rights and self-determination of peoples, peace, democracy, justice, equality, rule of law, pluralism, development, better standards of living and solidarity,

Recognizing that the international community should devise ways and means to remove current obstacles and meet the challenges to the full realization of all human rights and to prevent the continuation of human rights violations resulting therefrom throughout the world, as well as continue to pay attention to the importance of mutual cooperation, understanding and dialogue in ensuring the promotion and protection of all human rights,

Reaffirming that the enhancement of international cooperation in the field of human rights is essential for the full achievement of the purposes of the United Nations and that human rights and fundamental freedoms are the birthright of all human beings, the protection and promotion of such rights and freedoms being the first responsibility of Governments,

Reaffirming also that all human rights are universal, indivisible, interdependent and interrelated and that the international community must treat human rights globally in a fair and equal manner, on the same footing and with the same emphasis,

Reaffirming further the various articles of the Charter setting out the respective powers and functions of the General Assembly, the Security Council and the Economic and Social Council, as the paramount framework for the achievement of the purposes of the United Nations,

Reaffirming the commitment of all States to fulfil their obligations under other important instruments of international law, in particular those of international human rights and humanitarian law,

Taking into account that, in accordance with Article 103 of the Charter, in the event of a conflict between the obligations of the Members of the United Nations under the Charter and their obligations under any other international agreement, their obligations under the Charter shall prevail"""

# French sample text; it appears to be the French-language version of the English sample.
# NOTE(review): the paragraph beginning "Considérant également" is split by a blank line
# after "à l'" — this looks like a paste artefact; confirm against the source document.
fr_doc = """L'Assemblée générale,

Rappelant que, conformément à l'Article 56 de la Charte des Nations Unies, tous les États Membres se sont engagés à agir, tant conjointement que séparément, en coopération avec l'Organisation, en vue d'atteindre les buts énoncés à l'Article 55 de la Charte, notamment le respect universel et effectif des droits de l'homme et des libertés fondamentales pour tous, sans distinction de race, de sexe, de langue ou de religion,

Rappelant également le Préambule de la Charte, dans lequel les peuples des Nations Unies se sont déclarés résolus, en particulier, à proclamer à nouveau leur foi dans les droits fondamentaux de l'homme, dans la dignité et la valeur de la personne humaine et dans l'égalité de droits des hommes et des femmes, ainsi que des nations, grandes et petites,

Réaffirmant que la défense et la protection de tous les droits de l'homme et de toutes les libertés fondamentales doivent être considérées comme un objectif prioritaire des Nations Unies conformément à ses buts et principes, en particulier le but de la coopération internationale, et que, dans le cadre de ces buts et principes, la défense et la protection de tous les droits de l'homme sont une préoccupation légitime de la communauté internationale,

Considérant les changements notoires qui se produisent dans le monde et le fait que tous les peuples aspirent à un ordre international fondé sur les principes consacrés par la Charte, notamment la nécessité de promouvoir et d'encourager le respect des droits de l'homme et des libertés fondamentales pour tous et le respect du principe de l'égalité de droits et de l'autodétermination des peuples, la paix, la démocratie, la justice, l'égalité, la primauté du droit, le pluralisme, le développement, l'instauration de meilleures conditions de vie et la solidarité,

Considérant également que la communauté internationale devrait chercher les moyens d'écarter les obstacles qui s'opposent aujourd'hui à la pleine réalisation de tous les droits de l'homme et mettre un terme aux violations des droits de l'homme qui en résultent dans le monde, ainsi que continuer à accorder l'attention voulue à l'

importance de la coopération mutuelle, de la compréhension mutuelle et du dialogue comme moyens d'assurer la défense et la protection de tous les droits de l'homme,

Réaffirmant qu'il est essentiel de renforcer la coopération internationale dans le domaine des droits de l'homme pour assurer la pleine réalisation des objectifs des Nations Unies et que les droits de l'homme et les libertés fondamentales sont inhérents à tous les êtres humains, la défense et la protection de ces droits et libertés incombant au premier chef aux gouvernements,

Réaffirmant également que tous les droits de l'homme sont universels, indissociables, interdépendants et étroitement liés et que la communauté internationale doit les considérer globalement et les traiter tous de la même manière, en les mettant sur un pied d'égalité et en leur accordant le même poids,

Réaffirmant en outre les divers articles de la Charte où sont définis les fonctions et pouvoirs respectifs de l'Assemblée générale, du Conseil de sécurité et du Conseil économique et social, qui servent de tremplin pour la réalisation des buts des Nations Unies,

Réaffirmant que les États se sont engagés à s'acquitter des obligations qui leur incombent en vertu d'autres instruments importants du droit international, en particulier ceux qui traitent des droits de l'homme internationalement reconnus et du droit international humanitaire,

Considérant que, conformément à l'Article 103 de la Charte, en cas de conflit entre les obligations des Membres des Nations Unies en vertu de la Charte et leurs obligations en vertu de tout autre accord international, les premières prévaudront,
"""

# Spot check: the tally recorded for the letter "t" in the English sample.
get_c_dict(eng_doc)['t']

292

In [21]:
def get_int_dict_count(aDiffDict):
    """Add up every value stored in a dictionary of consonant counts.

    Typically called with the result of get_diff_dict, in which case the
    total is the overall number of consonant differences between two
    samples.

    Returns:
        The sum of all values; 0 for an empty dictionary.
    """
    return sum(aDiffDict.values())

In [22]:
# Current issues: accents on string, make sure the regex works (probably should switch to re.sub)
# Not sure if a dictionary is the best structure. I used it for simplicity, but if we ever wanted to iterate over the
# results, a list of lists could be better. I think performance would be similar, but the benefit of a dictionary is
# that you can simply call d["s"] to get the difference for the letter "s".

# Ideas:
# To normalize, we could loop through the dictionary keys to add up the differences, then use some
# factor to get the difference between 0-1, similar to LD normalization
# Sample pair: the same sentence in English (str1) and Spanish (str2).
str1 = "Guided by the purposes and principles of the Charter of the United Nations, and expressing in particular the need to achieve international cooperation in promoting and encouraging respect for human rights and fundamental freedoms for all without distinction"
str2 = "Guiado por los propósitos y principios de la Carta de las Naciones Unidas, y expresando en particular la necesidad de lograr la cooperación internacional para promover y alentar el respeto de los derechos humanos y las libertades fundamentales para todos sin distinción"

# Per-string consonant tallies and their per-letter differences, keys in alphabetical order.
cDict1 = alpha_sort_dict(get_c_dict(str1))
cDict2 = alpha_sort_dict(get_c_dict(str2))
diffDict = alpha_sort_dict(get_diff_dict(str1, str2))

# Each call prints a heading line followed by its value on the next line.
print("String one consonants:", cDict1, sep="\n")
print("\nString two consonants:", cDict2, sep="\n")
print("\nDifference:", diffDict, sep="\n")
print("Total differences:", get_int_dict_count(diffDict), sep="\n")

String one consonants:
{'b': 1, 'c': 8, 'd': 11, 'f': 6, 'g': 6, 'h': 9, 'l': 6, 'm': 4, 'n': 24, 'p': 9, 'r': 16, 's': 10, 't': 20, 'v': 1, 'w': 1, 'x': 1, 'y': 1}

String two consonants:
{'b': 1, 'c': 10, 'd': 14, 'f': 1, 'g': 2, 'h': 2, 'l': 14, 'm': 3, 'n': 18, 'p': 12, 'r': 19, 's': 19, 't': 10, 'v': 1, 'x': 1, 'y': 4}

Difference:
{'b': 0, 'c': 2, 'd': 3, 'f': 5, 'g': 4, 'h': 7, 'l': 8, 'm': 1, 'n': 6, 'p': 3, 'r': 3, 's': 9, 't': 10, 'v': 0, 'w': 1, 'x': 0, 'y': 3}
Total differences:
65


In [23]:
# Scratch notes on normalisation, kept as a bare string literal; the expression
# does nothing except get echoed back as the cell's output.
# NOTE(review): the formulas are presumably meant with the subtraction grouped
# first, i.e. (X - diff) / D — that is the form get_perc_normalized computes.
"""
Ideas for normalization (0-1 range for similarity between strings)

"X" is some value or values, D is some value or values.

1 = same string = X - diff / D = 1
0 = completely difference, only differences = X - diff / D = 0

0.5 = half same

...
Maybe something like:

(# cons str1 + # cons st2) - diff / (# cons str1 + # cons st2)

"""

'\nIdeas for normalization (0-1 range for similarity between strings)\n\n"X" is some value or values, D is some value or values.\n\n1 = same string = X - diff / D = 1\n0 = completely difference, only differences = X - diff / D = 0\n\n0.5 = half same\n\n...\nMaybe something like:\n\n(# cons str1 + # cons st2) - diff / (# cons str1 + # cons st2)\n\n'

In [24]:
def get_perc_normalized(str1,str2):
    """Score how similar the consonant make-up of two strings is, from 0 to 1.

    The score is ``(total_cons - total_diff) / total_cons`` where
    ``total_cons`` is the number of consonants in both strings together and
    ``total_diff`` is the sum of the per-letter count differences. Only the
    counts matter; the order of the letters is ignored.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        A float: 1.0 when both strings have identical consonant counts, 0.0
        when they share no consonant at all. Two strings without any
        consonants are treated as identical and score 1.0.
    """
    # Summing the counts does not depend on key order, so no sorting is needed.
    total_cons = get_int_dict_count(get_c_dict(str1)) + get_int_dict_count(get_c_dict(str2))

    if total_cons == 0:
        # Nothing to compare (e.g. two empty strings); the division below
        # would raise ZeroDivisionError.
        return 1.0

    total_diff = get_int_dict_count(get_diff_dict(str1,str2))

    return abs((total_cons - total_diff) / total_cons)

In [25]:
# Similarity score for the English/Spanish sentence pair (str1, str2) from the earlier cell.
get_perc_normalized(str1,str2)

0.7547169811320755

In [26]:
from transliterate import translit
# English sentence and a Russian sample to compare it against.
engSample = "Guided by the purposes and principles of the Charter of the United Nations, and expressing in particular the need to achieve international cooperation in promoting and encouraging respect for human rights and fundamental freedoms for all without distinction"
rusSample = "Руководствуясь целями и принципами Устава Организации Объединенных Наций и выражая в частности необходимость достижения международного сотрудничества "

# Romanise the Russian sample one whitespace-separated word at a time.
translitRus = [translit(word, 'ru', reversed=True) for word in rusSample.split()]

# Rejoin the words, reduce each one to its consonants, and rejoin again.
rusStr = " ".join(only_consonants(" ".join(translitRus)))

get_perc_normalized(engSample, rusStr) # TOO HIGH

0.6126126126126126