### Exercise: False Friends

Text: Extraction of pairs of English and Italian words that are similar or identical as strings but have different meanings (false friends).

In [1]:
import re
import os.path
from nltk.corpus import wordnet as wn
from Levenshtein import ratio, hamming
import fasttext
import fasttext.util
import numpy as np
from numpy.linalg import norm
 

#### 1. Loading the word lists

In [2]:
# Raw corpora the word lists are extracted from.
path_corpus_ita = 'resource/ita/current_version_morph-it/morph-it_048_utf-8.txt'
path_corpus_eng = 'resource/eng/BNC_lemmafile5.txt'

# Plain-text caches of the extracted words (one word per line).
path_words_ita = 'resource/ita/words_ita.txt'
path_words_eng = 'resource/eng/words_eng.txt'

# Part-of-speech tags per language.
pos_it = ['ADJ', 'ADV', 'ASP', 'AUX', 'CAU', 'MOD', 'NOUN', 'VER']
pos_eng = ['j', 'v', 'n', 'r']

# Word lists, filled in by the loading cells below.
words_ita = []
words_eng = []

In [3]:
# 1.1 Extraction of words from corpora ITA - Morph-It!
#
# The lemma list is cached in `path_words_ita`. The Morph-It! corpus is parsed
# only when that cache file does not exist yet.

# Start from an empty list so that re-running this cell does not append the
# same words a second time.
words_ita = []

if os.path.exists(path_words_ita):
    # Cache available: one word per line.
    with open(path_words_ita, 'r', encoding='utf-8') as words_ita_file:
        words_ita = [row.rstrip('\n') for row in words_ita_file if row.strip()]

elif os.path.exists(path_corpus_ita):
    # Set used only for the duplicate check; the list keeps first-seen order.
    seen_lemmas_ita = set()
    with open(path_corpus_ita, 'r', encoding='utf-8') as morph_it_file:
        for row in morph_it_file:
            # Each row is expected to hold three tab-separated fields:
            # surface form, lemma, POS (optionally followed by ':features').
            fields = re.split('\t+', row.rstrip('\n'))
            if len(fields) != 3:
                continue  # blank or malformed row: skip instead of crashing
            lemma, pos = fields[1], fields[2]
            if len(lemma) <= 2 or lemma in seen_lemmas_ita:
                continue
            # Keep only the coarse tag in front of the first ':'.
            if pos.split(":")[0] in pos_it:
                seen_lemmas_ita.add(lemma)
                words_ita.append(lemma)

    with open(path_words_ita, 'w', encoding='utf-8') as words_ita_file:
        words_ita_file.writelines(word + '\n' for word in words_ita)

else:
    raise FileNotFoundError(
        "Neither the Italian word list (" + path_words_ita + ") nor the "
        "Morph-It! corpus (" + path_corpus_ita + ") could be found"
    )


print("Italian words loaded ✔ (", len(words_ita), ")")

Italian words loaded ✔ ( 16300 )


In [4]:
# 1.2 Extraction of words from corpora ENG - The British National Corpus (BNC) extracted from WordSmith Tools
#
# The lemma list is cached in `path_words_eng`. The BNC lemma file is parsed
# only when that cache file does not exist yet.

# Start from an empty list so that re-running this cell does not append the
# same words a second time.
words_eng = []

if os.path.exists(path_words_eng):
    # Cache available: one word per line.
    with open(path_words_eng, 'r', encoding='utf-8') as words_eng_file:
        words_eng = [row.rstrip('\n') for row in words_eng_file if row.strip()]

elif os.path.exists(path_corpus_eng):
    # Set used only for the duplicate check; the list keeps first-seen order.
    seen_lemmas_eng = set()
    with open(path_corpus_eng, 'r', encoding='utf-8') as corpus_eng_file:
        for row in corpus_eng_file:
            # The lemma is whatever precedes " -> " on the row. strip() keeps a
            # trailing newline out of the lemma when the separator is missing.
            lemma = row.split(" -> ")[0].strip().lower()
            if len(lemma) > 2 and lemma not in seen_lemmas_eng:
                seen_lemmas_eng.add(lemma)
                words_eng.append(lemma)

    with open(path_words_eng, 'w', encoding='utf-8') as words_eng_file:
        words_eng_file.writelines(word + '\n' for word in words_eng)

else:
    raise FileNotFoundError(
        "Neither the English word list (" + path_words_eng + ") nor the "
        "BNC lemma file (" + path_corpus_eng + ") could be found"
    )

print("English words loaded ✔ (", len(words_eng), ")")

English words loaded ✔ ( 20280 )


#### 2. Extraction of false-friend word pairs from words_eng and words_ita

In [5]:

def semantic_similarity(word_eng: str, word_ita: str) -> float:
    """Average Wu-Palmer similarity over all sense pairs of the two words.

    Every WordNet synset of ``word_eng`` is compared with every synset found
    for ``word_ita`` (looked up with ``lang='ita'``), and the mean of the
    resulting scores is returned.

    Args:
        word_eng: English word.
        word_ita: Italian word.

    Returns:
        The mean similarity of the comparable sense pairs, or ``1`` when no
        score could be computed (one of the words has no synsets, or no sense
        pair yielded a score). Callers flag a pair only when the value is
        *low*, so the ``1`` fallback means "no evidence, do not flag".
    """
    synsets_eng = wn.synsets(word_eng)
    synsets_ita = wn.synsets(word_ita, lang='ita')

    similarities = []
    for synset_eng in synsets_eng:
        for synset_ita in synsets_ita:
            score = wn.wup_similarity(synset_eng, synset_ita)
            # wup_similarity is documented to return None when no connecting
            # path exists; summing a None would raise TypeError, so such
            # pairs are left out of the mean.
            if score is not None:
                similarities.append(score)

    return sum(similarities) / len(similarities) if similarities else 1

def char_similarity(word_eng: str, word_ita: str) -> float:
    """String similarity of two words, gated on their first three characters.

    Returns ``0`` when either word is empty or when the Hamming distance
    between the two 3-character prefixes is greater than 1; otherwise returns
    ``ratio(word_eng, word_ita)``.
    """
    if not word_eng or not word_ita:
        return 0
    # Cheap prefix filter before the full comparison.
    if hamming(word_eng[:3], word_ita[:3]) > 1:
        return 0
    return ratio(word_eng, word_ita)

In [6]:
def extraction_pairs(words_eng: list, words_ita: list,
                     char_threshold: float = 0.7,
                     semantic_threshold: float = 0.3,
                     semantic_fn=None) -> list:
    """Collect (English, Italian) word pairs that look alike but mean different things.

    A pair is kept when ``char_similarity`` is strictly greater than
    ``char_threshold`` and the semantic similarity is strictly lower than
    ``semantic_threshold``. The semantic measure is evaluated only for pairs
    that already passed the character check.

    Args:
        words_eng: English words.
        words_ita: Italian words.
        char_threshold: Minimum (exclusive) character similarity.
        semantic_threshold: Maximum (exclusive) semantic similarity.
        semantic_fn: Callable ``(word_eng, word_ita) -> float`` used as the
            semantic measure. Defaults to ``semantic_similarity``.

    Returns:
        List of ``(word_eng, word_ita)`` tuples, in iteration order.
    """
    if semantic_fn is None:
        # Resolved at call time so the default follows the current definition.
        semantic_fn = semantic_similarity

    pairs = []
    for word_eng in words_eng:
        for word_ita in words_ita:
            if (char_similarity(word_eng, word_ita) > char_threshold
                    and semantic_fn(word_eng, word_ita) < semantic_threshold):
                pairs.append((word_eng, word_ita))
    return pairs

# Compare every English word with every Italian word using the WordNet measure.
pairs = extraction_pairs(words_eng=words_eng, words_ita=words_ita)

# Report how many candidate pairs were found.
print("Pairs of false-friends words extracted ✔ (", len(pairs), ")")

Pairs of false-friends words extracted ✔ ( 20420 )


In [7]:
# 3. Write the false-friend pairs to disk, one tab-separated pair per line
path_false_friends = 'resource/result_false_friends.txt'
with open(path_false_friends, 'w', encoding='utf-8') as false_friends_file:
    false_friends_file.writelines(
        word_eng + '\t' + word_ita + '\n' for word_eng, word_ita in pairs
    )

print("Pairs of false-friends words saved ✔")

Pairs of false-friends words saved ✔


In [10]:
# BONUS: similarity between words using word embeddings -> FastText

# Replace fastText's eprint hook with a no-op (presumably to silence the
# message it prints when a model is loaded - TODO confirm).
fasttext.FastText.eprint = lambda x: None
# Fetch the Italian and English models; if_exists='ignore' presumably skips
# the download when the file is already present - verify against fasttext.util.
fasttext.util.download_model('it', if_exists='ignore')
fasttext.util.download_model('en', if_exists='ignore')
# Both model files are referenced by bare file name, i.e. relative to the
# current working directory.
model_eng = fasttext.load_model('cc.en.300.bin')
model_ita = fasttext.load_model('cc.it.300.bin')

def cosine_sim(u, v):
    """Cosine of the angle between vectors ``u`` and ``v``.

    NumPy's divide/invalid floating-point warnings are suppressed for the
    computation, so a zero-length input produces ``nan`` without a warning.
    """
    with np.errstate(invalid='ignore', divide='ignore'):
        denominator = norm(u) * norm(v)
        return np.dot(u, v) / denominator

def semantic_similarity_fasttext(word_eng: str, word_ita: str) -> float:
    """Cosine similarity between the FastText vectors of the two words.

    The English word is embedded with ``model_eng`` and the Italian word with
    ``model_ita``.

    NOTE(review): the two models are loaded from separate files; unless their
    vector spaces are aligned, a cross-model cosine may not reflect meaning -
    confirm, or consider using aligned vectors.
    """
    vector_eng = model_eng.get_word_vector(word_eng)
    vector_ita = model_ita.get_word_vector(word_ita)
    return cosine_sim(vector_eng, vector_ita)

def extraction_pairs(words_eng: list, words_ita: list) -> list:
    """Collect false-friend candidates using the FastText similarity.

    Keeps every (English, Italian) pair whose ``char_similarity`` is above 0.7
    and whose ``semantic_similarity_fasttext`` is below 0.3.

    NOTE(review): this reuses the name of the WordNet-based
    ``extraction_pairs`` defined in an earlier cell, so once this cell has
    run that earlier definition is shadowed.
    """
    return [
        (word_eng, word_ita)
        for word_eng in words_eng
        for word_ita in words_ita
        if char_similarity(word_eng, word_ita) > 0.7
        and semantic_similarity_fasttext(word_eng, word_ita) < 0.3
    ]

# Run the embedding-based extraction over the same two word lists.
pairs_emb = extraction_pairs(words_eng=words_eng, words_ita=words_ita)

# 3. Write the embedding-based pairs to disk, one tab-separated pair per line
path_false_friends_emb = 'resource/result_false_friends_embeddings.txt'
with open(path_false_friends_emb, 'w', encoding='utf-8') as false_friends_file:
    false_friends_file.writelines(
        word_eng + '\t' + word_ita + '\n' for word_eng, word_ita in pairs_emb
    )

print("Pairs of false-friends words using EMBEDDINGS saved ✔")



Pairs of false-friends words using EMBEDDINGS saved ✔
