In [49]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import itertools
import json

In [2]:
# XLM-ROBERTA-BASE
# The tokenizer and the model are both loaded from the same checkpoint name, so
# the token ids produced by one are the ids expected by the other.
model_xlm_roberta_base_name = 'xlm-roberta-base'
tokenizer_xlm_roberta_base = AutoTokenizer.from_pretrained(model_xlm_roberta_base_name)
model_xlm_roberta_base = AutoModel.from_pretrained(model_xlm_roberta_base_name)

In [3]:
def get_text_embedding(text, tokenizer, model=model_xlm_roberta_base):
    """Embed text as the mean of the model's last-layer token vectors.

    Padding positions are excluded from the mean, so a sentence gets the same
    vector whether it is embedded alone or inside a padded batch.

    Args:
        text: A string or a list of strings; passed unchanged to ``tokenizer``.
        tokenizer: Callable that returns the model inputs as PyTorch tensors.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        A tensor holding one pooled vector per input text.
    """
    # Tokenize the text (padded to the longest item, truncated to the model limit)
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Inference only: no gradients are needed for the hidden states
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        # No mask available: fall back to a plain mean over all positions
        return token_embeddings.mean(dim=1)

    # Masked mean: zero out padded positions and divide by the number of real
    # tokens. With no padding this equals mean(dim=1) up to rounding.
    mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return summed / token_counts

In [4]:
def get_cos_similarity(embed1, embed2):
    """Return the cosine similarity between two embedding tensors.

    Args:
        embed1: 2-D tensor of embeddings (one row per text).
        embed2: 2-D tensor of embeddings (one row per text).

    Returns:
        The similarity between the first row of ``embed1`` and the first row of
        ``embed2`` (entry ``[0][0]`` of the pairwise similarity matrix).
    """
    # detach()/cpu() are no-ops for plain CPU tensors, but keep the conversion
    # from raising when a tensor tracks gradients or lives on another device.
    embedding1_np = embed1.detach().cpu().numpy()
    embedding2_np = embed2.detach().cpu().numpy()

    # Pairwise cosine similarity; only the first pair is reported
    similarity_matrix = cosine_similarity(embedding1_np, embedding2_np)
    return similarity_matrix[0][0]

In [5]:
def get_cos_sim(text1, text2, tokenizer=tokenizer_xlm_roberta_base, model=model_xlm_roberta_base):
    """Embed two texts and return the cosine similarity of their embeddings.

    Both texts are embedded with the same tokenizer/model pair via
    ``get_text_embedding`` and compared with ``get_cos_similarity``.
    """
    first_embedding, second_embedding = (
        get_text_embedding(text, tokenizer, model) for text in (text1, text2)
    )
    return get_cos_similarity(first_embedding, second_embedding)

In [6]:
# One example sentence per sense of the homonym; judging by the variable names,
# the first uses the "horse" sense and the second the "name" sense.
horse_1 = 'Ат күчтүү жана ылдам.'
name_1 = "Биздин топко жаңы адамдын атын кош."

In [7]:
# Embed both example sentences with the XLM-R tokenizer/model pair
emb1, emb2 = (
    get_text_embedding(sentence, tokenizer_xlm_roberta_base, model_xlm_roberta_base)
    for sentence in (horse_1, name_1)
)

In [8]:
# Baseline: call scikit-learn directly on the two embeddings (returns a 1x1 matrix)
cosine_similarity(emb1, emb2)

array([[0.99657124]], dtype=float32)

In [9]:
# Same comparison through the helper, which returns the scalar entry [0][0]
get_cos_similarity(emb1, emb2)

0.99657124

In [10]:
# End-to-end helper: embeds both raw sentences, then compares them
get_cos_sim(horse_1, name_1)

0.99657124

In [11]:
# A second sentence pair, again named after the "horse" and "name" senses
horse_2 = "Ат тосмодон секирип өттү."
name_2 = "Клубдун атын ким ойлоп тапты?"

In [12]:
# Similarity of the second pair of sentences
get_cos_sim(horse_2, name_2)

0.99574625

In [15]:
# 20 example sentences collected for the "horse" category; used below for the
# pairwise-similarity averages and for the "Horse" accuracy check.
horse_sentences = [
    "Ат талаада чуркап баратат.", "Кечээ мен ат миндим.", "Атты сатып алгам.",
    "Дыйкан атка жем берүүдө.", "Ат тосмодон секирип өттү.", "Аттар сулуу жаныбарлар.",
    "Аттын жалынын кылдары бекем болот.", "Ат чөп жейт, бирок жемди көбүрөөк жакшы көрөт.",
    "Балдар ат минип жүргөндү жакшы көрүшөт.", "Ат катуу кишенеди.", "Ал жаңы ат сатып алды.",
    "Ат мөңкүп жатып, чабарманды ыргытып жиберди.", "Биз жапайы ат көрдүк.", "Кара ат жарышта жеңди.",
    "Ат сарайда оонап жатат!", "Ал аттын жүнүн тарап койду.", "Ат арабаны тартып баратат.", "Алардын үч аты бар.",
    "Ат күчтүү жана ылдам.", "Ат жана чабандес бирдей кыймылдады."
]

In [16]:
# Extra hand-written sentences; not referenced by any other cell visible in this notebook section
test_sentences = ["Чаптырып жүргөн атын уурдатып алыптыр", "Аты ала качып кетип, жыгылган экен", "Ашка ат союлду."]

In [25]:
# All unordered pairs of "horse" sentences
pairs_horse = list(itertools.combinations(horse_sentences, 2))

# Embed every sentence once and pair up the cached embeddings. Scoring each
# pair with get_cos_sim would re-run the model on the same sentence for every
# pair it takes part in.
embeddings_horse = [
    get_text_embedding([sentence], tokenizer_xlm_roberta_base, model_xlm_roberta_base)
    for sentence in horse_sentences
]
# combinations() yields the embeddings in the same order as pairs_horse
similarities_horse = [
    get_cos_similarity(emb_a, emb_b)
    for emb_a, emb_b in itertools.combinations(embeddings_horse, 2)
]
# Calculate average similarity
average_similarity_horse = sum(similarities_horse) / len(similarities_horse)
print(f"'Horse': Average Similarity Score: {average_similarity_horse}")

'Horse': Average Similarity Score: 0.9925870528346614


In [21]:
# 20 example sentences collected for the "shoot" category; used below for the
# pairwise-similarity averages and for the "Shoot" accuracy check.
shoot_sentences = ["Сарбаз бутага так атты.", "Жаачы жебе атканга даярдады.", "Мергенчи бугуну атпай койду.",
"Ал пистолетти колго алып, ат деп буйрук берди.", "Жоокерлер душманды көргөндө ата башташты.",
"Күзөтчү абага атты.", "Мергенчи мылтык менен кушту атты.", "Ал бутага көздөй атты.", "Бала ойунчук куралдан суу атты.",
"Башаламандыктарда бир нече жолу ок атылган.", "Жакын келбегиле, атам!", "Душмандарды коркутуу үчүн асманга ок атылды.",
"Тынч эмес аймакта бир нече жолу атышуу болду.", "Күч органдары кылмышкерди атып салышкан.", "Ал ызы-чуу чыкканда абага ок атты.",
"Полиция шектүүнү аткан жок.", "Совет бийлигинин буйругу менен атууга кеткет.", "Аскерлер буйрук күтүп, атпай турду.", "Кечке бутага аттык.",
"Лагерге айдалып, андан ары атылып кеткен."]

In [27]:
# All unordered pairs of "shoot" sentences
pairs_shoot = list(itertools.combinations(shoot_sentences, 2))

# Embed every sentence once and pair up the cached embeddings. Scoring each
# pair with get_cos_sim would re-run the model on the same sentence for every
# pair it takes part in.
embeddings_shoot = [
    get_text_embedding([sentence], tokenizer_xlm_roberta_base, model_xlm_roberta_base)
    for sentence in shoot_sentences
]
# combinations() yields the embeddings in the same order as pairs_shoot
similarities_shoot = [
    get_cos_similarity(emb_a, emb_b)
    for emb_a, emb_b in itertools.combinations(embeddings_shoot, 2)
]
# Calculate average similarity
average_similarity_shoot = sum(similarities_shoot) / len(similarities_shoot)
print(f"'Shoot': Average Similarity Score: {average_similarity_shoot}")

'Shoot': Average Similarity Score: 0.9943701612321953


Here we see that the average cosine similarity is quite high between sentences with a similar meaning, in both categories: "Horse" and "Shoot".

In [28]:
# Every (horse sentence, shoot sentence) cross-category pair
pairs_horse_shoot = list(itertools.product(horse_sentences, shoot_sentences))

# Embed each sentence of both categories once, then pair the cached embeddings
# instead of re-running the model for every pair.
cross_embeddings_horse = [
    get_text_embedding([sentence], tokenizer_xlm_roberta_base, model_xlm_roberta_base)
    for sentence in horse_sentences
]
cross_embeddings_shoot = [
    get_text_embedding([sentence], tokenizer_xlm_roberta_base, model_xlm_roberta_base)
    for sentence in shoot_sentences
]
# product() yields the embeddings in the same order as pairs_horse_shoot
similarities_horse_shoot = [
    get_cos_similarity(emb_horse, emb_shoot)
    for emb_horse, emb_shoot in itertools.product(cross_embeddings_horse, cross_embeddings_shoot)
]
# Calculate average similarity
average_similarity_horse_shoot = sum(similarities_horse_shoot) / len(similarities_horse_shoot)
print(f"'Horse-Shoot': Average Similarity Score: {average_similarity_horse_shoot}")

'Horse-Shoot': Average Similarity Score: 0.9927406707406043


We also see that the average similarity between sentences from different categories is just as high, which is unexpected: the mean-pooled embeddings barely separate the two meanings.

## Use multiple signal phrases

1. We have a dictionary of homonyms. For every meaning of a word there are (1) example sentences and (2) multiple signal phrases, e.g. ["чаптыруу", "ээр токун", "улак тартыш"].
2. Every time we get a sentence containing an ambiguous word, we compute the sentence embedding, compare it against the signal phrases of every meaning, and pick the meaning with the highest average similarity.

In [30]:
# Hand-written seed dictionary for the homonym "ат".
# Layout: ambiguous word -> meaning label -> {"signal_phrases": [phrases typical of that meaning]}.
# disambiguator() reads this module-level name.
ambigious_words = {
    "ат": {
        "name": {
            "signal_phrases": ["ысым бер", "фамилия", "адам"],
        },
        "horse": {
            "signal_phrases": ["чаптыруу", "ээр токун", "улак тартыш"],
        },
        "shoot": {
            "signal_phrases": ["ок атуу", "мылтык", "автомат", "пистолет", "жаа", "аскер", "полиция"],
        }
    }
}

In [31]:
def calculate_avg_sim(sent_text, data):
    """Average the similarity between a sentence and a meaning's signal phrases.

    Args:
        sent_text: The sentence to score.
        data: Mapping for one meaning; its ``'signal_phrases'`` entry lists the
            phrases to compare against.

    Returns:
        The mean of ``get_cos_sim(sent_text, phrase)`` over all signal phrases,
        or ``0`` when there are no signal phrases.
    """
    phrases = data.get('signal_phrases', [])
    if phrases:
        scores = [get_cos_sim(sent_text, phrase) for phrase in phrases]
        return sum(scores) / len(scores)
    # Nothing to compare against
    return 0

In [32]:
def disambiguator(word, sentence):
    """Pick the meaning of ``word`` whose signal phrases best match ``sentence``.

    Args:
        word: Key to look up in the module-level ``ambigious_words`` dictionary.
        sentence: Sentence containing the ambiguous word.

    Returns:
        The meaning label with the highest average similarity score; on a tie
        the meaning listed first in the dictionary wins.

    Raises:
        Exception: If ``word`` is not present in ``ambigious_words``.
    """
    meanings = ambigious_words.get(word)
    if meanings is None:
        raise Exception(f"Word '{word}' not found in our dictionary")

    # One (meaning, average score) entry per candidate meaning
    scored = [
        (meaning, calculate_avg_sim(sentence, data))
        for meaning, data in meanings.items()
    ]
    # Best score first; the sort is stable, so ties keep dictionary order
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored[0][0]

In [33]:
# NOTE(review): this passes the whole top-level dictionary, which has no
# 'signal_phrases' key, so calculate_avg_sim takes its "no signal phrases"
# branch and returns 0. A per-meaning mapping is what the function reads.
calculate_avg_sim(horse_sentences[0], ambigious_words)

0

In [44]:
# Share of "horse" sentences for which the disambiguator answers 'horse'
correct_horse = 0
n_horse = len(horse_sentences)

for horse_sent in horse_sentences:
    answer = disambiguator('ат', horse_sent)
    correct_horse += int(answer == 'horse')
print(f'"Horse" accuracy = {correct_horse / n_horse}')

"Horse" accuracy = 0.95


In [45]:
# Share of "shoot" sentences for which the disambiguator answers 'shoot'
correct_shoot = 0
n_shoot = len(shoot_sentences)

for shoot_sent in shoot_sentences:
    answer = disambiguator('ат', shoot_sent)
    correct_shoot += int(answer == 'shoot')
print(f'"Shoot" accuracy = {correct_shoot / n_shoot}')

"Shoot" accuracy = 0.0


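Load our datasets: the example sentences (`sentences.json`) and the signal phrases (`ambiguous_words_50.json`).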

In [50]:
# Example sentences; the merge cell below indexes this as
# sentences[word][meaning]["sentences"].
json_file = "sentences.json"

# Read as UTF-8 explicitly so the Cyrillic text does not depend on the platform default
with open(json_file, 'r', encoding='utf-8') as f:
    sentences = json.load(f)

In [54]:
# Signal-phrase dictionary, keyed by ambiguous word and then by meaning label
json_file_ambiguous_words_50 = "ambiguous_words_50.json"

# Read as UTF-8 explicitly so the Cyrillic text does not depend on the platform default
with open(json_file_ambiguous_words_50, 'r', encoding='utf-8') as f:
    ambiguous_words_50 = json.load(f)

In [51]:
# Ambiguous words covered by the example-sentence file
sentences.keys()

dict_keys(['ач', 'же', 'ак', 'кап', 'ала', 'кеч', 'кош', 'кал', 'бай', 'сай', 'арык', 'кой', 'ай', 'топ', 'жар', 'тил', 'каз', 'там', 'жаш', 'кара', 'мал', 'сөз', 'бас', 'тек', 'уч', 'жең', 'курак', 'айт', 'түш', 'кур', 'тай', 'кол', 'күн', 'ат', 'жаз', 'кат', 'сан', 'чал', 'кир', 'чек', 'бак', 'аябай', 'ачуу'])

In [55]:
# Ambiguous words covered by the signal-phrase file
ambiguous_words_50.keys()

dict_keys(['ач', 'же', 'ак', 'кап', 'ала', 'кеч', 'кош', 'кал', 'бай', 'сай', 'арык', 'кой', 'ай', 'топ', 'жар', 'тил', 'каз', 'там', 'жаш', 'кара', 'мал', 'сөз', 'бас', 'тек', 'уч', 'жең', 'курак', 'айт', 'түш', 'кур', 'тай', 'кол', 'күн', 'ат', 'жаз', 'кат', 'сан', 'чал', 'кир', 'чек', 'бак', 'аябай', 'ачуу'])

In [63]:
# Both files must cover exactly the same words (key views compare set-wise)
sentences.keys() == ambiguous_words_50.keys()

True

In [68]:
# Inspect one loaded entry before merging. This reads `ambiguous_words_50`,
# because at this point in a top-to-bottom run `ambigious_words` is still the
# hand-written dictionary that only contains 'ат'.
ambiguous_words_50['ач']

{'open': {'signal_phrases': ['ачуу',
   'эшикти ачуу',
   'жарык',
   'кирүү',
   'жолду ач',
   'ачылыш',
   'ачылышы',
   'ачык']},
 'hungry': {'signal_phrases': ['ач карын',
   'тамакка муктаж',
   'ачтык',
   'курсак ачуу',
   'ачка болуу']}}

In [83]:
# Merge JSONs: attach the example sentences to each meaning's signal phrases.
# A fresh dictionary is built rather than mutating a shallow copy, so the loaded
# `ambiguous_words_50` is left untouched and this cell can be re-run safely.
# NOTE: this rebinds `ambigious_words`, replacing the hand-written dictionary
# defined earlier in the notebook.
ambigious_words = {}
for amb_word, meanings in ambiguous_words_50.items():
    # A word whose meaning labels differ between the two files cannot be merged
    if set(meanings.keys()) != set(sentences[amb_word].keys()):
        print(f"containers of amb_word '{amb_word}' in json mismatch! This key is deleted.")
        continue
    ambigious_words[amb_word] = {
        # New per-meaning dict: the existing fields plus the example sentences
        meaning: {**data, "sentences": sentences[amb_word][meaning]["sentences"]}
        for meaning, data in meanings.items()
    }

containers of amb_word 'там' in json mismatch! This key is deleted.


In [90]:
# Save the merged dictionary to JSON.
# ensure_ascii=False writes the Cyrillic text as-is instead of \u escapes.
with open('ambigious_words.json', 'w', encoding='utf-8') as f:
    json.dump(ambigious_words, f, ensure_ascii=False, indent=4)

In [75]:
# Example: one merged entry, now holding both signal phrases and sentences per meaning
ambigious_words['ач']

{'open': {'signal_phrases': ['ачуу',
   'эшикти ачуу',
   'жарык',
   'кирүү',
   'жолду ач',
   'ачылыш',
   'ачылышы',
   'ачык'],
  'sentences': ['Ач дегенде ача кал да!']},
 'hungry': {'signal_phrases': ['ач карын',
   'тамакка муктаж',
   'ачтык',
   'курсак ачуу',
   'ачка болуу'],
  'sentences': ['Ач көздүгүнөн ушундай абалга келип отурбайбы']}}

In [77]:
# Smoke test: disambiguate a single sentence for the word 'ач' and show the chosen meaning
answer = disambiguator('ач', "Ач дегенде ача кал да!")
print(answer)


hungry


In [85]:
%%time

count_correct = 0
count_all = 0
n = len(ambigious_words.keys())
for amb_word, meanings in ambigious_words.items():
    print(f"{count_all} / {n}: amb_word = {amb_word}")
    for meaning, val in meanings.items():
        for sent in val["sentences"]:
            answer = disambiguator(amb_word, sent)
            count_all += 1
            if answer == meaning:
                count_correct += 1
#     print()


0 / 42: amb_word = ач
2 / 42: amb_word = же
4 / 42: amb_word = ак
6 / 42: amb_word = кап
9 / 42: amb_word = ала
12 / 42: amb_word = кеч
14 / 42: amb_word = кош
16 / 42: amb_word = кал
18 / 42: amb_word = бай
20 / 42: amb_word = сай
22 / 42: amb_word = арык
24 / 42: amb_word = кой
26 / 42: amb_word = ай
29 / 42: amb_word = топ
31 / 42: amb_word = жар
34 / 42: amb_word = тил
47 / 42: amb_word = каз
49 / 42: amb_word = жаш
52 / 42: amb_word = кара
54 / 42: amb_word = мал
56 / 42: amb_word = сөз
58 / 42: amb_word = бас
60 / 42: amb_word = тек
62 / 42: amb_word = уч
64 / 42: amb_word = жең
66 / 42: amb_word = курак
68 / 42: amb_word = айт
70 / 42: amb_word = түш
73 / 42: amb_word = кур
76 / 42: amb_word = тай
84 / 42: amb_word = кол
86 / 42: amb_word = күн
88 / 42: amb_word = ат
91 / 42: amb_word = жаз
95 / 42: amb_word = кат
108 / 42: amb_word = сан
110 / 42: amb_word = чал
114 / 42: amb_word = кир
117 / 42: amb_word = чек
119 / 42: amb_word = бак
121 / 42: amb_word = аябай
123 / 42: amb_w

In [86]:
# Overall accuracy: correctly disambiguated sentences over all evaluated sentences
print(f"Accuracy = {count_correct / count_all}")

Accuracy = 0.368
