# Orthographic similarity classification

In [1]:
# Install dependencies
from polyglot.downloader import downloader
from polyglot.text import Text
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from metrics import metrics
from rules import rules

## Data preprocessing

In [3]:
# !polyglot download transliteration2.ru
# !polyglot download transliteration2.bg
# !polyglot download transliteration2.mk

In [2]:
import pandas as pd


# Preprocess the dataset
def preprocess_data():
    dataset = pd.read_csv("dataset.csv")
    dataset = dataset.drop("Unnamed: 0", axis=1)
    dataset["orthographic_sim"] = dataset["false_friends"].replace({0: 1, 1: 1, 2: 0})

    word_pairs = []
    y_true = []

    for _, row in dataset.iterrows():
        pl_word = row[1]
        uk_word = row[2]

        if uk_word != "":  # Check if uk_word is not an empty string
            blob = uk_word
            text = Text(blob, hint_language_code="uk")
            if "’" in text:
                text = text.replace("’", "й")
            uk_transliterated = str(text.transliterate("en")[0])

            blob2 = pl_word
            text2 = Text(blob2, hint_language_code="pl")
            pl_transliterated = str(text2.transliterate("en")[0])

            word_pairs.append((uk_transliterated, pl_transliterated))
            y_true.append(
                row["orthographic_sim"]
            )  # Append the corresponding value to y_true

    return y_true, word_pairs


y_true, word_pairs = preprocess_data()

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.


In [3]:
# нормалізація українсько-польських пар слів
normalized = [(first, rules.remove_diacritics(second)) for first, second in word_pairs]

In [4]:
normalized

[('snidanok', 'sniadanie'),
 ('vegetariański', 'wegetarianin'),
 ('napius', 'napoj'),
 ('kawa', 'kawa'),
 ('sik', 'sok'),
 ('voda', 'woda'),
 ('pivo', 'pivo'),
 ('vino', 'wino'),
 ('sil', 'sol'),
 ('myaso', 'mieso'),
 ('riba', 'ryba'),
 ('desert', 'deser'),
 ('chas', 'chas'),
 ('godina', 'godzina'),
 ('ranok', 'rano'),
 ('den', 'dzien'),
 ('vchora', 'vchoraj'),
 ('tijden', 'tydzien'),
 ('mixiac', 'miesiac'),
 ('rik', 'rok'),
 ('ponedilok', 'poniedzialek'),
 ('vivtorok', 'wtorek'),
 ('sereda', 'sroda'),
 ('chetver', 'czwartek'),
 ('pyatnica', 'piatek'),
 ('subota', 'sobota'),
 ('nedila', 'niedziela'),
 ('sichen', 'stychen'),
 ('luty', 'luty'),
 ('quiten', 'kwiecien'),
 ('cherven', 'cherwiec'),
 ('lipen', 'lipiec'),
 ('serpna', 'sierpien'),
 ('veresen', 'wrzesien'),
 ('listopad', 'listopad'),
 ('gruden', 'grudzien'),
 ('vesna', 'wiosna'),
 ('lito', 'lato'),
 ('osin', 'jesien'),
 ('tak', 'tak'),
 ('dakuu', 'dziekuje'),
 ('pereproshuu', 'przeprasham'),
 ('dobranich', 'dobranoc'),
 ('a', 'j

In [5]:
modified_list = [
    (
        rules.replace_suffix(
            rules.ere(
                rules.replace_ch_with_cz(
                    rules.replace_suffix(
                        rules.replace_sh_with_sz(rules.replace_uwa_with_owa(t[0]))
                    )
                )
            ).replace("v", "w")
        ),
        rules.replace_sh_with_sz(rules.replace_ch_with_cz(t[1])).replace("v", "w"),
    )
    for t in normalized
]

## Similarity metrics

### Gestalt pattern matching

In [6]:
# Make predictions
similarities = []
sims = []

for word_pair in modified_list:

    similarity_value = metrics.similar(word_pair[0], word_pair[1])

    # print(similarity_value, word_pair[0], word_pair[1])

    sims.append(similarity_value)
    if similarity_value >= 0.4:
        similarities.append(1)
    else:
        similarities.append(0)

print(classification_report(y_true, similarities))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       438
           1       0.94      0.96      0.95       438

    accuracy                           0.95       876
   macro avg       0.95      0.95      0.95       876
weighted avg       0.95      0.95      0.95       876



### Jaro similarity

In [8]:
# Make predictions
similarities = []

for word_pair in modified_list:

    similarity_value = metrics.jaro_distance(word_pair[0], word_pair[1])

    if similarity_value >= 0.6:
        similarities.append(1)
    else:
        similarities.append(0)

print(classification_report(y_true, similarities))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       438
           1       0.94      0.94      0.94       438

    accuracy                           0.94       876
   macro avg       0.94      0.94      0.94       876
weighted avg       0.94      0.94      0.94       876



In [9]:
confusion_matrix = confusion_matrix(y_true, similarities)

In [11]:
confusion_matrix = confusion_matrix(y_true, similarities)
ax = plt.subplot()
sns.heatmap(confusion_matrix,annot=True,fmt='.0f',cmap='Greys')

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion matrix') 
ax.xaxis.set_ticklabels(['non-homographs', 'homographs'])
ax.yaxis.set_ticklabels(['non-homographs', 'homographs'])

TypeError: 'numpy.ndarray' object is not callable

### Edit distance

In [12]:
# Make predictions
similarities = []

for word_pair in modified_list:

    similarity_value = metrics.levenshtein_distance(word_pair[0], word_pair[1])

    # print(similarity_value, word_pair[0], word_pair[1])

    if similarity_value <= 3:
        similarities.append(1)
    else:
        similarities.append(0)

print(classification_report(y_true, similarities))

              precision    recall  f1-score   support

           0       0.83      0.97      0.90       438
           1       0.96      0.81      0.88       438

    accuracy                           0.89       876
   macro avg       0.90      0.89      0.89       876
weighted avg       0.90      0.89      0.89       876



### Tversky index 

In [14]:
# Make predictions
similarities = []

for word_pair in modified_list:

    similarity_value = metrics.tversky_index(word_pair[0], word_pair[1])

    # print(similarity_value, word_pair[0], word_pair[1])

    if similarity_value >= 0.5:
        similarities.append(1)
    else:
        similarities.append(0)

print(classification_report(y_true, similarities))

              precision    recall  f1-score   support

           0       0.93      0.88      0.91       438
           1       0.89      0.94      0.91       438

    accuracy                           0.91       876
   macro avg       0.91      0.91      0.91       876
weighted avg       0.91      0.91      0.91       876



### Simple Matching Coefficient

In [15]:
# Make predictions
similarities = []

for word_pair in modified_list:

    similarity_value = metrics.simple_matching_coefficient(word_pair[0], word_pair[1])

    # print(similarity_value, word_pair[0], word_pair[1])

    if similarity_value >= 0.6:
        similarities.append(1)
    else:
        similarities.append(0)

print(classification_report(y_true, similarities))

              precision    recall  f1-score   support

           0       0.93      0.88      0.91       438
           1       0.89      0.94      0.91       438

    accuracy                           0.91       876
   macro avg       0.91      0.91      0.91       876
weighted avg       0.91      0.91      0.91       876



### Dice's coefficient

In [16]:
# Make predictions
similarities = []

for word_pair in modified_list:

    similarity_value = metrics.dices_coefficient(word_pair[0], word_pair[1])

    # print(similarity_value, word_pair[0], word_pair[1])

    if similarity_value >= 0.5:
        similarities.append(1)
    else:
        similarities.append(0)

print(classification_report(y_true, similarities))

              precision    recall  f1-score   support

           0       0.93      0.88      0.91       438
           1       0.89      0.94      0.91       438

    accuracy                           0.91       876
   macro avg       0.91      0.91      0.91       876
weighted avg       0.91      0.91      0.91       876



### Overlap coefficients

In [17]:
# Make predictions
similarities = []

for word_pair in modified_list:

    similarity_value = metrics.overlap_coefficient(word_pair[0], word_pair[1])

    # print(similarity_value, word_pair[0], word_pair[1])

    if similarity_value >= 0.55:
        similarities.append(1)
    else:
        similarities.append(0)

print(classification_report(y_true, similarities))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       438
           1       0.87      0.92      0.89       438

    accuracy                           0.89       876
   macro avg       0.89      0.89      0.89       876
weighted avg       0.89      0.89      0.89       876



## Save predictions

In [19]:
dataset = pd.read_csv("preds_dataset.csv")

In [20]:
dataset

Unnamed: 0.1,Unnamed: 0,id,pl,uk,false_friends
0,0,1,śniadanie,сніданок,0
1,1,2,wegetarianin,вегетаріанський,0
2,2,3,napój,напій,0
3,3,4,kawa,кава,0
4,4,5,sok,сік,0
...,...,...,...,...,...
871,871,872,pień,обрахувати,2
872,872,873,podróżować,порушити,2
873,873,874,czyhać,порвати,2
874,874,875,dwadzieścia,сапати,2


In [22]:
dataset["predictions"] = similarities

In [24]:
dataset["similarity_values"] = sims

In [25]:
dataset

Unnamed: 0.1,Unnamed: 0,id,pl,uk,false_friends,predictions,similarity_values
0,0,1,śniadanie,сніданок,0,1,0.823529
1,1,2,wegetarianin,вегетаріанський,0,1,0.800000
2,2,3,napój,напій,0,1,0.545455
3,3,4,kawa,кава,0,1,1.000000
4,4,5,sok,сік,0,1,0.666667
...,...,...,...,...,...,...,...
871,871,872,pień,обрахувати,2,0,0.142857
872,872,873,podróżować,порушити,2,1,0.400000
873,873,874,czyhać,порвати,2,0,0.153846
874,874,875,dwadzieścia,сапати,2,0,0.235294


In [29]:
dataset = dataset.drop("Unnamed: 0", axis=1)

In [30]:
dataset

Unnamed: 0,id,pl,uk,false_friends,predictions,similarity_values
0,1,śniadanie,сніданок,0,1,0.823529
1,2,wegetarianin,вегетаріанський,0,1,0.800000
2,3,napój,напій,0,1,0.545455
3,4,kawa,кава,0,1,1.000000
4,5,sok,сік,0,1,0.666667
...,...,...,...,...,...,...
871,872,pień,обрахувати,2,0,0.142857
872,873,podróżować,порушити,2,1,0.400000
873,874,czyhać,порвати,2,0,0.153846
874,875,dwadzieścia,сапати,2,0,0.235294


In [31]:
dataset.to_csv(
    "/home/klychliiev/Desktop/Automatic_false_friends_detection_for_Ukrainian_and_Polish_languages/datasets/data_sim_values.csv"
)