In [32]:
from itertools import product

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [10]:
def reverse_dict_with_duplicates(input_dict):
    """Invert a mapping while keeping every key that shares a value.

    Args:
        input_dict: Mapping whose values are hashable.

    Returns:
        A new dict mapping each distinct value of ``input_dict`` to the list of
        keys that pointed at it, listed in ``input_dict`` iteration order.
    """
    inverted = {}
    for source_key, source_value in input_dict.items():
        # setdefault creates the bucket on first sight of a value, then reuses it.
        inverted.setdefault(source_value, []).append(source_key)
    return inverted

In [11]:
# Maps every character that parse_text keeps to the character used for it in
# the "dotless" string. Several keys share one value (many-to-one), and the
# space maps to itself. Entry order matters: it fixes the order of the lists
# stored in iLetters below.
letter_dict = {
    "ا": "ا",
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ب": "ب",
    "ت": "ب",
    "ث": "ب",
    "ج": "ح",
    "ح": "ح",
    "خ": "ح",
    "د": "د",
    "ذ": "د",
    "ر": "ر",
    "ز": "ر",
    "س": "س",
    "ش": "س",
    "ص": "ص",
    "ض": "ص",
    "ط": "ط",
    "ظ": "ط",
    "ع": "ع",
    "غ": "ع",
    "ف": "ف",
    "ق": "ف",
    "ك": "ك",
    "ل": "ل",
    "م": "م",
    "ن": "ن",
    "و": "و",
    "ؤ": "و",
    "ه": "ه",
    "ة": "ه",
    "ي": "ى",
    "ى": "ى",
    "ئ": "ى",
    " ": " ",
}
# Characters accepted by parse_text; anything outside this set is dropped there.
letter_set = set(letter_dict.keys())
# Inverse of letter_dict: each value -> list of the keys that map to it,
# in letter_dict insertion order. Used by allVariation to expand a character.
iLetters = reverse_dict_with_duplicates(letter_dict)

In [12]:
def replace(string, char):
    """Collapse every run of ``char`` in ``string`` into a single ``char``.

    Helper used to squeeze repeated separators (for example multiple spaces).

    Args:
        string: Text to process.
        char: The character (or substring) whose consecutive repetitions are
            collapsed. An empty ``char`` leaves ``string`` unchanged.

    Returns:
        ``string`` with no two adjacent occurrences of ``char`` left.
    """
    # "" is contained in every string, so without this guard the loop below
    # would never terminate for an empty ``char``.
    if not char:
        return string

    doubled = char + char
    # One pass of str.replace can leave new adjacent pairs behind
    # (e.g. three spaces -> two), so repeat until none remain.
    while doubled in string:
        string = string.replace(doubled, char)

    return string

In [13]:
def parse_text(text):
    """Normalise text into a cleaned string and its dotless counterpart.

    Characters that are not keys of ``letter_dict`` (anything outside
    ``letter_set``) are dropped. The remaining characters form the "clean"
    string; mapping each of them through ``letter_dict`` gives the "dotless"
    string. Repeated spaces are collapsed and surrounding spaces removed in
    both, so the two results always have the same length and line up
    character for character.

    Args:
        text: Input string.

    Returns:
        dict with two entries: ``"clean"`` (the filtered text) and
        ``"dotless"`` (the filtered text mapped through ``letter_dict``).
    """
    clean_chars = [char for char in text if char in letter_set]
    dotless_chars = [letter_dict[char] for char in clean_chars]

    # Strip after filtering: removing a leading/trailing character that is not
    # in letter_set can expose a space that was not at the edge before.
    clean_string = replace(''.join(clean_chars), ' ').strip(' ')
    # letter_dict maps " " to " ", so the separator to collapse in the dotless
    # string is a space too (collapsing "_" here would be a no-op and leave the
    # two strings misaligned).
    dotless_string = replace(''.join(dotless_chars), ' ').strip(' ')

    return {"clean": clean_string, "dotless": dotless_string}

In [34]:
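# Arabic word list loaded into a set named `dictionary`.
# allVariation(..., check=True) looks words up in `dictionary`, so uncomment
# the two lines below (with 'arabic-wordlist-1.6.txt' in the working directory)
# before calling it with a truthy `check`.
# with open('arabic-wordlist-1.6.txt', 'r') as f:
#     dictionary = set(f.read().splitlines())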


In [35]:
def allVariation(root, check, i = 0 ):
    """Enumerate the spellings obtainable by expanding each character of ``root``.

    Every character of ``root`` from index ``i`` onward is replaced by each of
    the letters ``iLetters`` lists for it, and all combinations are produced.
    Characters before index ``i`` are kept unchanged. Results are ordered with
    earlier positions varying slowest, following the order of the lists in
    ``iLetters``.

    Args:
        root: String to expand.
        check: When falsy, every combination is returned. When truthy, only
            combinations contained in the module-level ``dictionary`` are
            returned; ``dictionary`` is not defined by this function and must
            exist before such a call, otherwise a NameError is raised.
        i: Index of the first character to expand (default 0).

    Returns:
        list of str. Empty when there is nothing to expand (empty ``root`` or
        ``i`` at/past its end) or when no combination passes the check.

    Note:
        The number of combinations is the product of the per-character
        alternative counts, so it grows quickly with the length of ``root``.
    """
    tail = root[i:]
    if not tail:
        # Nothing to expand; indexing root[i] here used to raise IndexError.
        return []

    prefix = ''.join(root[:i])
    # A character with no entry in iLetters has no alternatives: keep it as-is
    # instead of failing with KeyError.
    choices = [iLetters.get(char, [char]) for char in tail]

    tWord = []
    # product() is lazy, so with check enabled only the accepted words are stored.
    for combo in product(*choices):
        vWord = prefix + ''.join(combo)
        if not check or vWord in dictionary:
            tWord.append(vWord)
    return tWord

In [36]:
def find_different_indices(str1, str2):
    """Return the positions at which two strings differ.

    Positions inside the common length are reported when the characters are
    unequal; every position beyond the end of the shorter string is reported
    as different as well.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Sorted list of 0-based indices where the strings disagree.
    """
    shorter_len, longer_len = sorted((len(str1), len(str2)))

    # Mismatches within the overlapping part.
    mismatches = [
        position
        for position, (left, right) in enumerate(zip(str1, str2))
        if left != right
    ]
    # The overhang of the longer string counts as different in full.
    mismatches.extend(range(shorter_len, longer_len))

    return mismatches

In [37]:
# Build a transformers pipeline around the "dot-ammar/dotless_model-small" checkpoint.
# No task is passed explicitly. Calling translator(text) yields a list of dicts
# whose 'generated_text' entry holds the model output (as used in later cells).
translator = pipeline(model="dot-ammar/dotless_model-small")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at dot-ammar/dotless_model-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [28]:
#translator.save_pretrained('dotless-model-small')

In [19]:
# Round-trip check: strip a fully vocalised sentence down to its clean and
# dotless forms, let the model restore the dots, and compare with the target.
test = parse_text(" يَا أَيُّهَا الَّذِينَ آمَنُوا لَا تَسْأَلُوا عَنْ أَشْيَاءَ إِن تُبْدَ لَكُمْ تَسُؤْكُمْ وَإِن تَسْأَلُوا عَنْهَا حِينَ")
testDotless, testClean = test["dotless"], test["clean"]
testOutput = translator(testDotless)[0]['generated_text']

print(f"Input (dotless): {testDotless}")
print(f"Target (clean):  {testClean}")
print(f"Output (dotted): {testOutput}")

# Character positions where the model output disagrees with the target.
find_different_indices(testClean, testOutput)

Input (dotless): ىا اىها الدىن امنوا لا بسالوا عن اسىا ان ببد لكم بسوكم وان بسالوا عنها حىن
Target (clean):  يا أيها الذين آمنوا لا تسألوا عن أشيا إن تبد لكم تسؤكم وإن تسألوا عنها حين
Output (dotted): يا أيها الذين آمنوا لا تسألوا عن آسيا أن ببد لكم تشوكم وأن تسألوا عنها حين


[33, 34, 38, 41, 50, 51, 56]

In [20]:
# Sample input strings passed to the translator in the next cell.
example1 = "فرا احمد الفران"
example2 = "لبسبفىد بكل حدىد"
example3 = "لدىنا عبر نرسل معلوماب الححر عبر البرىد الالكبرونى"


In [21]:
# Run the model on each sample string, then show the raw pipeline results
# (one list of {'generated_text': ...} dicts per input).
out_example1, out_example2, out_example3 = (
    translator(sample) for sample in (example1, example2, example3)
)

print(out_example1, out_example2, out_example3, sep="\n")

[{'generated_text': 'قرأ أحمد القرآن'}]
[{'generated_text': 'لتس اشت بكل جديد'}]
[{'generated_text': 'لدينا عبر نرسل معلومات الحجر عبر البريد الإلكتروني'}]
