In [7]:
from transformers import AutoTokenizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from huggingface_hub import notebook_login

from datasets import *
import numpy as np
import re
import random
from memory_profiler import profile


In [8]:

dataset = load_from_disk('AR-dotted-mediumPlus-arrow')

In [9]:
tokenizer = AutoTokenizer.from_pretrained("AR-dotted-tokenizer")


In [10]:
test_sentance = "وجميل [MASK] الطقس"
test_sentance

'وجميل [MASK] الطقس'

In [11]:
new = allVariation(parse_text("معتدل")[1])
print(new)
print("Length:", len(new))
print("Duplicates:", len(new) != len(set(new)))


NameError: name 'allVariation' is not defined

In [12]:
def mask_random_word(input_string, seed=None):
    words = input_string.split()
    print(words)
    if seed is not None:
        random.seed(seed)

    if words:
        index = random.randint(0, len(words) - 1)
        word_to_replace = words[index]
        words[index] = "[MASK]"
        modified_string = ' '.join(words)
        return modified_string, word_to_replace
    else:
        return input_string, None

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['clean'],
        num_rows: 2980842
    })
})

In [14]:

#test_sentance = "الطقس معتدل وجميل"
test_sentance = dataset["train"][0]['clean']
seed = None
#seed = 1
masked_sentance, target = mask_random_word(test_sentance, seed=seed)
variations = allVariation(parse_text(target)[1])
print("Original Sentance:", test_sentance)
print("Masked Sentance:", masked_sentance)
print("Target:", target)
print("Variations:", variations)

print()
print()
print()

# sanity check to make sure the way arabic is messing up with 
# english in text display isn't actually how it is.
masked_Array = masked_sentance.split()
for i in range(len(masked_Array)):
    print(masked_Array[i])
print()
print()
print()

inputs_c = tokenizer(masked_sentance)
print(inputs_c)
print(tokenizer.convert_ids_to_tokens(inputs_c["input_ids"]))
print(masked_sentance)

['زكريا', 'محيي', 'الدين', 'يحيى', 'بن', 'شرف', 'النووي', 'المتوفى', 'ه', 'ج', 'ص']


NameError: name 'allVariation' is not defined

## New Dataset creation, MASKED sentance, list of possible targets, and label (target)

In [15]:
# Replaces multiple spaces, or whatever char designated by the perameter into one char. (helper function)
def replace(string, char):
    while char+char in string:
        string = string.replace(char+char, char)

    return string

def reverse_dict_with_duplicates(input_dict):
    reversed_dict = {}
    
    for key, value in input_dict.items():
        if value not in reversed_dict:
            reversed_dict[value] = [key]
        else:
            reversed_dict[value].append(key)
    
    return reversed_dict

letter_dict = {
    "ا": "ا",
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ب": "ب",
    "ت": "ب",
    "ث": "ب",
    "ج": "ح",
    "ح": "ح",
    "خ": "ح",
    "د": "د",
    "ذ": "د",
    "ر": "ر",
    "ز": "ر",
    "س": "س",
    "ش": "س",
    "ص": "ص",
    "ض": "ص",
    "ط": "ط",
    "ظ": "ط",
    "ع": "ع",
    "غ": "ع",
    "ف": "ف",
    "ق": "ف",
    "ك": "ك",
    "ل": "ل",
    "م": "م",
    "ن": "ن",
    "و": "و",
    "ؤ": "و",
    "ه": "ه",
    "ة": "ه",
    "ي": "ى",
    "ى": "ى",
    "ئ": "ى",
    " ": "_",
    
}
letter_set = set(letter_dict.keys())
iLetters = reverse_dict_with_duplicates(letter_dict)

In [16]:
def uMap_parse_dotless_text(text):
  dotless_chars = [letter_dict[char] for char in text]
  dotless_string = ''.join(dotless_chars)
        
  dotless_string = replace(dotless_string, '_')
    
  return dotless_string

In [17]:
def uMap_allVariation(root, i = 0):
    tWord = []
    word = [a for a in root]
    char = word[i]
    for dot in iLetters[char]:
        word[i] = dot
        vWord = ''.join(word)
        if i != len(word) - 1:
            tWord.extend(uMap_allVariation(vWord, i+1))
        else:
            #if vWord in dictionary:
            tWord.append(vWord)
    return tWord

In [18]:
import time


In [19]:
each_n = 0

In [20]:
def fastuMap_allVariation(root):
    stack = [(list(root), 0)]
    variations = []

    while stack:
        word, i = stack.pop()
        char = word[i]

        for dot in iLetters[char]:
            word[i] = dot
            if i == len(word) - 1:
                variations.append(''.join(word))
            else:
                stack.append((list(word), i + 1))

    return variations

In [21]:
listabc123 = fastuMap_allVariation(uMap_parse_dotless_text("زكريازكريا"))
print(len(listabc123))
print(len(listabc123) == len(set(listabc123)))
listabc123

2304
True


['زكزئآزكزئا',
 'زكزئآزكزئأ',
 'زكزئآزكزئإ',
 'زكزئآزكزئآ',
 'زكزئآزكزىا',
 'زكزئآزكزىأ',
 'زكزئآزكزىإ',
 'زكزئآزكزىآ',
 'زكزئآزكزيا',
 'زكزئآزكزيأ',
 'زكزئآزكزيإ',
 'زكزئآزكزيآ',
 'زكزئآزكرئا',
 'زكزئآزكرئأ',
 'زكزئآزكرئإ',
 'زكزئآزكرئآ',
 'زكزئآزكرىا',
 'زكزئآزكرىأ',
 'زكزئآزكرىإ',
 'زكزئآزكرىآ',
 'زكزئآزكريا',
 'زكزئآزكريأ',
 'زكزئآزكريإ',
 'زكزئآزكريآ',
 'زكزئآركزئا',
 'زكزئآركزئأ',
 'زكزئآركزئإ',
 'زكزئآركزئآ',
 'زكزئآركزىا',
 'زكزئآركزىأ',
 'زكزئآركزىإ',
 'زكزئآركزىآ',
 'زكزئآركزيا',
 'زكزئآركزيأ',
 'زكزئآركزيإ',
 'زكزئآركزيآ',
 'زكزئآركرئا',
 'زكزئآركرئأ',
 'زكزئآركرئإ',
 'زكزئآركرئآ',
 'زكزئآركرىا',
 'زكزئآركرىأ',
 'زكزئآركرىإ',
 'زكزئآركرىآ',
 'زكزئآركريا',
 'زكزئآركريأ',
 'زكزئآركريإ',
 'زكزئآركريآ',
 'زكزئإزكزئا',
 'زكزئإزكزئأ',
 'زكزئإزكزئإ',
 'زكزئإزكزئآ',
 'زكزئإزكزىا',
 'زكزئإزكزىأ',
 'زكزئإزكزىإ',
 'زكزئإزكزىآ',
 'زكزئإزكزيا',
 'زكزئإزكزيأ',
 'زكزئإزكزيإ',
 'زكزئإزكزيآ',
 'زكزئإزكرئا',
 'زكزئإزكرئأ',
 'زكزئإزكرئإ',
 'زكزئإزكرئآ',
 'زكزئإزكرىا',
 'زكزئإزكرىأ',
 'زكزئإزكر

In [22]:
%load_ext memory_profiler


In [23]:
@profile
def MeMuMap_parse_dotless_text(text):
  dotless_chars = [letter_dict[char] for char in text]
  dotless_string = ''.join(dotless_chars)
        
  dotless_string = replace(dotless_string, '_')
    
  return dotless_string

In [24]:
@profile
def MeMfastuMap_allVariation(root):
    stack = [(list(root), 0)]
    variations = []

    while stack:
        word, i = stack.pop()
        char = word[i]

        for dot in iLetters[char]:
            word[i] = dot
            if i == len(word) - 1:
                variations.append(''.join(word))
            else:
                stack.append((list(word), i + 1))

    return variations

In [25]:
def helperMemTest(n, text):
    wordListsGenerated = []
    for i in range(n): 
        wordListsGenerated.append(MeMfastuMap_allVariation(MeMuMap_parse_dotless_text(text)))
    return wordListsGenerated
    


In [28]:
n = 1
prediction_n = each_n*n

print("predicted time for", n, "examples:", prediction_n, "seconds")
print("\n\n\n\n")
t0 = time.time()

words = helperMemTest(n, "زكريازكريا")
t1 = time.time()


total_n = (t1-t0)
each_n = total_n/n
print("average per example:", each_n, "seconds")
print("--------------------------------------------------------------------------")
print("predicted time for", n, "examples:", prediction_n, "seconds")
print("actual time for", n, "examples:   ", total_n, "seconds")
print("accuracy of prediction:", ((1 - abs(total_n - prediction_n)/total_n))*100, "%")

print("\n\n\n")
print("Length of gerenerated list:", len(words[0]))


predicted time for 1 examples: 0.007312774658203125 seconds





ERROR: Could not find file /var/folders/74/rsfrzb6x50l9hjwqyty2zw000000gn/T/ipykernel_22579/2064395828.py
Filename: /var/folders/74/rsfrzb6x50l9hjwqyty2zw000000gn/T/ipykernel_22579/1535695610.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     1    191.8 MiB    191.8 MiB           1   @profile
     2                                         def MeMfastuMap_allVariation(root):
     3    191.8 MiB      0.0 MiB           1       stack = [(list(root), 0)]
     4    191.8 MiB      0.0 MiB           1       variations = []
     5                                         
     6    191.8 MiB      0.0 MiB        1030       while stack:
     7    191.8 MiB      0.0 MiB        1029           word, i = stack.pop()
     8    191.8 MiB      0.0 MiB        1029           char = word[i]
     9                                         
    10    191.8 MiB      0.0 MiB        4361           for dot in iLetters[char]:
    1

In [21]:
#def helper_map_allVariation(example):
 #   strings = []
 #   print(example)
  #  for text in example:
   #     print(text)
    #    strings.append(map_allVariation(text))

In [28]:
def map_mask_random_word(example):
    masked = []
    options = []
    targets = []

    for text in example["clean"]:
        words = text.split()

        index = random.randint(0, len(words) - 1)  # Generate a random index for each sentence

        for i, word in enumerate(words):
            if i == index:
                word_to_replace = words[i]
                variations = fastuMap_allVariation(uMap_parse_dotless_text(word_to_replace))
                words[i] = "[MASK]"
                modified_string = ' '.join(words)
                masked.append(modified_string)
                options.append(variations)
                targets.append(word_to_replace)
    
    return {"Masked": masked, "Options": options, "Target": targets}

In [29]:
profile()

<function memory_profiler.profile.<locals>.inner_wrapper(f)>

In [30]:
n_run_test1 = 1
a_t= 0
b_t = 1000
a_list = []
for i in range(0,n_run_test1):
    testarraylisthting = n_dataset[0:1*b]
    a_list.append(map_mask_random_word(testarraylisthting))
    a_t += 1000
    b_t += 1000


NameError: name 'b' is not defined

In [31]:
dataset

DatasetDict({
    train: Dataset({
        features: ['clean'],
        num_rows: 2980842
    })
})

In [39]:
#n_dataset= dataset
n_dataset = dataset['train'].select(range(500001)) #Trim to number of examples #previously 100,000
print(n_dataset, '\n')
print(n_dataset[0])

Dataset({
    features: ['clean'],
    num_rows: 500001
}) 

{'clean': 'زكريا محيي الدين يحيى بن شرف النووي المتوفى ه ج ص'}


In [40]:
m_dataset = n_dataset.map(map_mask_random_word, batched= True, remove_columns="clean", num_proc = 4 ) #Apply the function (convert dotted to dotless)


Map (num_proc=4):   0%|          | 0/500001 [00:00<?, ? examples/s]

In [44]:
print(m_dataset)
m_dataset[2]


Dataset({
    features: ['Masked', 'Options', 'Target'],
    num_rows: 500001
})


{'Masked': 'قبلها وساعتها لقيت حسن وابراهيم [MASK] العجلة هيسبقوني وبكده هفقد درع الحماية',
 'Options': ['غلي', 'غلى', 'غلئ', 'علي', 'على', 'علئ'],
 'Target': 'علي'}

In [42]:
dataset_resplit = m_dataset.train_test_split(train_size= 0.999999999) #create 80/20 split into train and test datasets
del dataset_resplit['test'] # no longer using a test set
dataset_resplit


DatasetDict({
    train: Dataset({
        features: ['Masked', 'Options', 'Target'],
        num_rows: 500000
    })
})

In [43]:
dataset_resplit.save_to_disk("AR-multi-dotted-Small-arrow")

Saving the dataset (0/23 shards):   0%|          | 0/500000 [00:00<?, ? examples/s]