# Processing Dataset


### Functions



In [34]:
from datasets import * #suprisingly the only import needed.

In [35]:
# A dictionary of arabic letters from dotted to dotless. (credit: @kaddu341)
letter_dict = {
    "ا": "ا",
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ب": "ب",
    "ت": "ب",
    "ث": "ب",
    "ج": "ح",
    "ح": "ح",
    "خ": "ح",
    "د": "د",
    "ذ": "د",
    "ر": "ر",
    "ز": "ر",
    "س": "س",
    "ش": "س",
    "ص": "ص",
    "ض": "ص",
    "ط": "ط",
    "ظ": "ط",
    "ع": "ع",
    "غ": "ع",
    "ف": "ف",
    "ق": "ف",
    "ك": "ك",
    "ل": "ل",
    "م": "م",
    "ن": "ن",
    "و": "و",
    "ؤ": "و",
    "ه": "ه",
    "ة": "ه",
    "ي": "ى",
    "ى": "ى",
    "ئ": "ى",
    " ": " ",
}
letter_set = set(letter_dict.keys())


In [36]:
# Replaces multiple spaces, or whatever char designated by the perameter into one char. (helper function)
def replace(string, char):
    while char+char in string:
        string = string.replace(char+char, char)

    return string


In [37]:
# Essential function, parses arabic dotted text into dotless text. 
# Returns a dict with keys "clean" and "dotless". 
# This format is required for the datasets.map() function.

#unused, see below functions, also this is not batched
def parse_text(text):
    
  text = text["text"].strip(" ")
  
  clean_chars = [char for char in text if char in letter_set]
  clean_string = ''.join(clean_chars)

  dotless_chars = [letter_dict[char] for char in clean_chars]
  dotless_string = ''.join(dotless_chars)
      
  clean_string = replace(clean_string, ' ')
  dotless_string = replace(dotless_string, '_')
  
  return {"clean": clean_string, "dotless": dotless_string}


In [38]:
def parse_clean_text(example):
  
  clean = []
  
  for text in example["text"]:
    text = text.strip(" ")
    
    clean_chars = [char for char in text if char in letter_set]
    clean_string = ''.join(clean_chars)
    
    clean_string = replace(clean_string, ' ')
    clean.append(clean_string)
  
  return {"clean": clean}

In [39]:
def parse_dotless_text(example):
  dotless = []
  for text in example["clean"]:
    dotless_chars = [letter_dict[char] for char in text]
    dotless_string = ''.join(dotless_chars)
        
    dotless_string = replace(dotless_string, '_')
    dotless.append(dotless_string)
    
  return {"dotless": dotless}

In [40]:
#test
text = {"text": "ولا تنسى التسجيل معنا لتستفيد بكل جديد"}
print(text, "\n")

text = parse_clean_text(text)
print(text, "\n")

text = parse_dotless_text(text)
print(text, "\n")


{'text': 'ولا تنسى التسجيل معنا لتستفيد بكل جديد'} 

{'clean': ['و', 'ل', 'ا', '', 'ت', 'ن', 'س', 'ى', '', 'ا', 'ل', 'ت', 'س', 'ج', 'ي', 'ل', '', 'م', 'ع', 'ن', 'ا', '', 'ل', 'ت', 'س', 'ت', 'ف', 'ي', 'د', '', 'ب', 'ك', 'ل', '', 'ج', 'د', 'ي', 'د']} 

{'dotless': ['و', 'ل', 'ا', '', 'ب', 'ن', 'س', 'ى', '', 'ا', 'ل', 'ب', 'س', 'ح', 'ى', 'ل', '', 'م', 'ع', 'ن', 'ا', '', 'ل', 'ب', 'س', 'ب', 'ف', 'ى', 'د', '', 'ب', 'ك', 'ل', '', 'ح', 'د', 'ى', 'د']} 



In [41]:
# Used for the datasets.filter() function to filter out examples/rows that have examples of less than 20 chars.
#def filterEmpty(x):
#    return not len(x['clean']) < 20

#Not used, lambda function used instead

### Loading, saving, and trimming Oscar (unshuffled_deduplicated_ar) dataset

In [42]:
#load the oscar unshuffled_deduplicated_ar dataset from huggingface 
#datasetLoad = load_dataset("oscar", "unshuffled_deduplicated_ar") #Oscar Arabic dataset

In [43]:
#save the dataset to "ar-arrow-datasets/" Apache Arrow datafolder
#datasetLoad.save_to_disk('ar-arrow-datasets')

In [44]:
#load from file
full_dataset = load_from_disk('/Users/ammar/Developer/git-repos/dotless/data/ar-arrow-datasets')

In [45]:
print(full_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 9006977
    })
})


### Applying maps, filters, and splits
This process of applying maps and filters is very slow especially for big datasets. 

It is possible to implement a batched approach similar ot batched tokenization that would speed it up greatly. 

In the .map() add perameter batched=True. However the function called needs to be compatible with this approach.

I will implement this soon.

Generally this is not the way to handle big data. However this is fine for the AR-dotless-small dataset

In [46]:
# Gets the index of the #th space.
def num_word(text, num):
    space_count = 0
    index = -1
    for i in range(num):
        index = text.find(' ', index + 1)
    return index

In [47]:
import math
exampleToWordsLen = [] # just to keep track of how many chunks an old  made

In [48]:
print(exampleToWordsLen[:5])

[]


In [61]:

# Chunks examples into smaller examples, at @words_per_chuck words each
def chunk_examples(examples, words_per_chunk=12): #previously 8
    chunks = []
    for sentence in examples["clean"]:
        words = sentence.split()
        num_words = len(words)
        exampleToWordsLen.append(math.ceil(num_words/words_per_chunk)) # just to keep track of how many chunks an old example made
        start = 0
        while start < num_words:
            end = min(start + words_per_chunk, num_words)
            chunk = ' '.join(words[start:end])
            chunks.append(chunk)
            start = end
    
    return {"clean": chunks}

In [62]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    #concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    concatenated_examples = {k: ''.join(examples[k]) for k in examples.keys()}

    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [63]:
dataset = full_dataset['train'].select(range(200000)) #Trim to number of examples #previously 100,000
dataset = dataset.remove_columns(["id"])
print(dataset, '\n')

dataset = dataset.filter(lambda x: not len(x['text']) < 20) #Apply the filter to remove useless empty examples.
print(dataset, '\n')
        
dataset = dataset.map(parse_clean_text, batched=True, remove_columns="text") #Apply the function (convert dotted to dotless)
set_state_1 = dataset["clean"][0]
print(dataset, '\n')

# test to see how many examples are under 20 characters. Goal of filterEmpty() to eliminate
a = 0
for i in dataset['clean']:
    if len(i) < 40:
        a += 1
print(a, '\n')

dataset = dataset.map(chunk_examples, batched=True, remove_columns=dataset.column_names) # applies the chuncking
#dataset = dataset.map(group_texts, batched=True, num_proc=4, remove_columns=dataset.column_names) 

set_state_2 = dataset["clean"][0:exampleToWordsLen[0]]
#set_state_2 = dataset["clean"]

print(dataset, '\n')

dataset = dataset.map(parse_dotless_text, batched=True) #Apply the function (convert dotted to dotless)
set_state_3 = dataset["dotless"][0:exampleToWordsLen[0]]
#set_state_3 = dataset["dotless"]

print(dataset, '\n')

dataset = dataset.filter(lambda x: not len(x['clean']) < 20) #Apply the filter to remove useless empty examples.
print(dataset, '\n')

Dataset({
    features: ['text'],
    num_rows: 200000
}) 

Dataset({
    features: ['text'],
    num_rows: 200000
}) 

Dataset({
    features: ['clean'],
    num_rows: 200000
}) 

586 



Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Dataset({
    features: ['clean'],
    num_rows: 6108967
}) 



Map:   0%|          | 0/6108967 [00:00<?, ? examples/s]

Dataset({
    features: ['clean', 'dotless'],
    num_rows: 6108967
}) 



Filter:   0%|          | 0/6108967 [00:00<?, ? examples/s]

Dataset({
    features: ['clean', 'dotless'],
    num_rows: 6060646
}) 



In [64]:
print(set_state_1,"\n\n")
print(set_state_2,"\n\n")
print(set_state_3)


مرحبا بك عزيز الزائر نتمنى لك أوقاتا سعيدة معنا وأن نزداد شرفا بخدمتك ولا تنسى التسجيل معنا لتستفيد بكل جديدأهلا وسهلا بك زائرنا الكريم أنت لم تقم بتسجيل الدخول بعد يشرفنا أن تقوم بالدخول أو التسجيل إذا رغبت بالمشاركة في المنتدىنرحب بكل الزائرين ونتمى لكم قضا وقت ممتع معنا يملأه الحب والود والاستفادة المتبادلة بيننا علميا وعمليا يسعدنا تسجيلكم معنا ومشاركتنا وشعارنا دوما نحب الخير لكل الناس مهما اختلفت الألوان والديانات والأجناس لي أربع شقيقات أنا أكثرهن غنى لكن لا أدري لماذا يأتي أقاربي لزيارة أخواتي بكثرةوحينما يأتي موعد زيارتي لا يأتي سوى القليلفهم يزورون أخواتي الأربع كل يومأماأنا أكثر أخواتي عطا لمن يأتيني لا أتهم أخواتي بالتقصير أبدا ولكن الكل يعرف أني أكثرهن عطاكثيرون ينصحون أقاربي بأن يأتوني فلدي خير كثير وأعطي بكرم من يأتيني ومع ذلك يبتعدون عني فلا حياة لمن تنادي اختر منتدىقناة عجباوي التلفزيونية التسويقية البرامج التلفزيونية لقناة عجباوي التسويقية تعرف على عجباوي سيرة ذاتية سابقة أعمال أرشيف تصميمات عجباوي إعلانات الشركات والمحلات أغلفة كتب تنسيقات المتن الداخلي للكتب الدفاتر

In [None]:
# test to see how many examples are under 20 characters. Goal of filterEmpty() to eliminate
a = 0
for i in dataset['clean']:
    if len(i) < 20:
        a += 1
a

In [66]:
dataset_clean = dataset.train_test_split(train_size= 0.999999999) #create 80/20 split into train and test datasets
del dataset_clean['test'] # no longer using a test set
dataset_clean


DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 6060645
    })
})

In [67]:
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 6060645
    })
})

### Save and upload to hugging face

In [68]:
#save the cleaned dataset to "clean-arrow-datasets/" Apache Arrow datafolder
dataset_clean.save_to_disk("AR-dotless-2MediumPlus-arrow")

Saving the dataset (0/4 shards):   0%|          | 0/6060645 [00:00<?, ? examples/s]

In [None]:
#required token id. Run 'huggingface-cli login'
#dataset_clean.push_to_hub("dot-ammar/AR-dotless-2MediumPlus")