# Processing Dataset


### Functions



In [1]:
from datasets import * # Surprisingly the only import needed; the cells below use load_dataset and load_from_disk from it.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dotted -> dotless lookup table for Arabic letters (credit: @kaddu341).
# Each row is (target form, every letter that is rewritten to that target).
# Single-letter rows map a letter to itself; the space is passed through unchanged.
letter_dict = {
    dotted: base
    for base, variants in (
        ("ا", ("ا", "أ", "إ", "آ")),
        ("ب", ("ب", "ت", "ث")),
        ("ح", ("ج", "ح", "خ")),
        ("د", ("د", "ذ")),
        ("ر", ("ر", "ز")),
        ("س", ("س", "ش")),
        ("ص", ("ص", "ض")),
        ("ط", ("ط", "ظ")),
        ("ع", ("ع", "غ")),
        ("ف", ("ف", "ق")),
        ("ك", ("ك",)),
        ("ل", ("ل",)),
        ("م", ("م",)),
        ("ن", ("ن",)),
        ("و", ("و", "ؤ")),
        ("ه", ("ه", "ة")),
        ("ى", ("ي", "ى", "ئ")),
        (" ", (" ",)),
    )
    for dotted in variants
}
# Every character that has a mapping; anything outside this set has no entry in letter_dict.
letter_set = set(letter_dict)


In [3]:
# Helper: collapses runs of a repeated character (e.g. multiple spaces) into a single one.
def replace(string, char):
    """Collapse every run of consecutive `char` in `string` into one `char`.

    Args:
        string: The text to normalise.
        char: The character (or substring) whose repetitions are collapsed.
            An empty `char` leaves `string` untouched.

    Returns:
        The string with no two adjacent occurrences of `char` remaining.
    """
    # Guard: '' + '' is contained in every string and replacing '' with ''
    # changes nothing, so without this check the loop below would never end.
    if not char:
        return string

    doubled = char + char
    # Each pass halves the length of a run; repeat until no doubled char is left.
    while doubled in string:
        string = string.replace(doubled, char)

    return string


In [4]:
# Essential function: converts dotted Arabic text into dotless text.
# Shaped for datasets.map(): takes one row, returns the new columns as a dict.

def parse_text(text):
    """Build the "clean" and "dotless" versions of one dataset row.

    Args:
        text: A row mapping with a "text" key holding the raw string.

    Returns:
        A dict with two keys:
          "clean"   - the input restricted to characters in `letter_set`,
                      with runs of spaces collapsed to a single space.
          "dotless" - the "clean" string with every character mapped
                      through `letter_dict`.
    """
    raw = text["text"].strip(" ")

    # Keep only characters that have an entry in the mapping table.
    kept = ''.join(char for char in raw if char in letter_set)

    # Dropping characters can leave several spaces in a row; collapse them.
    clean_string = replace(kept, ' ')

    # Map the already-collapsed string so both columns line up character for
    # character. The earlier version collapsed '_' on the dotless side, but the
    # table never produces '_', so that column kept its repeated spaces.
    output_string = ''.join(letter_dict[char] for char in clean_string)

    return {"clean": clean_string, "dotless": output_string}


In [5]:
# Predicate for datasets.filter(): drops rows whose "clean" text is too short to be useful.
def filterEmpty(x, min_length=20):
    """Return True when the row's "clean" text has at least `min_length` characters.

    Args:
        x: A row mapping with a "clean" key holding a string.
        min_length: Minimum number of characters required to keep the row.
            Defaults to 20, the threshold this notebook has always used.

    Returns:
        True to keep the row, False to filter it out.
    """
    return len(x["clean"]) >= min_length

### Loading, saving, and trimming Oscar (unshuffled_deduplicated_ar) dataset

In [7]:
# Load the "unshuffled_deduplicated_ar" configuration of the "oscar" dataset from Hugging Face.
datasetLoad = load_dataset("oscar", "unshuffled_deduplicated_ar") # OSCAR Arabic dataset

In [8]:
# Save the loaded dataset to the "ar-arrow-datasets/" folder (Apache Arrow format),
# so it can be reloaded from disk in the next cell.
datasetLoad.save_to_disk('ar-arrow-datasets')

Saving the dataset (67/67 shards): 100%|██████████| 9006977/9006977 [01:20<00:00, 112363.18 examples/s]


In [9]:
# Reload the copy saved in "ar-arrow-datasets/"; the cells below work from this on-disk dataset.
full_dataset = load_from_disk('ar-arrow-datasets')

In [10]:
# Show the available splits with their columns and row counts.
print(full_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 9006977
    })
})


### Applying maps, filters, and splits
Applying maps and filters this way is very slow, especially for big datasets.

It is possible to implement a batched approach, similar to batched tokenization, that would speed it up greatly.

To do so, pass the parameter `batched=True` to `.map()`. However, the mapped function must then be written to accept a batch of rows instead of a single row.

I will implement this soon.

Generally, this is not the way to handle big data. However, it is fine for the AR-dotless-small dataset.

In [11]:
# Work on the first 100000 rows of the train split only.
dataset = full_dataset['train'].select(range(100000))
print(dataset)

# 1. parse_text adds the "clean" and "dotless" columns,
# 2. the raw "text" and "id" columns are dropped,
# 3. filterEmpty discards rows whose "clean" text is too short to be useful.
dataset = (
    dataset
    .map(parse_text)
    .remove_columns(["text", "id"])
    .filter(filterEmpty)
)
print(dataset)


Dataset({
    features: ['id', 'text'],
    num_rows: 100000
})


Map: 100%|██████████| 100000/100000 [00:22<00:00, 4495.29 examples/s]
Filter: 100%|██████████| 100000/100000 [00:02<00:00, 42861.54 examples/s]

Dataset({
    features: ['clean', 'dotless'],
    num_rows: 99573
})





In [12]:
# Sanity check for filterEmpty(): count the rows whose "clean" text is still
# shorter than 20 characters. The expected result is 0.
a = sum(1 for text in dataset['clean'] if len(text) < 20)
a

0

In [13]:
# Create an 80/20 split into train and test datasets.
# The split shuffles the rows; a fixed seed makes the resulting train/test
# membership reproducible when the notebook is re-run from a fresh kernel.
dataset_clean = dataset.train_test_split(train_size=0.8, seed=42)
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 79658
    })
    test: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 19915
    })
})

### Save and upload to Hugging Face

In [14]:
# Save the cleaned train/test splits to the "clean-arrow-datasets/" folder (Apache Arrow format).
dataset_clean.save_to_disk("clean-arrow-datasets")

Saving the dataset (2/2 shards): 100%|██████████| 79658/79658 [00:00<00:00, 251503.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 19915/19915 [00:00<00:00, 242508.32 examples/s]


In [143]:
# Requires a Hugging Face access token: run 'huggingface-cli login' before this cell.
# Uploads the splits to the "dot-ammar/AR-dotless-small" repository as a private dataset.
dataset_clean.push_to_hub("dot-ammar/AR-dotless-small", private=True)

Creating parquet from Arrow format: 100%|██████████| 40/40 [00:01<00:00, 37.10ba/s]
Creating parquet from Arrow format: 100%|██████████| 40/40 [00:01<00:00, 37.47ba/s]s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:29<00:00, 14.70s/it]
Deleting unused files from dataset repository: 100%|██████████| 2/2 [00:00<00:00,  8.21it/s]
Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 36.05ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:04<00:00,  4.77s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  8.47it/s]
Downloading metadata: 100%|██████████| 622/622 [00:00<00:00, 6.41MB/s]
