In [1]:
# %pip uninstall tensorflow
# %pip install tensorflow
# %pip install keras
# %pip install gensim
# %pip install nltk
# %pip install torch
# %pip install fasttext

import re
from collections import Counter
import pandas as pd
import numpy as np
# import tensorflow as tf
import nltk, re
# from keras.preprocessing.text import Tokenizer
from datetime import datetime
from gensim.models import *
import logging
import fasttext
# from rnn_utils import *
%matplotlib inline


''' D_NAMES: This is a list containing names of various Arabic diacritics. Each
 element of the list represents a specific diacritic type. '''
D_NAMES = ['Fathatan', 'Dammatan', 'Kasratan', 'Fatha', 'Damma', 'Kasra', 'Shadda', 'Sukun']

##############################################################################################

''' NAME2DIACRITIC: This uses a dictionary comprehension to create a mapping
from diacritic names to their corresponding Unicode characters.'''
NAME2DIACRITIC = dict((name, chr(code)) for name, code in zip(D_NAMES, range(0x064B, 0x0653)))

##############################################################################################

''' DIACRITIC2NAME: This is the inverse of the previous dictionary.'''
DIACRITIC2NAME = dict((code, name) for name, code in NAME2DIACRITIC.items())

##############################################################################################

''' ARABIC_DIACRITICS: This creates a frozenset containing the Unicode
 characters of all the diacritics.'''
ARABIC_DIACRITICS = frozenset(NAME2DIACRITIC.values())


# Remove all standard diacritics from the text, leaving the letters only.
def clear_diacritics(text):
    assert isinstance(text, str)
    return ''.join([l for l in text if l not in ARABIC_DIACRITICS])


# Return the diacritics from the text while keeping their original positions.
def extract_diacritics(text):
    assert isinstance(text, str)
    diacritics = []
    classes = []
    temp = ''
    for i in range(1, len(text)):
        temp = ''
        if text[i] in ARABIC_DIACRITICS:
            if text[i-1] == NAME2DIACRITIC['Shadda']:
                diacritics[-1] = (DIACRITIC2NAME[text[i-1]], DIACRITIC2NAME[text[i]])
                temp = (DIACRITIC2NAME[text[i-1]], DIACRITIC2NAME[text[i]])
                if (temp == ('Shadda', 'Fatha')):
                    classes.pop()
                    classes.append(8)
                elif (temp == ('Shadda', 'Fathatan')):
                    classes.pop()
                    classes.append(9)
                elif (temp == ('Shadda', 'Damma')):
                    classes.pop()
                    classes.append(10)
                elif (temp == ('Shadda', 'Dammatan')):
                    classes.pop()
                    classes.append(11)
                elif (temp == ('Shadda', 'Kasra')):
                    classes.pop()
                    classes.append(12)
                elif (temp == ('Shadda', 'Kasratan')):
                    classes.pop()
                    classes.append(13)
            else:
                diacritics.append(DIACRITIC2NAME[text[i]])
                temp = DIACRITIC2NAME[text[i]]
                if (temp == 'Fatha'):
                    classes.append(0)
                elif (temp == 'Fathatan'):
                    classes.append(1)
                elif (temp == 'Damma'):
                    classes.append(2)
                elif (temp == 'Dammatan'):
                    classes.append(3)
                elif (temp == 'Kasra'):
                    classes.append(4)
                elif (temp == 'Kasratan'):
                    classes.append(5)
                elif (temp == 'Sukun'):
                    classes.append(6)
                elif (temp == 'Shadda'):
                    classes.append(7)
        elif text[i - 1] not in ARABIC_DIACRITICS:
            diacritics.append('')
            classes.append(14)

    if text[-1] not in ARABIC_DIACRITICS:
        diacritics.append('')
        classes.append(14)
    return diacritics, classes


def extract_arabic_words2(text):
    arabic_pattern = re.compile('[\u0600-\u06FF]+')
    arabic_matches = arabic_pattern.findall(text)
    result = ' '.join(arabic_matches)
    processed_text = re.sub(r'[؛،\.]+', '', result)
    final_processed_text = re.sub(r'\s+', ' ', processed_text)
    return final_processed_text


input_file_path = "train.txt"  # Replace with your input file path
with open(input_file_path, "r", encoding="utf-8") as input_file:
    input_text = input_file.read()

arabic_words = extract_arabic_words2(input_text)

output_words = clear_diacritics(arabic_words)
words = output_words.split()
words2 = arabic_words.split()
words_array = [list(word) for word in words]
words_array2 = [list(word2) for word2 in words2]

output_without_spaces = arabic_words.replace(" ", "")
output_without_spaces2 = output_words.replace(" ", "")
array_of_chars = [char for char in output_without_spaces]
_,classes_extraction = extract_diacritics (output_without_spaces)


num_feature = 30
min_word_count = 1
num_thread = 5
window_size = 10
down_sampling = 0.001
iteration = 20

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model_fastText = FastText(words_array,
                        vector_size=num_feature,
                        window=window_size,
                        min_count=min_word_count,
                        workers=num_thread)


j=0
chars =[]
char_vectors =[]
char_classes=[]
for word in words_array:
  for char in word:
    chars.append(char)
    char_classes.append(classes_extraction[j])
    vector = model_fastText.wv[char]
    char_vectors.append(vector)
    j=j+1

print (j)
print(chars[1])
print(char_classes[1])
print(char_vectors[1])


2024-01-01 15:28:57,618 : INFO : collecting all words and their counts
2024-01-01 15:28:57,620 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-01-01 15:28:57,634 : INFO : PROGRESS: at sentence #10000, processed 39789 words, keeping 37 word types
2024-01-01 15:28:57,645 : INFO : PROGRESS: at sentence #20000, processed 79504 words, keeping 37 word types
2024-01-01 15:28:57,654 : INFO : PROGRESS: at sentence #30000, processed 119409 words, keeping 37 word types
2024-01-01 15:28:57,671 : INFO : PROGRESS: at sentence #40000, processed 159213 words, keeping 37 word types
2024-01-01 15:28:57,684 : INFO : PROGRESS: at sentence #50000, processed 199177 words, keeping 37 word types
2024-01-01 15:28:57,693 : INFO : PROGRESS: at sentence #60000, processed 238890 words, keeping 37 word types
2024-01-01 15:28:57,703 : INFO : PROGRESS: at sentence #70000, processed 278484 words, keeping 37 word types
2024-01-01 15:28:57,714 : INFO : PROGRESS: at sentence #80000, proces

8353805
و
6
[-0.05952365  0.7864038  -0.30884534  0.00966583 -0.31647366 -0.37679344
  0.14028403 -0.0656407   0.3308646   0.13131635]


In [2]:
print(len(char_vectors))
print(len(char_classes))
print(char_vectors[:4])
print(char_classes[:4])


8353805
8353805
[array([-0.59912956, -2.3143928 , -0.4928575 , -0.7636472 ,  2.5471454 ,
        0.265863  ,  1.110589  , -0.86562574,  0.4167358 , -0.3797188 ],
      dtype=float32), array([-0.05952365,  0.7864038 , -0.30884534,  0.00966583, -0.31647366,
       -0.37679344,  0.14028403, -0.0656407 ,  0.3308646 ,  0.13131635],
      dtype=float32), array([ 1.2460281 ,  0.07789117,  0.35825545,  1.0349609 , -0.27964464,
       -0.87966335, -0.05130558, -0.42975643, -0.4579924 , -0.21386081],
      dtype=float32), array([ 0.38726312, -0.2088649 , -0.9019669 , -0.8579454 , -0.9554811 ,
       -0.12167442, -0.35786894,  0.23730293, -0.02445211,  0.2521064 ],
      dtype=float32)]
[0, 6, 2, 2]


In [3]:
# prompt: generate pytorch class "TashkelaSet" that inherits from Dataset that takes char_vector as input X and char_classes as labels + define a function prepare_data that takes the path for the train.txt and val.txt and returns dataloaders


# import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# import pandas as pd
# import numpy as np
# # import tensorflow as tf
# import nltk, re
# from keras.preprocessing.text import Tokenizer
# from datetime import datetime
# from gensim.models import *
# import logging

class TashkelaSet(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

def prepare_data(train_X, train_y):

  train_set = TashkelaSet(train_X, train_y)

  train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

  return train_loader



**Model**

In [10]:

import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(SimpleRNN, self).__init__()

        # RNN layer
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0):
        #h0 dimesions should be 2-D
        # h0 = h0.unsqueeze(0)
        #x.size(-1) must be equal to input_size
        x = x.unsqueeze(0)
        # Forward pass through the RNN
        out, hn = self.rnn(x, h0)

        # Select the output from the last time step
        out = out[:, -1, :]

        # Fully connected layer
        out = self.fc(out)

        return out, hn

#####################
def train_model(model, train_loader):
    """
    Function for training the model
    """
    # define the optimization
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # define the loss function
    criterion = nn.CrossEntropyLoss()
    # epochs
    epochs = 10
    # loop over the epochs
    for epoch in range(epochs):
        # initialize the hidden state
        h0 = torch.zeros(1, 3, hidden_size)
        # loop over the dataset
        for inputs, labels in train_loader:
            # zero the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat, h0 = model(inputs, h0)
            # calculate loss
            loss = criterion(yhat, labels)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()
        # print the loss
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')
#####################

def evaluate_model(model, test_loader):
    """
    Function for evaluating the model
    """
    # initialize the hidden state
    h0 = torch.zeros(1, 3, hidden_size)
    # initialize the accuracy
    correct = 0
    total = 0
    # deactivating autograd
    with torch.no_grad():
        # loop over the test dataset
        for inputs, labels in test_loader:
            # compute the model output
            yhat, h0 = model(inputs, h0)
            # get predictions from the maximum value
            _, predicted = torch.max(yhat.data, 1)
            # update total
            total += labels.size(0)
            # update correct
            correct += (predicted == labels).sum().item()
    # compute the accuracy
    accuracy = 100 * correct / total
    # print the accuracy
    print(f'Accuracy: {accuracy:.2f}')





In [11]:
# prompt: now test the whole code in action , train , validate/evaluate and feel free to add necessary code , that name of the train file is train.txt  and validation set is val.txt

# **Data Preparation**

train_path = 'train.txt'
val_path = 'val.txt'

train_loader = prepare_data(char_vectors, char_classes)

# **Model Definition**

input_size = len(char_vectors)
hidden_size = 128
output_size = len(char_classes)

model = SimpleRNN(input_size, hidden_size, output_size)

# **Training**

# train_model(model, train_loader)

# **Evaluation**

# evaluate_model(model, val_loader)


In [12]:
# **Training**

train_model(model, train_loader)

# **Evaluation**

# evaluate_model(model, val_loader)

RuntimeError: input.size(-1) must be equal to input_size. Expected 8353805, got 10

In [22]:
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyRNN, self).__init__()
        self.hidden_size = hidden_size
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)
    
    def forward(self, x, hidden_state):
        combined = torch.cat((x, hidden_state), 1)
        hidden = torch.sigmoid(self.in2hidden(combined))
        output = self.in2output(combined)
        return output, hidden
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))
    
hidden_size = 256
learning_rate = 0.001

model = MyRNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 279152786208420 bytes.

In [None]:
num_epochs = 2
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )