In [1]:

import numpy as np
import tensorflow as tf


2024-06-22 07:16:02.141421: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print("Num GPUs Available: ", tf.config.list_physical_devices('GPU'))

tf.device("/GPU:0")
print(tf.config.list_physical_devices('GPU'))

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2024-06-22 07:16:03.903794: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 19414 MB memory:  -> device: 0, name: NVIDIA A10G, pci bus id: 0000:00:1e.0, compute capability: 8.6


In [3]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

import re

In [4]:

from sklearn.model_selection import train_test_split
import re
import pickle
from collections import defaultdict


# Generate input sequences
MAX_SEQUENCE_LENGTH = 4


In [5]:
# Function to remove emojis and special characters from Arabic text
def ar_remove_emojis_and_special_characters(text):
    # Remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    text = emoji_pattern.sub(r'', text)

    # Remove special characters (retain Arabic letters and Arabic numbers)
    text = re.sub(r'[^\u0600-\u06FF0-9\s]', '', text)

    # Remove extra spaces
    text = re.sub(' +', ' ', text)

    return text

# Preprocessing pipeline
def ar_preprocess_pipeline(data: str) -> 'list':
    
    # Split by newline character
    sentences = data.replace('.', '\n').split('\n')
    
    for i in range(len(sentences)):
        sentences[i] = ar_remove_emojis_and_special_characters(sentences[i])
        sentences[i] = sentences[i].strip()
    
    # Drop empty sentences
    sentences = [s for s in sentences if len(s) > 0]
    
    # Tokenization
    tokenized = []
    for sentence in sentences:
        tokenized.append(sentence)
    return tokenized

def ar_preprocess(text):
    text = re.sub(r'[^\u0600-\u06FF0-9\s]', '', text)
    return text.split()



In [6]:

import keras

def predict_top_five_words(model, tokenizer, seed_text):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=MAX_SEQUENCE_LENGTH, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    top_five_indexes = np.argsort(predicted[0])[::-1][:5]
    top_five_words = []
    for index in top_five_indexes:
        for word, idx in tokenizer.word_index.items():
            if idx == index:
                top_five_words.append(word)
                break
    return top_five_words




# def predict_and_display_top_five_words(seed_text, model, tokenizer):

#     top_five_words = predict_top_five_words(model, tokenizer, seed_text)
#     heading_app = f"<h1>Sentence AutoCompletion App With Five Outputs</h1>"
#     output_text = f"<ul>{''.join([f'<li>{seed_text} {word}</li>' for word in top_five_words])}</ul>"
#     javascript_code = f"""
#     <script>
#         var newWindow = window.open("", "_blank");
#         newWindow.document.write('<html><head><title>Top Five Words</title></head><body>{heading_app} <br> <hr> {output_text}</body></html>');
#     </script>
#     """
#     return HTML(javascript_code)




def update_model_with_feedback(model, tokenizer, feedback_text, seed_text, learning_rate=0.001, batch_size=32, epochs=1):
    # Tokenize the feedback text
    sequence = tokenizer.texts_to_sequences([feedback_text])[0]

    # Tokenize the seed text
    seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]

    # Prepare input sequence and target
    input_sequence = pad_sequences([seed_sequence], maxlen=len(seed_sequence), padding='pre')
    target = tf.keras.utils.to_categorical([sequence[-1]], num_classes=len(tokenizer.word_index) + 1)

    # Compile the model with appropriate optimizer and loss
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate))

    # Train the model with mini-batch training
    model.fit(input_sequence, target, batch_size=batch_size, epochs=epochs, verbose=0)

    # Optionally, return history for monitoring training progress or validation loss
    return model



#Define custom metrics
# @keras.saving.register_keras_serializable()
# def top_3_accuracy(y_true, y_pred):
#     return top_k_categorical_accuracy(y_true, y_pred, k=3)

# @keras.saving.register_keras_serializable()
# def top_5_accuracy(y_true, y_pred):
#     return top_k_categorical_accuracy(y_true, y_pred, k=5)

def mean_reciprocal_rank(y_true, y_pred):
    y_true = tf.argmax(y_true, axis=1)
    y_pred = tf.argsort(y_pred, axis=1, direction='DESCENDING')
    ranks = tf.where(tf.equal(y_pred, tf.expand_dims(y_true, axis=1)))[:, 1] + 1
    rr = tf.reduce_mean(1.0 / tf.cast(ranks, tf.float32))
    return rr

@keras.saving.register_keras_serializable()
def perplexity(y_true, y_pred):
    cross_entropy = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    perplexity = tf.exp(cross_entropy)
    return tf.reduce_mean(perplexity)


In [7]:



class TrieNode:
    def __init__(self):
        self.children = defaultdict(TrieNode)
        self.is_end_of_word = False
        self.frequency = 0


class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word, frequency=1):
        node = self.root
        for char in word:
            node = node.children[char]
        node.is_end_of_word = True
        node.frequency += frequency

    def search(self, prefix):
        node = self.root
        for char in prefix:
            if char not in node.children:
                return None
            node = node.children[char]
        return node

    def autocomplete(self, prefix):
        node = self.search(prefix)
        if not node:
            return []

        results = []
        self._dfs(node, prefix, results)
        results.sort(key=lambda x: -x[1])  # Sort by frequency
        return [word for word, freq in results]

    def _dfs(self, node, prefix, results):
        if node.is_end_of_word:
            results.append((prefix, node.frequency))
        for char, next_node in node.children.items():
            self._dfs(next_node, prefix + char, results)

    def update(self, word, frequency=1):
        self.insert(word, frequency)



def build_trie_with_frequency(data):
    trie = Trie()
    frequency_dict = defaultdict(int)
    for query in data:
        frequency_dict[query] += 1
    for query, freq in frequency_dict.items():
        trie.insert(query, freq)
    return trie


def save_trie(trie, filename):
    with open(filename, 'wb') as file:
        pickle.dump(trie, file)



def load_trie(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)



In [8]:

# # 
# import marisa_trie
# trie = marisa_trie.Trie([u'key1', u'key2', u'key12'])

# trie.prefixes(u'key12') # Find all trie keys which are prefixes of a given key:

In [9]:
# trie.keys(u'key1') # Find all trie keys which start with a given prefix:

In [10]:
import os
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Specify the filename
input_file = '../arabic_data.txt'

# Read the contents of the file
with open(input_file, 'r', encoding='utf-8') as infile:
    ar_data = infile.read()


ar_tokens = ar_preprocess(ar_data)
ar_trie = build_trie_with_frequency(ar_tokens)
save_trie(ar_trie, 'ar_trie.pkl')


In [12]:
from nltk.util import everygrams

In [13]:

ar_data = ar_data[:200000]

# Tokenize words
tokenized_sentences = ar_preprocess_pipeline(ar_data)

ar_tokenizer = Tokenizer(oov_token='<oov>')
ar_tokenizer.fit_on_texts(tokenized_sentences)
total_words = len(ar_tokenizer.word_index) + 1
# tokenizer.word_counts
# tokenizer.word_index



input_sequences = []
for line in tokenized_sentences:
    token_list = ar_tokenizer.texts_to_sequences([line])[0]
    
    
    
    # use everygrams
    input_sequences.extend(list(everygrams(token_list, max_len=MAX_SEQUENCE_LENGTH, min_len=2)))
    
    
    # for i in range(1, len(token_list)):
    #     n_gram_sequence = token_list[:i + 1]
    #     input_sequences.append(n_gram_sequence)
len(input_sequences)

100923

In [14]:
len(input_sequences[2])

4

In [15]:

# Pad sequences
max_sequence_len = MAX_SEQUENCE_LENGTH
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))



In [16]:
# turn them into tensors
X = input_sequences[:,:-1]

# turn labels (input_sequences[:,-2:-1]) into numpy array
labels = input_sequences[:,-1]

labels_encoded = keras.utils.to_categorical(labels, num_classes=total_words)


In [17]:
X

array([[   0,    0, 1618],
       [   0, 1618,  722],
       [1618,  722,    3],
       ...,
       [   0,    0, 1811],
       [   0, 1811,   18],
       [   0,    0,   18]], dtype=int32)

In [18]:
labels_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [19]:

# Split data into training, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(X, labels_encoded, test_size=0.3, random_state=42)




In [20]:

# turn them into tensors
# X_train = tf.convert_to_tensor(X_train)
# X_val = tf.convert_to_tensor(X_val)


In [21]:

# y_train = tf.convert_to_tensor(y_train)
# y_val = tf.convert_to_tensor(y_val)


In [22]:
# train_dataset = tf.data.Dataset.from_tensors((X_train, y_train))
# val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
# train_dataset.save('train_dataset')
# val_dataset.save('val_dataset')
# X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.4, random_state=42)



In [23]:

# del labels
# del labels_encoded


In [24]:
# del tokenized_sentences
# del input_sequences

In [25]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in list(
                          locals().items())), key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                       y_train:  6.6 GiB
                         y_val:  2.8 GiB
               input_sequences:  1.5 MiB
                     ar_tokens: 991.9 KiB
                       X_train: 828.0 KiB
                       ar_data: 390.7 KiB
                         X_val: 354.9 KiB
           tokenized_sentences:  3.2 KiB
                           _i6:  3.0 KiB
                           _i5:  2.0 KiB


In [26]:
import keras_nlp

In [27]:
total_words

12628

In [28]:
y_train.flatten()

array([0., 0., 0., ..., 0., 0., 0.])

In [29]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

# X_train: (326825, 4)
# X_val: (140069, 4)
# y_train: (326825,)
# y_val: (140069,)


(70646, 3)
(30277, 3)
(70646, 12628)
(30277, 12628)


In [30]:
tf.keras.version()

'3.3.3'

In [41]:

total_words = 35000

# Define your model
ar_model = keras.Sequential()
ar_model.add(keras.layers.InputLayer(batch_size=2048, shape=(19000000, )))
ar_model.add(keras.layers.Embedding(input_dim=total_words, output_dim=128, input_length=4))
ar_model.add(keras.layers.Bidirectional(keras.layers.LSTM(256)))
ar_model.add(keras.layers.Dropout(0.2))
ar_model.add(keras.layers.Dense(total_words, activation='softmax'))

# Compile the model with multiple metrics
adam = keras.optimizers.Adam(learning_rate=0.01)
ar_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=adam,
              metrics=['accuracy', keras.metrics.TopKCategoricalAccuracy(k=5), keras_nlp.metrics.Perplexity])
ar_model.summary()

In [32]:
# export TF_GPU_ALLOCATOR=cuda_malloc_async
!export TF_GPU_ALLOCATOR=cuda_malloc_async

In [33]:

# Train the model
history = ar_model.fit(X_train, y_train, 
                     batch_size=4,
                       epochs=3, validation_data=(X_val, y_val), verbose=2)


Epoch 1/3


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 12628), output.shape=(None, 12628)

In [None]:
import matplotlib.pyplot as plt

# Plot Loss
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot Accuracy
plt.figure(figsize=(10, 5))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot Top-3 Accuracy
plt.figure(figsize=(10, 5))
plt.plot(history.history['top_3_accuracy'], label='Training Top-3 Accuracy')
plt.plot(history.history['val_top_3_accuracy'], label='Validation Top-3 Accuracy')
plt.title('Training and Validation Top-3 Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Top-3 Accuracy')
plt.legend()
plt.show()

# Plot Top-5 Accuracy
plt.figure(figsize=(10, 5))
plt.plot(history.history['top_5_accuracy'], label='Training Top-5 Accuracy')
plt.plot(history.history['val_top_5_accuracy'], label='Validation Top-5 Accuracy')
plt.title('Training and Validation Top-5 Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Top-5 Accuracy')
plt.legend()
plt.show()

# Plot Perplexity
plt.figure(figsize=(10, 5))
plt.plot(history.history['perplexity'], label='Training Perplexity')
plt.plot(history.history['val_perplexity'], label='Validation Perplexity')
plt.title('Training and Validation Perplexity')
plt.xlabel('Epochs')
plt.ylabel('Perplexity')
plt.legend()
plt.show()




In [None]:
# Save the model architecture to JSON

ar_model.save('ar_model.keras')

# ar_model_json = ar_model.to_json()
# with open("ar_model.json", "w") as json_file:
#     json_file.write(ar_model_json)

# # Save the weights to HDF5
# ar_model.save_weights("ar_model.weights.h5")

# Save the tokenizer to a file
with open('ar_tokenizer.pkl', 'wb') as handle:
    pickle.dump(ar_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
max_sequence_len

In [None]:
loaded_ar_trie = load_trie('ar_trie.pkl')
loaded_ar_trie

In [None]:
loaded_ar_trie.autocomplete('ا')[:20]

In [None]:
predict_top_five_words(ar_model, ar_tokenizer, )

In [None]:
seed_text = 'صناعة ألماس شديد الشبه بالألماس الطبيعي بواسطة الليزر وبتكلفة قل'
seed_text

In [None]:

token_list = ar_tokenizer.texts_to_sequences([seed_text])[0]
token_list


In [None]:

MAX_WORDS_IN_CONTEXT = 20 - 1

padded_token_list = pad_sequences([token_list], maxlen=MAX_WORDS_IN_CONTEXT, padding='pre')
padded_token_list

In [None]:

predicted = ar_model.predict(padded_token_list, verbose=0)[0]


In [None]:

print(predicted[::-1][:10])
print(predicted.shape)

In [None]:

top_indexes = np.argsort(predicted)[::-1][:20]

top_words = []
for index in top_indexes:
    for word, idx in ar_tokenizer.word_index.items():
        if idx == index:
            top_words.append(word)
            break

top_words