In [None]:
import re

# Define a function to extract the قافية (rhyme) from a poem or verse
def extract_qafiya(verse):
    """Extract a rough قافية (rhyme) fragment from the last word of a verse.

    Trailing punctuation is dropped, a fixed list of endings is stripped from
    the last word, and the slice starting at the word's *final* listed
    consonant (treated here as الروي) is returned.

    Args:
        verse: A single verse as a string.

    Returns:
        The substring from the last listed consonant to the end of the
        stripped word; the stripped word itself if it contains none of the
        listed consonants; ``""`` if the verse contains no word at all.
    """
    import unicodedata  # local import: this cell only imports ``re``

    # Step 1: Drop trailing whitespace, punctuation, symbols and control/format
    # characters (Unicode categories Z*, P*, S*, C*) so "جميل." becomes "جميل".
    # Letters (L*) and harakat (Mn) are kept.
    end = len(verse)
    while end and unicodedata.category(verse[end - 1])[0] in 'PSZC':
        end -= 1
    verse = verse[:end]

    # Step 2: Focus on the last word; a verse with no words has no rhyme.
    words = verse.split()
    if not words:
        return ''
    last_word = words[-1]

    # Step 3: Endings that are not counted as part of the rhyme. Each one is
    # tried once, in this order, against whatever remains of the word.
    ignore_suffixes = (
        '\u0629',        # ة  ta marbuta
        '\u0647',        # ه  ha
        '\u064B\u0627',  # ًا fathatan + alef
        '\u064C',        # dammatan
        '\u064D',        # kasratan
        '\u0652',        # sukun
        '\u064B',        # fathatan
        '\u0627',        # ا  alef
        '\u064A',        # ي  yaa
        '\u0648',        # و  waw
    )
    for suffix in ignore_suffixes:
        if last_word.endswith(suffix):
            last_word = last_word[:-len(suffix)]

    # Step 4: Scan from the end for the last consonant.
    # NOTE(review): ء, ه, و and ي are not in this set; confirm whether they
    # should be eligible as الروي.
    consonants = 'بتثجحخدذرزسشصضطظعغفقكلمن'
    for index in range(len(last_word) - 1, -1, -1):
        if last_word[index] in consonants:
            # Keep the consonant and whatever follows it.
            return last_word[index:]

    # No listed consonant: fall back to the stripped word unchanged.
    return last_word

# Example inputs: four short two-word Arabic lines, each ending with a
# full stop, so trailing punctuation reaches extract_qafiya as part of the
# last word.
poems = [
    "السماء زرقاء.",
    "النسيم عليل.",
    "الرمال ذهبية.",
    "الليل جميل."
]

# Run the extractor on every line and print the input next to the result.
for poem in poems:
    rhyme = extract_qafiya(poem)
    print(f"Poem: {poem}\nExtracted قافية (Rhyme): {rhyme}\n")


Poem: السماء زرقاء.
Extracted قافية (Rhyme): زرقاء.

Poem: النسيم عليل.
Extracted قافية (Rhyme): عليل.

Poem: الرمال ذهبية.
Extracted قافية (Rhyme): ذهبية.

Poem: الليل جميل.
Extracted قافية (Rhyme): جميل.



1. The rawi (الروي) is the final consonant of each verse. If that consonant stays the same across all verses, even when the letter before it varies, it is taken as the poem's rhyme letter.


2. If the last consonant of a verse carries a kasra (كسرة), we can conclude that this consonant is the rawi. The kasra also often implies that the letter yaa (ي) acts as the wasl (the connecting vowel). This pattern then repeats throughout the rest of the poem: the same consonant with a kasra, followed by yaa.


In [None]:
import unicodedata

# Helper function to normalize Arabic letters and harakat
def normalize_arabic(text):
    """Return ``text`` in Unicode NFC (canonical composition) form.

    NFC only recomposes and reorders code points; it does not remove harakat.
    """
    composed = unicodedata.normalize('NFC', text)
    return composed

# Function to extract rhyme from verses
def extract_rhyme(verses):
    """Return the longest suffix shared by the last word of every verse.

    Each last word is NFC-normalized (the same normalization that
    ``normalize_arabic`` applies) and the words are compared code point by
    code point from the end until the first disagreement.

    Args:
        verses: Iterable of verse strings. Verses that are empty or contain
            only whitespace are ignored.

    Returns:
        The common suffix as a string; ``""`` if the words share no ending
        or if there is no non-blank verse.
    """
    # Step 1-2: Take and normalize the last word of every non-blank verse.
    # Blank verses have no last word; skipping them avoids an IndexError.
    last_words = [
        unicodedata.normalize('NFC', tokens[-1])
        for tokens in (verse.split() for verse in verses)
        if tokens
    ]

    # Step 3-4: zip() walks all words back-to-front in lockstep and stops at
    # the shortest one; with no words at all it yields nothing.
    shared = []
    for letters in zip(*(reversed(word) for word in last_words)):
        if len(set(letters)) != 1:
            break  # Mismatch found, stop
        shared.append(letters[0])

    # Step 5: Letters were collected last-to-first; restore reading order.
    return ''.join(reversed(shared))

# Example usage: three verses written without harakat whose last words
# (الحين / بالأمين / اليقين) all end in the same two letters.
verses = [
    "إني لأعشقك حتى الحين",
    "ما دمت بحبك أشعر بالأمين",
    "وهذا حبنا فيه اليقين"
]

# Compute the shared ending and print it.
rhyme = extract_rhyme(verses)
print(f"The rhyme is: {rhyme}")


The rhyme is: ين


In [None]:
import unicodedata

# Helper function to normalize Arabic letters and harakat (diacritics)
def normalize_arabic(text):
    """Compose ``text`` into Unicode NFC form and return the result.

    Harakat are kept; only the canonical composition/ordering changes.
    """
    form = 'NFC'
    return unicodedata.normalize(form, text)

# Function to split poem into verses
def split_poem(poem):
    """Split a poem into verses, one per non-blank line.

    Args:
        poem: The poem as a single string with one verse per line.

    Returns:
        A list of verses with surrounding whitespace removed. Blank lines
        are dropped, so an empty poem yields ``[]``.
    """
    # splitlines() understands "\n", "\r\n" and the other line boundaries.
    stripped_lines = (line.strip() for line in poem.splitlines())
    # A blank line is not a verse; keeping it would hand an empty string to
    # code that takes the last word of each verse.
    return [line for line in stripped_lines if line]

# Function to extract rhyme from verses
def extract_rhyme(verses):
    """Find the ending that the last words of all verses have in common.

    Last words are NFC-normalized (as ``normalize_arabic`` does) and then
    compared from their final code point backwards. Harakat are compared
    like any other code point, so a single differing vowel mark ends the
    match.

    Args:
        verses: Iterable of verse strings. Empty or whitespace-only verses
            are skipped.

    Returns:
        The shared suffix; ``""`` when nothing is shared or when no verse
        contains a word.
    """
    # Step 1-2: Last word of each verse, normalized. Verses without any
    # word are skipped instead of raising IndexError.
    last_words = []
    for verse in verses:
        tokens = verse.split()
        if tokens:
            last_words.append(unicodedata.normalize('NFC', tokens[-1]))

    # Step 3-4: Compare position by position from the end. zip() stops at
    # the shortest word and yields nothing for an empty word list.
    rhyme = []
    for current_letters in zip(*(reversed(word) for word in last_words)):
        if any(letter != current_letters[0] for letter in current_letters):
            break  # Mismatch found, stop
        rhyme.append(current_letters[0])

    # Step 5: Put the collected letters back in reading order.
    rhyme.reverse()
    return ''.join(rhyme)

# Example poem input: eight fully vocalized lines, one verse per line.
# The third, fifth and seventh lines end differently from the others (for
# example the third ends in a damma), so the exact-match comparison in
# extract_rhyme stops at the very first position.
poem = """
أَرى جارَتي خَفَّت وَخَفَّ نَصيحُها
وَحُبَّ بِها لَولا النَوى وَطُموحُها
فَبيني على نَجمٍ شَخيسٍ نُحوسُهُ
وَأَشأَمُ طَيرِ الزاجِرينَ سَنيحُها
فَإِن تَشغَبي فَالشَغَبُ مِنّي سَجِيَّةٌ
إِذا شيمَتي لَم يُؤتَ مِنها سَجيحُها
أُقارِضُ أَقواماً فَأوفي قُروضَهُم
وَعَفٌّ إِذا أَردى النُفوسَ شَحيحُها
"""

# Split the poem into verses
verses = split_poem(poem)

# Extract the rhyme from the verses and print it
rhyme = extract_rhyme(verses)
print(f"The rhyme is: {rhyme}")


The rhyme is: 


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Load the pretrained tokenizer and masked-language-model weights for the
# "faisalq/bert-base-arapoembert" checkpoint.
# NOTE(review): nothing in the visible code of this cell uses `tokenizer`
# or `model` after loading them.
tokenizer = AutoTokenizer.from_pretrained("faisalq/bert-base-arapoembert")
model = AutoModelForMaskedLM.from_pretrained("faisalq/bert-base-arapoembert")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/516k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/588 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Tokenized: ['نصيح', '##ها']
Tokenized: ['وطم', '##وحها']
Tokenized: ['نحوس', '##ه']
Tokenized: ['سني', '##حها']


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the tokenizer and model for the "faisalq/bert-base-arapoembert"
# checkpoint.
# NOTE(review): neither object is referenced again in this cell.
tokenizer = AutoTokenizer.from_pretrained("faisalq/bert-base-arapoembert")
model = AutoModelForMaskedLM.from_pretrained("faisalq/bert-base-arapoembert")

# Example input poem: four vocalized verses, one per line.
poem = """
أَرى جارَتي خَفَّت وَخَفَّ نَصيحُها
وَحُبَّ بِها لَولا النَوى وَطُموحُها
فَبيني على نَجمٍ شَخيسٍ نُحوسُهُ
وَأَشأَمُ طَيرِ الزاجِرينَ سَنيحُها
"""

# Split the poem into lines (verses); strip() first removes the leading and
# trailing newline of the triple-quoted literal.
verses = poem.strip().split('\n')

# Extract the last whitespace-separated word of each verse
last_words = [verse.split()[-1] for verse in verses]

# Helper function to extract the final letters of each word
def extract_final_letters(word, num_letters=3, *, strip_diacritics=False):
    """Return the last ``num_letters`` code points of ``word``.

    Args:
        word: The word to slice.
        num_letters: How many trailing code points to keep. Values of zero
            or less yield ``""``.
        strip_diacritics: When true, combining marks (for example Arabic
            harakat) are removed before slicing, so only base letters are
            counted. Defaults to ``False``, which slices the word as given.

    Returns:
        The trailing slice of the (optionally stripped) word; the whole word
        if it is shorter than ``num_letters``.
    """
    # word[-0:] is word[0:], i.e. the whole word, so guard non-positive
    # counts explicitly.
    if num_letters <= 0:
        return ''

    if strip_diacritics:
        import unicodedata  # local import: not among this cell's imports

        # Drop every code point with a non-zero combining class.
        word = ''.join(ch for ch in word if not unicodedata.combining(ch))

    return word[-num_letters:]

# Take the last 3 code points of each last word. Harakat count as code
# points here, so a slice can contain a vowel mark in place of a letter.
final_letters = [extract_final_letters(word) for word in last_words]

# Collapse the endings into a set: one element means every verse agrees.
consistent_rhyme = set(final_letters)

# If all the endings are the same, report it; otherwise list them all.
if len(consistent_rhyme) == 1:
    rhyme = final_letters[0]
    print(f"The consistent rhyme is: {rhyme}")
else:
    print(f"Different final letters found: {final_letters}")

Different final letters found: ['ُها', 'ُها', 'ُهُ', 'ُها']


In [None]:
import unicodedata
import re
from collections import Counter

# Helper function to normalize Arabic letters and apply elongation (حركات الإشباع)
def normalize_arabic_with_elongation(word):
    """Strip harakat, unify alef forms, and spell out a final short vowel.

    If the word ends in a fatha, damma or kasra (possibly alongside other
    marks such as a shadda on the same letter), the matching long vowel
    (ا / و / ي) is appended after the diacritics have been removed
    (حركات الإشباع).

    Args:
        word: A single word, with or without harakat.

    Returns:
        The word without combining marks, with أ/إ/آ/ى mapped to ا, and with
        the elongation letter appended when the word ended in a short vowel.
    """
    decomposed = unicodedata.normalize('NFD', word)

    # The final short vowel must be read BEFORE the marks are stripped;
    # afterwards there is nothing left for an endswith() check to find.
    base_end = len(decomposed)
    while base_end and unicodedata.combining(decomposed[base_end - 1]):
        base_end -= 1
    trailing_marks = decomposed[base_end:]

    elongation = ''
    for haraka, long_vowel in (
        ('\u064E', '\u0627'),  # fatha -> alef
        ('\u064F', '\u0648'),  # damma -> waw
        ('\u0650', '\u064A'),  # kasra -> yaa
    ):
        if haraka in trailing_marks:
            elongation = long_vowel
            break

    # Remove diacritics (harakat) for basic normalization
    word = ''.join(c for c in decomposed if not unicodedata.combining(c))

    # Normalize characters that sound the same. NFD stripping already turns
    # أ/إ/آ into ا; the substitution stays as an explicit safeguard.
    word = re.sub(r'[أإآ]', 'ا', word)
    word = word.replace('ى', 'ا')

    return word + elongation

# Function to extract rhyme with majority rule to handle slight variations
def extract_rhyme_with_majority(verses):
    """Extract the rhyme shared by a strict majority of the verses.

    The last word of every verse is normalized with
    ``normalize_arabic_with_elongation`` and the words are compared from
    their final letter backwards. At each position the most frequent letter
    is kept as long as MORE than half of the words agree on it; the first
    position without such a majority ends the rhyme.

    Args:
        verses: Iterable of verse strings. Empty or whitespace-only verses
            are ignored.

    Returns:
        The majority rhyme in reading order; ``""`` if the first compared
        position already has no majority or if no verse contains a word.
    """
    # Step 1-2: Normalized last word of every non-blank verse. Skipping
    # blank verses avoids an IndexError on split()[-1].
    last_words = [
        normalize_arabic_with_elongation(tokens[-1])
        for tokens in (verse.split() for verse in verses)
        if tokens
    ]

    # Strict majority: a letter must appear in more than half of the words.
    threshold = len(last_words) // 2

    # Step 3-4: zip() over the reversed words compares position by position
    # from the end, stops at the shortest word, and yields nothing when
    # there are no words (where min() over an empty list would raise).
    rhyme_fragments = []
    for current_letters in zip(*(reversed(word) for word in last_words)):
        most_common_letter, count = Counter(current_letters).most_common(1)[0]
        if count <= threshold:
            break  # No majority at this position, stop
        rhyme_fragments.append(most_common_letter)

    # Step 5: Letters were collected last-to-first; restore reading order.
    return ''.join(reversed(rhyme_fragments))

# Example usage: five verses, one list element each. Every element needs
# its own trailing comma; two adjacent string literals without one are
# silently concatenated into a single verse.
verses = [
    "رى جارَتي خَفَّت وَخَفَّ نَصيحُها",
    "وَحُبَّ بِها لَولا النَوى وَطُموحُها",
    "وَأَشأَمُ طَيرِ الزاجِرينَ سَنيحُها",
    "إِذا شيمَتي لَم يُؤتَ مِنها سَجيحُها",
    "وَعَفٌّ إِذا أَردى النُفوسَ شَحيحُها",
]

# Compute the majority rhyme and print it.
rhyme = extract_rhyme_with_majority(verses)
print(f"The rhyme is: {rhyme}")



The rhyme is: يحها


TRYING

In [None]:
import unicodedata
import re
from collections import Counter

# Helper function to normalize Arabic letters and handle special rhyme cases (Kasrah to Yaa)
def normalize_arabic_with_rhyme_elongation(word):
    """Unify alef forms and turn a word-final short vowel into a long one.

    Args:
        word: A single word, with or without harakat.

    Returns:
        The word with أ/إ/آ/ى replaced by ا and, if its last code point is
        a fatha, kasra or damma, that mark replaced by ا, ي or و
        respectively. Harakat elsewhere in the word are left in place.
    """
    # Step 1: Normalize Alef variations
    word = re.sub(r'[أإآ]', 'ا', word)
    word = word.replace('ى', 'ا')

    # Step 2: Replace a final haraka with the long vowel it elongates to.
    final_haraka_to_letter = {
        '\u064E': '\u0627',  # fatha -> alef
        '\u0650': '\u064A',  # kasra -> yaa
        '\u064F': '\u0648',  # damma -> waw
    }
    if word and word[-1] in final_haraka_to_letter:
        word = word[:-1] + final_haraka_to_letter[word[-1]]

    return word

# Function to extract rhyme with majority rule to handle slight variations
def extract_rhyme_with_majority(verses):
    """Extract the majority rhyme of ``verses`` and print each stage.

    The last word of every verse is passed through
    ``normalize_arabic_with_rhyme_elongation``; the words are then compared
    from their final letter backwards, keeping the most frequent letter at
    each position while more than half of the words agree on it.

    Args:
        verses: Sequence of verse strings, each containing at least one word.

    Returns:
        The majority rhyme in reading order. The normalized words, reversed
        words, collected fragments and final rhyme are also printed.
    """
    # Last word of each verse, then normalized.
    tails = [verse.strip().split()[-1] for verse in verses]
    last_words = list(map(normalize_arabic_with_rhyme_elongation, tails))
    print(f"Normalized words: {last_words}")

    # Flip every word so that index 0 is its final letter.
    reversed_words = [''.join(reversed(item)) for item in last_words]
    print(f"Reversed words: {reversed_words}")

    # A letter is accepted only when strictly more than half the words
    # carry it at the current position.
    needed = len(reversed_words) // 2
    shortest = min(map(len, reversed_words))

    rhyme_fragments = []
    for pos in range(shortest):
        tally = Counter(item[pos] for item in reversed_words)
        winner, votes = tally.most_common(1)[0]
        if votes <= needed:
            break  # no majority here, the rhyme ends
        rhyme_fragments.append(winner)
    print(f"Rhyme fragments: {rhyme_fragments}")

    # Fragments were gathered last-letter-first; restore reading order.
    rhyme = ''.join(reversed(rhyme_fragments))
    print(f"The rhyme is: {rhyme}")

    return rhyme

# Example usage: two verses whose last words are كالعالم and العالم.
# Neither literal carries written harakat, so the normalizer has no final
# vowel mark to elongate.
verses = [
    "يس المقلد في الورى كالعالم ",  # no harakat written; note the trailing space
    "مد بن أحمد شمس هذا العالم" ,   # no harakat written; last word is العالم
]

# The function prints its own trace (including the rhyme); the rhyme is
# then printed once more here.
rhyme = extract_rhyme_with_majority(verses)
print(f"The rhyme is: {rhyme}")


Normalized words: ['كالعالم', 'العالم']
Reversed words: ['ملاعلاك', 'ملاعلا']
Rhyme fragments: ['م', 'ل', 'ا', 'ع', 'ل', 'ا']
The rhyme is: العالم
The rhyme is: العالم


In [None]:
import re

# Function to clean the poem by removing symbols except for '||', '|', Arabic letters, spaces, and harakat
def clean_poem(poem):
    """Delete every character that is not part of the poem's text or layout.

    Kept: the pipe character (used in '|' and '||' separators), characters
    in the range ء-ي, whitespace, and the harakat listed in the pattern.
    Everything else is removed without a replacement.
    """
    disallowed = re.compile(r'[^\|ء-ي\sًٌٍَُِّْ]')
    return disallowed.sub('', poem)

# Example usage: two verses separated by '||', each split into two halves
# by '|', containing a Latin colon and an Arabic comma that should be removed.
poem = "قالت: سَلا وُدَّنا، وحالَ ولمْ | أسلُ فيُجري به ولم أَحُلِ || عندكِ قلبي فقلّبيه وإنْ | وجدْتِ فيه سواكِ فانتقلي"
cleaned_poem = clean_poem(poem)
print("Cleaned Poem:", cleaned_poem)


Cleaned Poem: قالت سَلا وُدَّنا وحالَ ولمْ | أسلُ فيُجري به ولم أَحُلِ || عندكِ قلبي فقلّبيه وإنْ | وجدْتِ فيه سواكِ فانتقلي


In [None]:
# Normalize a sample word that has no written harakat.
h=normalize_arabic_with_rhyme_elongation("العالم")

In [None]:
# Reverse the normalized word and wrap it in a one-element list.
rev = [h[::-1]]

In [None]:
# Bare expression: the notebook echoes the value of `rev` as the cell output.
rev

['ملاعلا']