In [17]:
import pandas as pd
import re
import html

def clean_html_entities(text):
    """Decode HTML entities in ``text`` and normalise non-breaking spaces.

    Args:
        text: Any value; it is converted with ``str()`` first, so non-string
            input (e.g. ``None`` or a float) is processed as its string form.

    Returns:
        str: The text with ``&lrm;`` / ``&rlm;`` entities removed, all other
        HTML entities decoded, and non-breaking spaces turned into plain
        spaces.
    """
    text = str(text)
    # Drop directional-mark entities (any letter case) before decoding.
    text = re.sub(r'&lrm;|&rlm;', '', text, flags=re.IGNORECASE)
    text = html.unescape(text)
    # html.unescape() has already turned '&nbsp;' into U+00A0, so the literal
    # '&nbsp;' replacement only catches double-escaped input ('&amp;nbsp;').
    # Replace the decoded character as well so no non-breaking space survives.
    text = text.replace('&nbsp;', ' ').replace('\u00a0', ' ')
    return text

def strip_html_tags(text):
    """Remove HTML tags from ``text`` while keeping ``<br>`` line breaks.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: The text with ``<br>`` / ``<br/>`` / ``<br />`` (any letter case)
        turned into ``'\\n'``, every other tag removed, runs of spaces/tabs
        collapsed to a single space, spaces around line breaks removed, and
        leading/trailing whitespace (including line breaks) trimmed.
    """
    text = str(text)
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)  # Keep as newline
    text = re.sub(r'<[^>]+>', '', text)  # Remove other tags
    # Collapse only horizontal whitespace: a plain r'\s+' would also swallow
    # the newlines that were just produced from <br> tags.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Drop the single space that may remain on either side of a line break.
    text = re.sub(r' ?\n ?', '\n', text)
    return text.strip()

def normalize_arabic(text):
    """Fold Arabic letter variants in ``text`` onto a single base letter.

    The input is converted with ``str()`` and each (pattern, replacement)
    pair below is applied in order with ``re.sub``. Returns the folded string.
    """
    # (regex pattern, replacement) pairs, applied top to bottom.
    letter_folds = (
        (r"[إأآا]", "ا"),
        (r"ى", "ي"),
        (r"ؤ", "ء"),
        (r"ئ", "ء"),
        (r"ة", "ه"),
        (r"گ", "ك"),
    )
    folded = str(text)
    for pattern, replacement in letter_folds:
        folded = re.sub(pattern, replacement, folded)
    return folded

def clean_for_search(text):
    """Final cleaning with placeholder for standalone diacritics.

    Steps, in order:
      1. Every whitespace-delimited token made up only of diacritics in the
         range U+064B-U+065F is replaced by the placeholder ``_``.
      2. Letter variants are folded (same substitutions as
         ``normalize_arabic``).
      3. Remaining diacritics, tatweel (U+0640) and U+0670 are removed.
      4. Whitespace runs are collapsed to one space and the ends trimmed.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: The cleaned, search-friendly string.
    """
    text = str(text)

    # Replace standalone diacritics with placeholder.
    # Lookarounds are used instead of consuming the surrounding whitespace so
    # that marks at the very start/end of the string, several standalone marks
    # in a row, and multi-mark tokens are all caught; otherwise they would be
    # silently deleted by the diacritic-removal step below.
    text = re.sub(r'(?<!\S)[\u064B-\u065F]+(?!\S)', '_', text)

    # Normalize Arabic letters
    text = re.sub(r"[إأآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"ؤ", "ء", text)
    text = re.sub(r"ئ", "ء", text)
    text = re.sub(r"ة", "ه", text)
    text = re.sub(r"گ", "ك", text)

    # Remove diacritics and tatweel
    text = re.sub(r'[\u0640\u064B-\u065F\u0670]', '', text)

    # Collapse spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [18]:
# Load Excel
file_path = "shbm_poetry.xlsx"
df = pd.read_excel(file_path, engine='openpyxl')
print(f"Loaded {len(df)} rows.")

# Stringify both columns; missing cells become the literal text 'nan'.
df['Text content'] = df['Text content'].astype(str)
df['Title'] = df['Title'].astype(str)

# Zero-width / bidi control characters, written as a regex character class.
invisible_chars_pattern = r'[\u200B\u200C\u200D\u200E\u200F\u2066\u2067\u2068\u2069\u202A\u202B\u202C\u202D\u202E\uFEFF]'

# Split multi-line cells on newlines.
# <br> tags are turned into real newlines *here*, before the split; without
# this step a cell holding several <br>-separated lines is never exploded.
# Leading/trailing newlines are trimmed first so that a cell that is only
# "<br />" (or ends with one) yields a single row rather than extra empty rows.
df['Text content'] = (df['Text content']
                      .apply(lambda cell: re.sub(r'<br\s*/?>', '\n', cell,
                                                 flags=re.IGNORECASE).strip('\n'))
                      .str.split('\n'))
df = df.explode('Text content').reset_index(drop=True)

# 1-based row id, assigned once after exploding so every output row is unique.
df['ROW_ID'] = df.index + 1

print(f"After exploding lines: {len(df)} rows")

Loaded 11773 rows.
After exploding lines: 11773 rows


In [19]:
# Clean HTML and normalize
print("Cleaning HTML and normalizing...")

# Compile once; used for both columns below.
invisible_re = re.compile(invisible_chars_pattern)

# Poem lines: decode entities -> strip markup -> drop invisible characters.
lines_decoded = df['Text content'].map(clean_html_entities)
lines_plain = lines_decoded.map(strip_html_tags)
df['Text content'] = lines_plain.map(lambda s: invisible_re.sub('', s))

# Titles: decode entities -> drop invisible characters (tags are not stripped).
titles_decoded = df['Title'].map(clean_html_entities)
df['Title'] = titles_decoded.map(lambda s: invisible_re.sub('', s))

# Blank / 'nan' titles inherit the previous title; leading gaps become "Unknown".
titles_with_gaps = df['Title'].replace(['nan', 'NaN', ''], pd.NA)
df['Title'] = titles_with_gaps.ffill().fillna("Unknown")

print(f"Dataset has {len(df)} rows.")
print(df.head(10))

Cleaning HTML and normalizing...
Dataset has 11773 rows.
                            Text content            Title  ROW_ID
0        حبيـبتي حالـي من الشــوق دانــي  لا تسـأل مجـرّب       1
1     أشــرّق بحـبّــك .. واعـوّد أغـرّب  لا تسـأل مجـرّب       2
2                                         لا تسـأل مجـرّب       3
3      أنـانـي فْـ وصـلك لـكـن يالأنـاني  لا تسـأل مجـرّب       4
4  تـرى الأنانـيّـه فْـ طـبـعـك تـخــرّب  لا تسـأل مجـرّب       5
5                                         لا تسـأل مجـرّب       6
6           لحظات حبّـك حطّـمت لي كـياني  لا تسـأل مجـرّب       7
7        والحزن من بين الـمحـاني تـسـرّب  لا تسـأل مجـرّب       8
8                                         لا تسـأل مجـرّب       9
9       تـسـرّب وصـكّـت عـلـيه الـمـحاني  لا تسـأل مجـرّب      10


In [20]:
# Drop rows whose text is literally 'nan' (any letter case).
# Empty strings are kept on purpose: they mark stanza breaks.
nan_mask = df['Text content'].str.lower().eq('nan')
df = df.loc[~nan_mask].copy()

print(f"After removing nan: {len(df)} rows")

After removing nan: 11357 rows


In [21]:
# A new poem starts wherever the title differs from the previous row's title;
# the running count of those change points is the poem id.
title_changed = df['Title'].ne(df['Title'].shift())
df['poem_ID'] = title_changed.cumsum()

print(f"Total poems: {df['poem_ID'].nunique()}")
preview_cols = ['ROW_ID', 'poem_ID', 'Title', 'Text content']
print(df[preview_cols].head(20))

Total poems: 353
    ROW_ID  poem_ID            Title                             Text content
0        1        1  لا تسـأل مجـرّب          حبيـبتي حالـي من الشــوق دانــي
1        2        1  لا تسـأل مجـرّب       أشــرّق بحـبّــك .. واعـوّد أغـرّب
2        3        1  لا تسـأل مجـرّب                                         
3        4        1  لا تسـأل مجـرّب        أنـانـي فْـ وصـلك لـكـن يالأنـاني
4        5        1  لا تسـأل مجـرّب    تـرى الأنانـيّـه فْـ طـبـعـك تـخــرّب
5        6        1  لا تسـأل مجـرّب                                         
6        7        1  لا تسـأل مجـرّب             لحظات حبّـك حطّـمت لي كـياني
7        8        1  لا تسـأل مجـرّب          والحزن من بين الـمحـاني تـسـرّب
8        9        1  لا تسـأل مجـرّب                                         
9       10        1  لا تسـأل مجـرّب         تـسـرّب وصـكّـت عـلـيه الـمـحاني
10      11        1  لا تسـأل مجـرّب      كـنّـه لـوصـلك بالـجـروح يْـتـقـرّب
11      12        1  لا تسـأل مجـرّب           

In [22]:
# Add the search-friendly columns, then give the raw columns clearer names.
df = df.assign(
    Poem_line_cleaned=df['Text content'].map(clean_for_search),
    Title_cleaned=df['Title'].map(clean_for_search),
).rename(columns={
    'Text content': 'Poem_line_Raw',
    'Title': 'Title_Raw'
})

print("\nFinal columns:")
print(list(df.columns))


Final columns:
['Poem_line_Raw', 'Title_Raw', 'ROW_ID', 'poem_ID', 'Poem_line_cleaned', 'Title_cleaned']


In [23]:
# Check for odd-length poems.
# Rows whose cleaned text is empty (stanza breaks) are not counted.
has_text = df['Poem_line_cleaned'].str.len() > 0
df_non_empty = df[has_text]
poem_lengths = (
    df_non_empty
    .groupby('poem_ID')
    .size()
    .reset_index(name='length')
)

odd_poems = poem_lengths[poem_lengths['length'].mod(2).eq(1)]

if odd_poems.empty:
    print("✓ All poems have even line counts!")
else:
    # Attach each poem's first cleaned title and its lowest ROW_ID for lookup.
    first_titles = df.groupby('poem_ID')['Title_cleaned'].first().reset_index()
    first_row_ids = df.groupby('poem_ID')['ROW_ID'].min()

    odd_with_titles = odd_poems.merge(first_titles, on='poem_ID')
    odd_with_titles['min_ROW_ID'] = odd_with_titles['poem_ID'].map(first_row_ids)

    odd_with_titles.to_csv('odd_length_poems.csv', index=False, encoding='utf-8-sig')
    print(f"Found {len(odd_poems)} odd-length poems")
    print(odd_with_titles.head(10))

Found 4 odd-length poems
   poem_ID  length        Title_cleaned  min_ROW_ID
0      180      21           مرثيه زايد        7161
1      239     121          هدهد سليمان        9589
2      252      47             عام زايد       10199
3      288      47  عجايب وشنذاره وغياث       10981


In [24]:
# Write the final dataset with a fixed column order, then report summary stats.
output_cols = ['ROW_ID', 'poem_ID', 'Poem_line_Raw', 'Title_Raw', 'Poem_line_cleaned', 'Title_cleaned']
final_frame = df[output_cols]
final_frame.to_csv('shbm_poetry_CLEANED_FINAL.csv', index=False, encoding='utf-8-sig')

total_rows = len(df)
total_poems = df['poem_ID'].nunique()

print(f"\n✓ Saved {total_rows} rows to shbm_poetry_CLEANED_FINAL.csv")
print("\nFinal stats:")
print(f"  Total rows: {total_rows}")
print(f"  Total poems: {total_poems}")
print(f"  Avg lines per poem: {total_rows / total_poems:.1f}")


✓ Saved 11357 rows to shbm_poetry_CLEANED_FINAL.csv

Final stats:
  Total rows: 11357
  Total poems: 353
  Avg lines per poem: 32.2
