In [2]:
import re
import pandas as pd

# File locations used by this cell:
#   INPUT_CSV  - CSV that is read with pandas before cleaning
#   OUTPUT_CSV - CSV that receives the frame with the added cleaned columns
INPUT_CSV: str = "shbm_poetry_cleaned_ID.csv"
OUTPUT_CSV: str = "poems_rows.csv"

# Arabic cleaning regexes
# Combining marks (harakat, Quranic annotation signs, superscript alef).
TASHKEEL_RE = re.compile(r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]+')
# Tatweel / kashida: purely typographic elongation, carries no letter value.
TATWEEL_RE = re.compile(r'\u0640+')
# Alef with hamza above / hamza below / madda -> bare alef.
ALEF_NORMALIZE_RE = re.compile(r'[أإآ]')
# Anything that is neither ASCII whitespace nor a letter in U+0621..U+064A.
NON_ARABIC_RE = re.compile(r'[^ \t\n\r\u0621-\u064A]+')
# Zero-width joiners, bidi controls, word joiner and BOM: these can sit
# *inside* a word, so they must be deleted outright rather than turned
# into a separator.
_INVISIBLE_RE = re.compile(r'[\u200C-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]+')


def clean_arabic(text: str) -> str:
    """Normalize an Arabic string down to bare letters separated by single spaces.

    Steps, in order:
      1. Map the alef variants matched by ``ALEF_NORMALIZE_RE`` to a bare alef.
      2. Drop diacritics (``TASHKEEL_RE``) and tatweel (``TATWEEL_RE``).
      3. Drop invisible formatting characters without leaving a gap.
      4. Replace every remaining run of non-Arabic characters (punctuation,
         digits, Latin text, non-ASCII whitespace such as NBSP) with a space,
         so that words separated only by such characters are not fused.
      5. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The raw string. Any non-``str`` value (``None``, ``NaN`` from
            pandas, numbers, ...) is treated as missing.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a ``str`` or nothing
        survives cleaning.
    """
    if not isinstance(text, str):
        return ""
    s = ALEF_NORMALIZE_RE.sub('ا', text)
    s = TASHKEEL_RE.sub('', s)
    s = TATWEEL_RE.sub('', s)
    s = _INVISIBLE_RE.sub('', s)
    # A space (not '') keeps "word،word" or "word\u00A0word" as two words.
    s = NON_ARABIC_RE.sub(' ', s)
    # Only ASCII whitespace and Arabic letters remain; split/join collapses
    # whitespace runs and trims the ends in one pass.
    return ' '.join(s.split())

# Read the input with every column forced to string dtype.
df = pd.read_csv(INPUT_CSV, dtype=str)


def _fill_title_down(titles):
    """Treat empty-string titles as missing, then carry the last seen title forward."""
    return titles.replace("", pd.NA).ffill()


# Propagate each title down the rows that share its poem_id.
# NOTE(review): assumes the title sits on the first row of each poem, as the
# previewed rows suggest - confirm for the full file.
df["Title"] = df.groupby("poem_id")["Title"].transform(_fill_title_down)

# Add a cleaned companion column for the verse text and for the title
# (order matters: it fixes the column order of the written CSV).
for source_col, cleaned_col in (
    ("Text content", "Text_cleaned"),
    ("Title", "Title_cleaned"),
):
    df[cleaned_col] = df[source_col].map(clean_arabic)

# Write the result without the pandas row index.
df.to_csv(OUTPUT_CSV, index=False)

print("✅ Cleaning complete. Saved to:", OUTPUT_CSV)


✅ Cleaning complete. Saved to: poems_rows.csv


In [2]:
import pandas as pd

# Re-read the input CSV with pandas' default type inference (no dtype
# override) and preview its first five rows as the cell output.
df2 = pd.read_csv(filepath_or_buffer="shbm_poetry_cleaned_ID.csv")
df2.head(n=5)

Unnamed: 0,Text content,Title,poem_id
0,حبيـبتي حالـي من الشــوق دانــي,لا تسـأل مجـرّب,1
1,أشــرّق بحـبّــك .. واعـوّد أغـرّب,,1
2,أنـانـي فْـ وصـلك لـكـن يالأنـاني,,1
3,تـرى الأنانـيّـه فْـ طـبـعـك تـخــرّب,,1
4,لحظات حبّـك حطّـمت لي كـياني,,1


In [7]:
def _count_words(line) -> int:
    """Number of whitespace-separated tokens in ``line`` (coerced to ``str``)."""
    return len(str(line).split())


# Missing verses become "" so they contribute zero words to the total.
verse_lines = df2["Text content"].fillna("")
total_words = verse_lines.map(_count_words).sum()

print(total_words)

50230
