In [6]:
import pandas as pd
import re

# Load data
df = pd.read_csv('FAZ_POEMS_REVIEW-DONT DELETE.csv')
print(f"Loaded {len(df)} rows")
print(f"Total poems: {df['poem_ID'].nunique()}")

Loaded 11349 rows
Total poems: 353


In [7]:
# Remove rows with empty Poem_line_Raw
before = len(df)
df = df[df['Poem_line_Raw'].notna() & (df['Poem_line_Raw'].str.strip() != '')].copy()
df = df.reset_index(drop=True)
print(f"Removed {before - len(df)} empty rows")
print(f"Remaining: {len(df)} rows")

Removed 2921 empty rows
Remaining: 8428 rows


In [8]:
# Find rows with English characters (excluding X/x)
def has_english_except_x(text):
    if pd.isna(text):
        return False
    # Remove X and x firsts
    text_no_x = re.sub(r'[Xx]', '', str(text))
    # Check for any remaining English letters
    return bool(re.search(r'[A-WYZa-wyz]', text_no_x))

english_mask = (df['Poem_line_Raw'].apply(has_english_except_x) | 
                df['Title_Raw'].apply(has_english_except_x))

english_rows = df[english_mask].copy()

if len(english_rows) > 0:
    print(f"⚠ Found {len(english_rows)} rows with English characters:\n")
    print(english_rows[['ROW_ID', 'poem_ID', 'Title_Raw', 'Poem_line_Raw']].to_string(index=False))
    print(f"\n⚠ Review above - manually delete unwanted rows from CSV if needed")
    print("✓ Continuing to next cell...")
else:
    print("✓ No English characters found - approved!")

⚠ Found 3 rows with English characters:

 ROW_ID  poem_ID   Title_Raw                                   Poem_line_Raw
   9625      239 هدهد سليمان                  لـمـا تروبه Many feeling apear
   9626      239 هدهد سليمان      The night and darkness have many relations
   9627      239 هدهد سليمان والعالــــــــــــــــم نشوبه It goes and comes

⚠ Review above - manually delete unwanted rows from CSV if needed
✓ Continuing to next cell...


In [9]:
# Concatenate lines into bait (couplets) with \t separator
results = []

for poem_id in df['poem_ID'].unique():
    poem_lines = df[df['poem_ID'] == poem_id].sort_values('ROW_ID')
    title_raw = poem_lines.iloc[0]['Title_Raw']
    title_cleaned = poem_lines.iloc[0]['Title_cleaned']
    
    lines_raw = poem_lines['Poem_line_Raw'].tolist()
    lines_cleaned = poem_lines['Poem_line_cleaned'].tolist()
    
    # Pair lines into bait (2 lines per bait)
    for i in range(0, len(lines_raw), 2):
        if i + 1 < len(lines_raw):
            # Normal pair
            bait_raw = lines_raw[i] + "      " + lines_raw[i+1]
            bait_cleaned = lines_cleaned[i] + "      " + lines_cleaned[i+1]
        else:
            # Odd line (single)
            bait_raw = lines_raw[i]
            bait_cleaned = lines_cleaned[i]
        
        results.append({
            'poem_id': poem_id,
            'bait_index': i // 2 + 1,
            'title_raw': title_raw,
            'title_cleaned': title_cleaned,
            'bait_raw': bait_raw,
            'bait_cleaned': bait_cleaned
        })

bait_df = pd.DataFrame(results)

print(f"Created {len(bait_df)} bait from {len(df)} lines")
print(f"\nSample bait (first 5):\n")
for i in range(min(5, len(bait_df))):
    print(f"Poem {bait_df.iloc[i]['poem_id']} - Bait {bait_df.iloc[i]['bait_index']}:")
    print(f"  {bait_df.iloc[i]['bait_raw']}")
    print()

# Check for odd bait counts
odd_bait = bait_df[~bait_df['bait_raw'].str.contains('\t')]
if len(odd_bait) > 0:
    odd_poems = odd_bait.groupby('poem_id').size().reset_index(name='odd_bait_count')
    odd_poems = odd_poems.merge(
        bait_df.groupby('poem_id')['title_cleaned'].first().reset_index(),
        on='poem_id'
    )
    print(f"\n⚠ Found {len(odd_poems)} poems with odd lines:\n")
    print(odd_poems.to_string(index=False))
else:
    print("\n✓ All poems have even line counts - approved!")

Created 4215 bait from 8428 lines

Sample bait (first 5):

Poem 1 - Bait 1:
  حبيـبتي حالـي من الشــوق دانــي      أشــرّق بحـبّــك .. واعـوّد أغـرّب

Poem 1 - Bait 2:
  أنـانـي فْـ وصـلك لـكـن يالأنـاني      تـرى الأنانـيّـه فْـ طـبـعـك تـخــرّب

Poem 1 - Bait 3:
  لحظات حبّـك حطّـمت لي كـياني      والحزن من بين الـمحـاني تـسـرّب

Poem 1 - Bait 4:
  تـسـرّب وصـكّـت عـلـيه الـمـحاني      كـنّـه لـوصـلك بالـجـروح يْـتـقـرّب

Poem 1 - Bait 5:
  يا مْعـرّب الأنـسـاب خـدر العـياني      سِـلْـم العـرب لا تجهـله يـالـمـعـرّب


⚠ Found 353 poems with odd lines:

 poem_id  odd_bait_count                         title_cleaned
       1               6                          لا تسال مجرب
       2               9                             ثلاث مرات
       3              20                  هواء الامارات وثراها
       4              10                                  يتيم
       5              11                          مرت علي بالي
       6               8                           لا تلبس ذ

In [10]:
# Save final bait dataset
bait_df.to_csv('poems_bait_final.csv', index=False, encoding='utf-8-sig')
print(f"✓ Saved {len(bait_df)} bait to poems_bait_final.csv")

✓ Saved 4215 bait to poems_bait_final.csv


# Full Poem GROUP

In [None]:
# Load your CSV (replace 'your_file.csv' with actual path)
df = pd.read_csv('FAZ_POEMS_REVIEW.csv', sep=',')

# Group by poem_id and aggregate
grouped = df.groupby('poem_id').agg({
    'Row_ID': 'first',  # or min, doesn't matter if you just need one
    'Title_raw': 'first',
    'Poem_line_raw': '\n'.join,
    'Title_cleaned': 'first',
    'Poem_line_cleaned': '\n'.join
}).reset_index()

# Reorder columns to match original order
grouped = grouped[['poem_id', 'Row_ID', 'Title_raw', 'Poem_line_raw', 'Title_cleaned', 'Poem_line_cleaned']]

# Save to new CSV (optional)
grouped.to_csv('poems_grouped.csv', index=False)

# Display result
print(grouped)

     poem_id  Row_ID             Title_raw  \
0          1       1       لا تسـأل مجـرّب   
1          2       1         ثــلاث مـرّات   
2          3       1  هواء الإمارات وثراها   
3          4       1                  يتيم   
4          5       1         مرت على بـالي   
..       ...     ...                   ...   
348      349       1            زايد وجابر   
349      350       1                   وفا   
350      351       1                  أبـو   
351      352       1   أميرة قلوب الأنقياء   
352      353       1             إبن فرّاج   

                                         Poem_line_raw         Title_cleaned  \
0    حبيـبتي حالـي من الشــوق دانــي      أشــرّق ب...          لا تسال مجرب   
1    ثـــلاث مـرّات .. وانـت تـدوّر فـراقي      وثـ...             ثلاث مرات   
2    صاحبي يافاهمٍ قصدي ترى ما كل ديره      آتـمـشّ...  هواء الامارات وثراها   
3    يتيـم فـاقد شـوفة امّـه من سـنين      وابوه مـ...                  يتيم   
4    حبـيـبتي مـرّت على بـالـي الـيـوم      وْ 