In [None]:
original = pd.read_csv('./CSV/FAZ3_POEMS_Exact_Search - cleaned Full_poems.csv')
target_csv = pd.read_csv('./CSV/FAZ3_POEMS_Exact_Search - Exact_search.csv')

In [7]:
import pandas as pd

def fix_csv_order(guide_path, target_path, output_path):
    """
    Reorder target CSV rows to match guide CSV's correct poem line order.
    
    Args:
        guide_path: Path to guide CSV (has full poems in multi-line cells)
        target_path: Path to target CSV (has individual lines as rows, wrong order)
        output_path: Path to save fixed CSV
    """
    # Read CSVs
    guide_df = pd.read_csv(guide_path)
    target_df = pd.read_csv(target_path)
    
    print("="*70)
    print("ARABIC POETRY CSV LINE REORDERER")
    print("="*70)
    
    # Convert guide poem_id to int, handling non-numeric values
    guide_df['poem_id'] = pd.to_numeric(guide_df['poem_id'], errors='coerce')
    invalid_rows = guide_df['poem_id'].isna().sum()
    
    if invalid_rows > 0:
        print(f"\n⚠ Dropping {invalid_rows} guide rows with non-numeric poem_id")
        guide_df = guide_df.dropna(subset=['poem_id'])
    
    guide_df['poem_id'] = guide_df['poem_id'].astype(int)
    
    # Show statistics
    common_poems = len(set(guide_df['poem_id']) & set(target_df['poem_id']))
    print(f"\nGuide poems: {len(guide_df)}")
    print(f"Target rows: {len(target_df)}")
    print(f"Common poem_ids: {common_poems}")
    
    # Build mapping: (poem_id, line_text) -> correct_line_order
    line_order_map = {}
    
    for _, guide_row in guide_df.iterrows():
        poem_id = guide_row['poem_id']
        full_poem = str(guide_row['Poem_line_cleaned'])
        lines = [l.strip() for l in full_poem.split('\n') if l.strip()]
        
        for line_num, line in enumerate(lines, start=1):
            key = (poem_id, line)
            line_order_map[key] = line_num
    
    print(f"Mapping entries created: {len(line_order_map)}")
    
    # Assign correct line order to each target row
    def get_order(row):
        key = (row['poem_id'], row['Poem_line_cleaned'].strip())
        return line_order_map.get(key, 999999)
    
    target_df['line_order'] = target_df.apply(get_order, axis=1)
    
    # Calculate matching statistics
    matched = (target_df['line_order'] != 999999).sum()
    unmatched = (target_df['line_order'] == 999999).sum()
    
    print(f"\n{'='*70}")
    print("MATCHING RESULTS")
    print(f"{'='*70}")
    print(f"✓ Matched: {matched}/{len(target_df)} ({100*matched/len(target_df):.1f}%)")
    print(f"✗ Unmatched: {unmatched}")
    
    # Show before/after for poem 1
    test_poem_id = 1
    if test_poem_id in target_df['poem_id'].values:
        print(f"\n{'='*70}")
        print(f"POEM {test_poem_id} - BEFORE (wrong order)")
        print(f"{'='*70}")
        
        # Get original order before sorting
        original_target = pd.read_csv(target_path)
        poem1_before = original_target[original_target['poem_id'] == test_poem_id]
        for _, row in poem1_before.iterrows():
            print(f"Row_ID {row['Row_ID']}: {row['Poem_line_cleaned'][:55]}...")
    
    # Sort by poem_id first, then by correct line order
    target_df = target_df.sort_values(['poem_id', 'line_order']).reset_index(drop=True)
    
    # Renumber Row_ID globally from 1
    target_df['Row_ID'] = range(1, len(target_df) + 1)
    
    # Drop helper column
    target_df = target_df.drop('line_order', axis=1)
    
    # Save fixed CSV
    target_df.to_csv(output_path, index=False, encoding='utf-8')
    
    # Show after for poem 1
    if test_poem_id in target_df['poem_id'].values:
        print(f"\n{'='*70}")
        print(f"POEM {test_poem_id} - AFTER (correct order)")
        print(f"{'='*70}")
        poem1_after = target_df[target_df['poem_id'] == test_poem_id]
        for _, row in poem1_after.iterrows():
            print(f"Row_ID {row['Row_ID']}: {row['Poem_line_cleaned'][:55]}...")
    
    print(f"\n{'='*70}")
    print(f"✓ SUCCESS! Fixed CSV saved to: {output_path}")
    print(f"{'='*70}\n")
    
    return target_df


# Example usage
if __name__ == "__main__":
    guide_file = "./CSV/FAZ3_POEMS_Exact_Search - cleaned Full_poems.csv"
    target_file = "./CSV/FAZ3_POEMS_Exact_Search - Exact_search.csv"
    output_file = "fixed.csv"
    
    result = fix_csv_order(guide_file, target_file, output_file)

ARABIC POETRY CSV LINE REORDERER

⚠ Dropping 1 guide rows with non-numeric poem_id

Guide poems: 352
Target rows: 4214
Common poem_ids: 351
Mapping entries created: 4189


AttributeError: 'float' object has no attribute 'strip'

In [8]:
import pandas as pd

def fix_csv_order(guide_path, target_path, output_path):
    """
    Reorder target CSV rows to match guide CSV's correct poem line order.
    """
    # Read CSVs
    guide_df = pd.read_csv(guide_path)
    target_df = pd.read_csv(target_path)
    
    print("="*70)
    print("ARABIC POETRY CSV LINE REORDERER")
    print("="*70)
    
    # Convert guide poem_id to int, handling non-numeric values
    guide_df['poem_id'] = pd.to_numeric(guide_df['poem_id'], errors='coerce')
    invalid_rows = guide_df['poem_id'].isna().sum()
    
    if invalid_rows > 0:
        print(f"\n⚠ Dropping {invalid_rows} guide rows with non-numeric poem_id")
        guide_df = guide_df.dropna(subset=['poem_id'])
    
    guide_df['poem_id'] = guide_df['poem_id'].astype(int)
    
    # Check for missing values in target
    missing_lines = target_df['Poem_line_cleaned'].isna().sum()
    if missing_lines > 0:
        print(f"⚠ Warning: {missing_lines} rows in target have missing Poem_line_cleaned")
        print(f"  These rows will be placed at the end")
    
    # Show statistics
    common_poems = len(set(guide_df['poem_id']) & set(target_df['poem_id']))
    print(f"\nGuide poems: {len(guide_df)}")
    print(f"Target rows: {len(target_df)}")
    print(f"Common poem_ids: {common_poems}")
    
    # Build mapping: (poem_id, line_text) -> correct_line_order
    line_order_map = {}
    
    for _, guide_row in guide_df.iterrows():
        poem_id = guide_row['poem_id']
        full_poem = str(guide_row['Poem_line_cleaned'])
        lines = [l.strip() for l in full_poem.split('\n') if l.strip()]
        
        for line_num, line in enumerate(lines, start=1):
            key = (poem_id, line)
            line_order_map[key] = line_num
    
    print(f"Mapping entries created: {len(line_order_map)}")
    
    # Assign correct line order to each target row
    def get_order(row):
        # Handle missing/NaN values
        if pd.isna(row['Poem_line_cleaned']):
            return 999999
        
        key = (row['poem_id'], str(row['Poem_line_cleaned']).strip())
        return line_order_map.get(key, 999999)
    
    target_df['line_order'] = target_df.apply(get_order, axis=1)
    
    # Calculate matching statistics
    matched = (target_df['line_order'] != 999999).sum()
    unmatched = (target_df['line_order'] == 999999).sum()
    
    print(f"\n{'='*70}")
    print("MATCHING RESULTS")
    print(f"{'='*70}")
    print(f"✓ Matched: {matched}/{len(target_df)} ({100*matched/len(target_df):.1f}%)")
    print(f"✗ Unmatched: {unmatched}")
    
    if unmatched > 0:
        print(f"\nUnmatched rows will be sorted to the end")
        # Show some unmatched examples
        unmatched_samples = target_df[target_df['line_order'] == 999999].head(3)
        print("\nFirst 3 unmatched rows:")
        for _, row in unmatched_samples.iterrows():
            line_text = row['Poem_line_cleaned'] if pd.notna(row['Poem_line_cleaned']) else "[MISSING]"
            print(f"  poem_id={row['poem_id']}, Row_ID={row['Row_ID']}: {str(line_text)[:50]}...")
    
    # Show before/after for poem 1
    test_poem_id = 1
    if test_poem_id in target_df['poem_id'].values:
        print(f"\n{'='*70}")
        print(f"POEM {test_poem_id} - BEFORE (wrong order)")
        print(f"{'='*70}")
        
        # Get original order before sorting
        original_target = pd.read_csv(target_path)
        poem1_before = original_target[original_target['poem_id'] == test_poem_id]
        for _, row in poem1_before.iterrows():
            line = row['Poem_line_cleaned'] if pd.notna(row['Poem_line_cleaned']) else "[MISSING]"
            print(f"Row_ID {row['Row_ID']}: {str(line)[:55]}...")
    
    # Sort by poem_id first, then by correct line order
    target_df = target_df.sort_values(['poem_id', 'line_order']).reset_index(drop=True)
    
    # Renumber Row_ID globally from 1
    target_df['Row_ID'] = range(1, len(target_df) + 1)
    
    # Drop helper column
    target_df = target_df.drop('line_order', axis=1)
    
    # Save fixed CSV
    target_df.to_csv(output_path, index=False, encoding='utf-8')
    
    # Show after for poem 1
    if test_poem_id in target_df['poem_id'].values:
        print(f"\n{'='*70}")
        print(f"POEM {test_poem_id} - AFTER (correct order)")
        print(f"{'='*70}")
        poem1_after = target_df[target_df['poem_id'] == test_poem_id]
        for _, row in poem1_after.iterrows():
            line = row['Poem_line_cleaned'] if pd.notna(row['Poem_line_cleaned']) else "[MISSING]"
            print(f"Row_ID {row['Row_ID']}: {str(line)[:55]}...")
    
    print(f"\n{'='*70}")
    print(f"✓ SUCCESS! Fixed CSV saved to: {output_path}")
    print(f"{'='*70}\n")
    
    return target_df


# Run
guide_file = "./CSV/FAZ3_POEMS_Exact_Search - cleaned Full_poems.csv"
target_file = "./CSV/FAZ3_POEMS_Exact_Search - Exact_search.csv"
output_file = "fixed.csv"

result = fix_csv_order(guide_file, target_file, output_file)

ARABIC POETRY CSV LINE REORDERER

⚠ Dropping 1 guide rows with non-numeric poem_id
  These rows will be placed at the end

Guide poems: 352
Target rows: 4214
Common poem_ids: 351
Mapping entries created: 4189

MATCHING RESULTS
✓ Matched: 4180/4214 (99.2%)
✗ Unmatched: 34

Unmatched rows will be sorted to the end

First 3 unmatched rows:
  poem_id=2.0, Row_ID=15: اصعب سؤال ف حياه العاشق الشاقي      اللي طرحته علي...
  poem_id=14.0, Row_ID=172: السؤال اللي يراودني علي كل اتجاه      عند مثلك سلع...
  poem_id=68.0, Row_ID=967: والحين من شهرين من صار ماصار      وانا اتمني يا حب...

POEM 1 - BEFORE (wrong order)
Row_ID 1: اناني ف وصلك لكن يالاناني      تري الانانيه ف طبعك تخرب...
Row_ID 2: لحظات حبك حطمت لي كياني      والحزن من بين المحاني تسرب...
Row_ID 3: حبيبتي حالي من الشوق داني      اشرق بحبك .. واعود اغرب...
Row_ID 4: تسرب وصكت عليه المحاني      كنه لوصلك بالجروح يتقرب...
Row_ID 5: شربت حبك لين فاض الحناني      وقامت عروقي من غلاك تشرب...
Row_ID 6: يا معرب الانساب خدر العياني      سلم 