# In [1]:
import pandas as pd

def _line_text(value):
    """Return *value* as text, mapping a missing cell (NaN/None) to ''."""
    # Blank CSV cells are read back as float NaN, which str.join() rejects.
    return '' if pd.isna(value) else str(value)


def _chunk_windows(n_lines, chunk_size, step):
    """Return ``(start, end, chunk_type)`` windows for a poem of *n_lines* lines."""
    # A poem shorter than one chunk is emitted whole.
    if n_lines < chunk_size:
        return [(0, n_lines, 'short_poem')]

    last_start = n_lines - chunk_size
    windows = [
        (start, start + chunk_size, 'regular')
        for start in range(0, last_start + 1, step)
    ]
    # If the regular stride does not land exactly on the end of the poem,
    # add one extra window anchored on the last line so no tail is lost.
    if windows[-1][0] != last_start:
        windows.append((last_start, n_lines, 'final'))
    return windows


def chunk_poems_with_overlap(input_csv, output_csv, *, chunk_size=4, overlap=1):
    """Split each poem of a line-per-row CSV into overlapping chunks.

    The input must contain the columns ``poem_id``, ``Row_ID``,
    ``Title_raw``, ``Title_cleaned``, ``Poem_line_raw`` and
    ``Poem_line_cleaned``. Rows are ordered by ``(poem_id, Row_ID)`` and each
    poem is chunked independently, so a chunk never spans two poems.

    Chunking rules (with the defaults: 4 lines per chunk, 1-line overlap):
    - regular chunks start every ``chunk_size - overlap`` lines;
    - a ``'final'`` chunk covering the last ``chunk_size`` lines is added
      when the regular stride does not end exactly on the last line;
    - a poem with fewer than ``chunk_size`` lines yields one
      ``'short_poem'`` chunk holding all of its lines;
    - missing (NaN) line cells are treated as empty lines so that line
      positions and Row_IDs stay aligned.

    Every other input column is carried over using the value from the
    poem's first row.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write (UTF-8 with BOM).
        chunk_size: Number of lines per chunk; must be >= 1.
        overlap: Lines shared by consecutive regular chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunk DataFrame that was written to *output_csv*.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range.
        KeyError: If a required column is missing from the input.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )
    step = chunk_size - overlap

    df = pd.read_csv(input_csv)

    required = ('poem_id', 'Row_ID', 'Title_raw', 'Title_cleaned',
                'Poem_line_raw', 'Poem_line_cleaned')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required column(s): {missing}")

    # Sort by poem_id and Row_ID to ensure correct line ordering.
    df = df.sort_values(['poem_id', 'Row_ID']).reset_index(drop=True)

    # Columns that hold one value per line; everything else is poem-level.
    per_line_cols = {'Row_ID', 'Poem_line_raw', 'Poem_line_cleaned'}
    poem_level_cols = [col for col in df.columns if col not in per_line_cols]

    # Fixed output layout, so even an empty input yields a CSV with headers.
    output_columns = list(dict.fromkeys([
        'poem_id', 'chunk_id', 'Title_raw', 'Title_cleaned',
        'Poem_line_raw', 'Poem_line_cleaned', 'Row_IDs_in_chunk',
        'chunk_type', *poem_level_cols,
    ]))

    output_rows = []
    for poem_id, group in df.groupby('poem_id'):
        cleaned_lines = [_line_text(v) for v in group['Poem_line_cleaned'].tolist()]
        raw_lines = [_line_text(v) for v in group['Poem_line_raw'].tolist()]
        row_ids = group['Row_ID'].tolist()
        # Poem-level values are taken from the poem's first row.
        poem_level = {col: group[col].iloc[0] for col in poem_level_cols}

        for start, end, chunk_type in _chunk_windows(len(cleaned_lines), chunk_size, step):
            output_rows.append({
                'poem_id': poem_id,
                'chunk_id': len(output_rows) + 1,  # global, 1-based
                'Title_raw': poem_level['Title_raw'],
                'Title_cleaned': poem_level['Title_cleaned'],
                'Poem_line_raw': '\n'.join(raw_lines[start:end]),
                'Poem_line_cleaned': '\n'.join(cleaned_lines[start:end]),
                'Row_IDs_in_chunk': ','.join(map(str, row_ids[start:end])),
                'chunk_type': chunk_type,
                **poem_level,  # Include all other original columns
            })

    output_df = pd.DataFrame(output_rows, columns=output_columns)

    # utf-8-sig writes a BOM so spreadsheet tools detect the encoding
    # (the poems contain Arabic text).
    output_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Successfully created {len(output_df)} chunks")
    print(f"Output saved to: {output_csv}")
    return output_df

# Script driver: chunk the cleaned poem-lines CSV and keep the resulting
# DataFrame in `result` for further inspection.
input_file, output_file = (
    "FAZ_POEMS_REVIEW - FAZ_POEMS_Lines_Cleaned.csv",
    "poems_chunked.csv",
)
result = chunk_poems_with_overlap(input_csv=input_file, output_csv=output_file)

# Output:
# Successfully created 1418 chunks
# Output saved to: poems_chunked.csv
