In [4]:
import pandas as pd

# Per-poem metadata columns copied onto every chunk, in output order.
# A column that is absent from the input is emitted as an empty string.
_POEM_METADATA_COLUMNS = (
    'summary', 'قافية', 'البحر', 'وصل', 'حركة', 'نوع', 'شخص',
    'sentiments', 'أماكن', 'أحداث', 'مواضيع', 'تصنيف',
)

# Columns this function computes or handles explicitly. Anything else in the
# input is passed through unchanged (first value per poem).
_HANDLED_COLUMNS = frozenset(
    (
        'poem_id', 'chunk_id', 'Row_IDs_in_chunk', 'chunk_type',
        'Row_ID', 'Poem_line_raw', 'Poem_line_cleaned',
        'Title_raw', 'Title_cleaned',
    )
    + _POEM_METADATA_COLUMNS
)


def _as_text_lines(values):
    """Return *values* as a list of strings, mapping missing cells to ''."""
    # A NaN cell is a float; str.join would raise TypeError on it.
    return ['' if pd.isna(value) else str(value) for value in values]


def _chunk_windows(n_lines, chunk_size, step):
    """Yield (start, end, chunk_type) slices covering a poem of *n_lines*."""
    if n_lines < chunk_size:
        # Too short for a full window: the whole poem is one chunk.
        yield 0, n_lines, 'short_poem'
        return

    last_start = n_lines - chunk_size
    for start in range(0, last_start + 1, step):
        yield start, start + chunk_size, 'regular'

    # If the regular stride does not land exactly on the last window, add a
    # trailing window so the poem's final lines are always covered.
    if last_start % step:
        yield last_start, n_lines, 'final'


def chunk_semantic_search_dataset(input_csv, output_csv, chunk_size=4, overlap=1):
    """
    Split a line-per-row poem dataset into overlapping multi-line chunks.

    With the defaults this produces:
    - 4 lines per chunk
    - 1-line overlap between consecutive chunks (step=3)
    - A final chunk that always contains the poem's last 4 lines
    - Strict poem boundaries (no cross-poem chunks)
    - A single 'short_poem' chunk for poems shorter than one window

    Args:
        input_csv: Path of the input CSV. It must contain the columns
            'poem_id', 'Row_ID', 'Poem_line_cleaned', 'Poem_line_raw',
            'Title_raw' and 'Title_cleaned'.
        output_csv: Path the chunked CSV is written to (UTF-8 with BOM).
        chunk_size: Number of lines per chunk (default 4).
        overlap: Lines shared by consecutive regular chunks (default 1);
            must satisfy 0 <= overlap < chunk_size.

    Returns:
        The chunk DataFrame that was written to *output_csv*. Columns are, in
        order: poem_id, chunk_id, Title_raw, Title_cleaned, Poem_line_raw,
        Poem_line_cleaned, Row_IDs_in_chunk, chunk_type, the per-poem
        metadata columns ('' when absent from the input), then every other
        input column (first value per poem).

    Raises:
        ValueError: If chunk_size or overlap is out of range.
        KeyError: If a required input column is missing.

    Notes:
        - Missing line cells are treated as empty lines instead of raising.
        - Input columns named 'chunk_id', 'chunk_type' or 'Row_IDs_in_chunk'
          are superseded by the values computed here.
        - Rows with a missing poem_id are skipped, because pandas groupby
          drops NaN keys by default.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    step = chunk_size - overlap

    # Read input CSV
    df = pd.read_csv(input_csv, encoding='utf-8-sig')

    # Sort by poem_id and Row_ID to ensure correct ordering
    df = df.sort_values(['poem_id', 'Row_ID']).reset_index(drop=True)

    passthrough_cols = [col for col in df.columns if col not in _HANDLED_COLUMNS]

    output_rows = []

    # Process each poem separately
    for poem_id, group in df.groupby('poem_id'):
        lines = _as_text_lines(group['Poem_line_cleaned'])
        raw_lines = _as_text_lines(group['Poem_line_raw'])
        row_ids = group['Row_ID'].tolist()

        # Titles are taken from the poem's first row.
        title_raw = group['Title_raw'].iloc[0]
        title_cleaned = group['Title_cleaned'].iloc[0]

        # Per-poem values appended after the chunk-specific fields.
        trailing = {
            col: group[col].iloc[0] if col in group.columns else ''
            for col in _POEM_METADATA_COLUMNS
        }
        for col in passthrough_cols:
            trailing[col] = group[col].iloc[0]

        for start, end, chunk_type in _chunk_windows(len(lines), chunk_size, step):
            output_rows.append({
                'poem_id': poem_id,
                # chunk ids are global and 1-based across all poems
                'chunk_id': len(output_rows) + 1,
                'Title_raw': title_raw,
                'Title_cleaned': title_cleaned,
                'Poem_line_raw': '\n'.join(raw_lines[start:end]),
                'Poem_line_cleaned': '\n'.join(lines[start:end]),
                'Row_IDs_in_chunk': ','.join(map(str, row_ids[start:end])),
                'chunk_type': chunk_type,
                **trailing,
            })

    # Create output DataFrame
    output_df = pd.DataFrame(output_rows)

    # Save to CSV with UTF-8 encoding for Arabic text
    output_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Successfully created {len(output_df)} chunks from {len(df)} original lines")
    print(f"Output saved to: {output_csv}")
    return output_df

# Run the chunking step for this cell: read the line-level dataset and write
# the chunked dataset next to it, keeping the resulting frame in `result`.
input_file, output_file = (
    "06-chunking exact search dataset for semantic.csv",
    "07-chunked for semantic.csv",
)

result = chunk_semantic_search_dataset(input_csv=input_file, output_csv=output_file)

Successfully created 1418 chunks from 4215 original lines
Output saved to: 07-chunked for semantic.csv
