In [2]:
import pandas as pd
import json
from ast import literal_eval

def _text_or_empty(value):
    """Return *value* as a string, mapping a missing cell (NaN/None) to ''."""
    return '' if pd.isna(value) else str(value)


def _chunk_start_indices(n_lines, chunk_size=4, step=3):
    """Return the sorted start offsets of every chunk of one poem.

    A poem shorter than *chunk_size* gets a single chunk starting at 0.
    Otherwise starts advance by *step*, and the offset of the last full
    window is always included so the poem's tail is covered.
    """
    if n_lines < chunk_size:
        return [0]
    starts = set(range(0, n_lines - chunk_size + 1, step))
    starts.add(n_lines - chunk_size)
    return sorted(starts)


def _parse_list_cell(val):
    """Return the items encoded in a single list-column cell.

    Accepts a JSON array, a Python-literal list/tuple, a single JSON/Python
    object (kept as one item), or plain text (kept as one raw item).
    """
    if not isinstance(val, str):
        return [val]
    try:
        parsed = json.loads(val)
    except (ValueError, RecursionError):
        try:
            parsed = literal_eval(val)
            # Only accept literals that can be serialised again later on.
            json.dumps(parsed, sort_keys=True)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return [val]
        if not isinstance(parsed, (list, tuple, dict)):
            # Plain text that merely happens to look like a Python scalar.
            return [val]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A lone object or scalar is one item; never iterate its keys/characters.
    return [parsed]


def _aggregate_list_column(cells, as_json):
    """Merge the items of several cells into one de-duplicated list string.

    Items are de-duplicated and ordered by their string form (dicts by their
    key-sorted JSON form). Item types are kept as parsed. The result is a
    JSON array when *as_json* is true, otherwise a Python list repr.
    """
    unique = {}
    for cell in cells:
        for item in _parse_list_cell(cell):
            if isinstance(item, dict):
                key = json.dumps(item, sort_keys=True, ensure_ascii=False)
                # Canonical key order so equal dicts always render identically.
                item = json.loads(key)
            else:
                key = str(item)
            unique.setdefault(key, item)
    items = [unique[key] for key in sorted(unique)]
    if as_json:
        return json.dumps(items, ensure_ascii=False)
    return str(items)


def chunk_semantic_search_dataset(input_csv: str, output_csv: str) -> pd.DataFrame:
    """
    Chunk semantic search dataset with:
    - 4 lines per chunk
    - 1-line overlap between chunks (step=3)
    - Final chunk always contains last 4 lines
    - Strict poem boundaries (no cross-poem chunks)
    - Preserves all original columns, aggregating per-line list columns
      without duplicates
    - Poems shorter than 4 lines become a single 'short_poem' chunk

    Args:
        input_csv: Path of the per-line CSV (read as utf-8-sig). It must
            contain 'poem_id', 'Row_ID', 'Poem_line_cleaned', 'Poem_line_raw'
            and 'summary'; poem-level and list columns are optional.
        output_csv: Path the chunked CSV is written to (utf-8-sig, no index).

    Returns:
        The DataFrame that was written, one row per chunk.

    Raises:
        KeyError: If a required column is missing from the input.
    """
    chunk_size = 4
    step = 3

    df = pd.read_csv(input_csv, encoding='utf-8-sig')
    df = df.sort_values(['poem_id', 'Row_ID']).reset_index(drop=True)

    # Poem-level columns (assumed constant within a poem; first value is used)
    poem_level_cols = ['Title_raw', 'Title_cleaned', 'قافية', 'روي', 'البحر', 'وصل', 'حركة', 'Status', 'confidence', 'تصنيف']

    # Per-line list columns (JSON or Python-literal list strings, or plain text)
    list_cols = ['شخص', 'sentiments', 'أماكن', 'أحداث', 'مواضيع']

    # Columns with no dedicated handling: the chunk's first value is copied.
    handled = {'poem_id', 'Row_ID', 'Poem_line_raw', 'Poem_line_cleaned', 'summary'}
    handled.update(poem_level_cols, list_cols)
    other_col_names = [col for col in df.columns if col not in handled]
    present_list_cols = [col for col in list_cols if col in df.columns]

    output_rows = []
    chunk_id_counter = 1

    for poem_id, group in df.groupby('poem_id'):
        # Empty cells are read as NaN; map them to '' so the joins cannot fail.
        lines = [_text_or_empty(v) for v in group['Poem_line_cleaned']]
        raw_lines = [_text_or_empty(v) for v in group['Poem_line_raw']]
        summaries = [_text_or_empty(v) for v in group['summary']]
        row_ids = group['Row_ID'].tolist()
        full_poem = '\n'.join(lines)

        poem_values = {col: group[col].iloc[0] if col in group.columns else '' for col in poem_level_cols}

        n_lines = len(lines)
        is_short = n_lines < chunk_size

        for start_idx in _chunk_start_indices(n_lines, chunk_size, step):
            end_idx = start_idx + min(chunk_size, n_lines - start_idx)
            chunk_group = group.iloc[start_idx:end_idx]

            chunk_text = '\n'.join(lines[start_idx:end_idx])
            chunk_raw_text = '\n'.join(raw_lines[start_idx:end_idx])
            # Lines without a summary are skipped rather than adding blanks.
            summary_text = ' '.join(s for s in summaries[start_idx:end_idx] if s)
            summary_chunked = summary_text + "---" + chunk_text
            row_id_str = ','.join(map(str, row_ids[start_idx:end_idx]))

            if is_short:
                chunk_type = 'short_poem'
            elif start_idx == n_lines - chunk_size:
                chunk_type = 'final'
            else:
                chunk_type = 'regular'

            # 'شخص' is emitted as JSON (readable Arabic); the other list
            # columns are emitted as a Python list repr.
            agg_lists = {
                col: _aggregate_list_column(chunk_group[col].dropna(), as_json=(col == 'شخص'))
                for col in present_list_cols
            }

            chunk_row = {
                'poem_id': poem_id,
                'chunk_id': chunk_id_counter,
                'Title_cleaned': poem_values['Title_cleaned'],
                'poem_lines_subset': chunk_text,
                'Summary_chunked': summary_chunked,
                'Title_raw': poem_values['Title_raw'],
                'Poem_line_raw': chunk_raw_text,
                'Full_poem': full_poem,
                'Row_IDs_in_chunk': row_id_str,
                'chunk_type': chunk_type,
                **poem_values,  # all poem-level columns
                **agg_lists,    # aggregated list columns
            }
            chunk_row.update({col: chunk_group[col].iloc[0] for col in other_col_names})

            output_rows.append(chunk_row)
            chunk_id_counter += 1

    output_df = pd.DataFrame(output_rows)
    output_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Successfully created {len(output_df)} chunks from {len(df)} original lines")
    print(f"Output saved to: {output_csv}")
    return output_df

# Run the chunker on the V2 semantic-search files: per-line CSV in, chunked CSV out.
input_file, output_file = (
    "06-chunking exact search dataset for semantic V2.csv",
    "07-chunked for semantic V2.csv",
)

result = chunk_semantic_search_dataset(input_csv=input_file, output_csv=output_file)

Successfully created 1418 chunks from 4215 original lines
Output saved to: 07-chunked for semantic V2.csv
