In [None]:
import pandas as pd
import math
import ast
import json

def merge_metadata_values(series_list, col_name):
    """Merge one column's values from several rows into a single value.

    Args:
        series_list: pandas Series holding the column's values for the rows
            being merged (one chunk of a poem).
        col_name: Name of the column; selects the merge strategy.

    Returns:
        * ``'Poem_line_cleaned'``: the first value of the series unchanged,
          or ``""`` when the series is empty.
        * List-like columns (see ``list_columns`` below): a list of the
          unique items in first-seen order. String values starting with
          ``[`` or ``{`` are parsed with ``ast.literal_eval``; parsed lists
          are flattened, anything else is kept as the original value.
        * Every other column: a string. A single distinct non-empty value is
          returned as is; for ``'summary'`` the first non-empty value wins;
          otherwise the distinct values are joined with ``", "`` in
          first-seen order. ``""`` when there is no non-empty value.
    """
    # Only exclude the source column that we're chunking from
    if col_name in ['Poem_line_cleaned']:
        return series_list.iloc[0] if not series_list.empty else ""

    # Missing (NaN/None) and empty-string cells never contribute to a merge.
    present = [v for v in series_list if pd.notna(v) and v != ""]

    # NOTE(review): the recorded notebook output lists a 'شخص' column but no
    # 'person' column - confirm whether 'شخص' should be handled here too.
    list_columns = ['person', 'sentiments', 'أماكن', 'أحداث', 'مواضيع']

    if col_name in list_columns:
        all_values = []
        for val in present:
            parsed = val
            if isinstance(val, str) and val.startswith(('[', '{')):
                try:
                    parsed = ast.literal_eval(val)
                except (ValueError, SyntaxError, TypeError,
                        MemoryError, RecursionError):
                    # Not a valid Python literal: keep the raw text.
                    parsed = val
            if isinstance(parsed, list):
                all_values.extend(parsed)
            else:
                all_values.append(val)

        # Remove duplicates while preserving order
        unique_values = []
        seen = set()
        for item in all_values:
            if isinstance(item, dict):
                try:
                    # sort_keys makes equal dicts compare equal regardless
                    # of key order.
                    item_key = json.dumps(item, sort_keys=True)
                except (TypeError, ValueError):
                    # Dict is not JSON-serialisable; fall back to its text.
                    item_key = str(item)
            else:
                item_key = str(item)
            if item_key not in seen:
                seen.add(item_key)
                unique_values.append(item)
        return unique_values

    # Simple text columns: dict.fromkeys deduplicates while keeping
    # first-seen order, so the joined result is stable across runs
    # (a plain set() would reorder it from run to run).
    texts = (str(v) for v in present)
    unique_values = list(dict.fromkeys(t for t in texts if t))

    if not unique_values:
        return ""
    if len(unique_values) == 1 or col_name == 'summary':
        # For summary, the first non-empty value wins over later variants.
        return unique_values[0]
    return ", ".join(unique_values)

# Sliding-window parameters: every chunk holds up to CHUNK_SIZE consecutive
# lines of a poem and shares CHUNK_OVERLAP lines with the previous chunk.
CHUNK_SIZE = 4
CHUNK_OVERLAP = 2
CHUNK_STEP = CHUNK_SIZE - CHUNK_OVERLAP
if CHUNK_STEP <= 0:
    raise ValueError("CHUNK_OVERLAP must be smaller than CHUNK_SIZE")

df = pd.read_csv("06-chunking exact search dataset for semantic.csv")  # Replace with your actual filename

required = ['poem_id', 'Row_ID', 'Title_cleaned', 'Poem_line_cleaned']
for col in required:
    if col not in df.columns:
        raise ValueError(f"Missing column: {col}")

# Row_ID order within a poem is the line order used for chunking.
df = df.sort_values(['poem_id', 'Row_ID']).reset_index(drop=True)
df['Poem_line_cleaned'] = df['Poem_line_cleaned'].fillna('').astype(str)

# Columns written explicitly into each record (or deliberately dropped);
# every other input column is merged per chunk by merge_metadata_values.
explicit_cols = {'poem_id', 'Row_ID', 'Title_cleaned', 'Poem_line_cleaned', 'Poem_line_raw', 'Title_raw'}
metadata_cols = [col for col in df.columns if col not in explicit_cols]
has_raw_lines = 'Poem_line_raw' in df.columns
has_raw_title = 'Title_raw' in df.columns

all_records = []
poem_full_map = {}
chunk_counter = 1

# groupby only yields poem_ids that have at least one row, so no
# empty-poem special case is needed.
for poem_id, group in df.groupby('poem_id'):
    titles = group['Title_cleaned'].dropna()
    title = titles.iloc[0] if not titles.empty else "Untitled"
    n = len(group)

    # Build the full poem from the original lines. Concatenating the chunks
    # instead would repeat every overlapping line.
    poem_full_map[poem_id] = '\n'.join(group['Poem_line_cleaned'])

    # A poem with <= CHUNK_SIZE lines yields exactly one chunk; longer poems
    # yield overlapping windows.
    i = 0
    while i < n:
        end_idx = min(i + CHUNK_SIZE, n)
        chunk_group = group.iloc[i:end_idx]

        if has_raw_lines:
            # Poem_line_raw is not pre-cleaned, so map missing cells to ""
            # rather than letting str.join fail on a float NaN.
            raw_lines = "\n".join(
                "" if pd.isna(v) else str(v) for v in chunk_group['Poem_line_raw']
            )
        else:
            raw_lines = ""

        base_record = {
            'poem_id': poem_id,
            'chunk_id': chunk_counter,
            'Title_cleaned': title,
            # Clean lines joined with newlines, no tags or iterators
            'poem_lines_subset': '\n'.join(chunk_group['Poem_line_cleaned']),
            'Poem_line_raw': raw_lines,
            'Title_raw': merge_metadata_values(chunk_group['Title_raw'], 'Title_raw') if has_raw_title else "",
        }

        # Add ALL remaining original columns, merged across the chunk's rows
        for col in metadata_cols:
            base_record[col] = merge_metadata_values(chunk_group[col], col)

        all_records.append(base_record)
        chunk_counter += 1

        # Stop once a window has reached the last line: any further window
        # would be fully contained in the one just emitted.
        if end_idx >= n:
            break
        i += CHUNK_STEP

# Convert to DataFrame
output_df = pd.DataFrame(all_records)

# Add the Full_poem column to the dataframe
output_df['Full_poem'] = output_df['poem_id'].map(poem_full_map)

# Reorder columns to have poem_id, chunk_id first, then Title_cleaned, poem_lines_subset, Full_poem, then others
column_order = ['poem_id', 'chunk_id', 'Title_cleaned', 'poem_lines_subset', 'Full_poem']
remaining_cols = [col for col in output_df.columns if col not in column_order]
final_column_order = column_order + remaining_cols

output_df = output_df[final_column_order]

# Save to output CSV (utf-8-sig writes a BOM at the start of the file)
output_df.to_csv("07-chunked for semantic.csv", index=False, encoding='utf-8-sig')

print(f"✅ Done! {len(output_df)} rows generated.")
print("Columns in output:", list(output_df.columns))

✅ Done! 2170 rows generated.
Columns in output: ['poem_id', 'chunk_id', 'Title_cleaned', 'poem_lines_subset', 'Full_poem', 'Poem_line_raw', 'Title_raw', 'summary', 'قافية', 'البحر', 'وصل', 'حركة', 'نوع', 'شخص', 'sentiments', 'أماكن', 'أحداث', 'مواضيع', 'تصنيف']
