In [12]:
import pandas as pd
import re
import json

def parse_entity_field(field_str):
    """Parse a raw person-entity cell into a list of record dicts.

    The cell is expected to contain bracketed records separated by commas,
    e.g. ``[name:A, relation:B, resolved_from:x, y], [name:C, ...]``,
    optionally preceded by the column label ``شخص``.

    Args:
        field_str: Raw cell value. Anything that is not a non-blank string
            (NaN, None, numbers, lists, arrays, ...) yields an empty list.

    Returns:
        list[dict]: One dict per record with keys ``"name"``, ``"relation"``
        and ``"resolved_from"`` (a list of stripped strings). Records that
        lack any of the three fields are skipped.
    """
    # Check the type first: pd.isna() on a list/array returns an array whose
    # truth value is ambiguous, and no string is ever NA, so the isinstance
    # test alone covers NaN/None as well.
    if not isinstance(field_str, str) or not field_str.strip():
        return []

    # Drop a leading column label (and the whitespace/newline after it).
    clean = re.sub(r'^شخص\s*\n?', '', field_str.strip())
    if not clean:
        return []

    records = []
    # Records are delimited by "], [" (whitespace around the comma is optional).
    for seg in re.split(r'\]\s*,\s*\[', clean):
        seg = seg.strip('[] \n')
        if not seg:
            continue
        name = re.search(r'name:([^,]+)', seg)
        rel = re.search(r'relation:([^,]+)', seg)
        # resolved_from is the last field, so it takes the rest of the line
        # and may itself contain commas.
        res = re.search(r'resolved_from:(.+)', seg)
        if name and rel and res:
            resolved_list = [x.strip() for x in res.group(1).split(',') if x.strip()]
            records.append({
                "name": name.group(1).strip(),
                "relation": rel.group(1).strip(),
                "resolved_from": resolved_list
            })
    return records

def parse_places_field(field_str):
    """Parse a raw places cell into a list of ``{"name", "type"}`` dicts.

    The text is split on the literal ``name:``; each resulting piece is then
    split once on ``type:``. Pieces without a ``type:`` marker are ignored.

    Args:
        field_str: Raw cell value. Anything that is not a non-blank string
            (NaN, None, numbers, lists, arrays, ...) yields an empty list.

    Returns:
        list[dict]: One dict per place with keys ``"name"`` and ``"type"``.
    """
    # Check the type first: pd.isna() on a list/array returns an array whose
    # truth value is ambiguous, and no string is ever NA.
    if not isinstance(field_str, str) or not field_str.strip():
        return []

    records = []
    for part in re.split(r'name:', field_str):
        if not part.strip():
            continue
        name_type = part.split('type:', 1)
        if len(name_type) == 2:
            # NOTE(review): only whitespace is stripped here, so brackets or
            # commas that delimit records in the source text would stay in
            # the values - confirm against the actual column format.
            records.append({
                "name": name_type[0].strip(),
                "type": name_type[1].strip()
            })
    return records

def parse_topics_field(field_str):
    """Split a comma-separated topics cell into a list of stripped topics.

    Args:
        field_str: Raw cell value. Anything that is not a non-blank string
            (NaN, None, numbers, lists, arrays, ...) yields an empty list.

    Returns:
        list[str]: Non-empty topics in their original order (a list so the
        result can be JSON-serialised directly).
    """
    # Check the type first: pd.isna() on a list/array returns an array whose
    # truth value is ambiguous, and no string is ever NA.
    if not isinstance(field_str, str) or not field_str.strip():
        return []
    # NOTE(review): only the ASCII comma is treated as a separator; confirm
    # the data never uses the Arabic comma (U+060C) between topics.
    return [topic.strip() for topic in field_str.split(',') if topic.strip()]

# Load CSV
df = pd.read_csv("06 - FAZ3_POEMS_Exact_Search - Exact_search.csv")

# Parse the three raw annotation columns. Each parsed column stores a JSON
# string (never null: json.dumps always returns a str, so no separate
# null-replacement pass is needed for these columns).
for raw_col, parsed_col, parser in (
    ('شخص', 'parsed_entities', parse_entity_field),
    ('أماكن', 'parsed_places', parse_places_field),
    ('مواضيع', 'parsed_topics', parse_topics_field),
):
    df[parsed_col] = df[raw_col].apply(
        # Bind the parser as a default so each lambda keeps its own function.
        lambda cell, parser=parser: json.dumps(parser(cell), ensure_ascii=False)
    )

# Fill all object columns with '[]' for nulls
# NOTE(review): this also rewrites missing values in free-text columns
# to the literal '[]'; confirm that is intended for non-list columns.
object_cols = df.select_dtypes(include=['object']).columns
df[object_cols] = df[object_cols].fillna('[]')

# Save
df.to_csv("06 - Exact_search.csv", index=False, encoding='utf-8')

In [15]:
import pandas as pd
import json
from ast import literal_eval

def _chunk_start_indices(n_lines, chunk_size=4, step=3):
    """Return the sorted start offsets of every chunk for a poem of ``n_lines`` lines."""
    if n_lines < chunk_size:
        # Short poem: a single chunk holding whatever lines exist.
        return [0]
    starts = set(range(0, n_lines - chunk_size + 1, step))
    # Always include the window that ends on the poem's last line.
    starts.add(n_lines - chunk_size)
    return sorted(starts)


def _as_text_list(series):
    """Return the series as a list of strings, mapping missing values to ''."""
    return ['' if pd.isna(value) else str(value) for value in series.tolist()]


def _cell_items(val):
    """Split one cell of a list column into its individual items.

    A cell holding a JSON (or Python-literal) list contributes its elements,
    a JSON object contributes itself as a single record, and anything else
    is kept verbatim as a single string item.
    """
    if not isinstance(val, str):
        return [str(val)]
    text = val.strip()
    parsed = None
    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # Not JSON: fall back to a Python literal such as "['a', 'b']".
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, dict):
        return [parsed]
    # Scalars and unparseable text are treated as one opaque string.
    return [val]


def _item_key_and_value(item):
    """Return ``(dedup_key, output_value)`` for one aggregated item."""
    if isinstance(item, str):
        # Strings stay strings: "123" must not turn into the number 123.
        return item, item
    if isinstance(item, dict):
        try:
            # Sorted keys make dicts with the same content compare equal.
            key = json.dumps(item, sort_keys=True, ensure_ascii=False)
            return key, json.loads(key)
        except (TypeError, ValueError):
            pass  # not JSON-serialisable: fall through to the str() form
    elif item is None or isinstance(item, (bool, int, float)):
        return str(item), item
    text = str(item)
    return text, text


def _aggregate_unique(values):
    """Merge the items of several cells into one de-duplicated list sorted by key."""
    unique = {}
    for val in values:
        for item in _cell_items(val):
            key, value = _item_key_and_value(item)
            unique.setdefault(key, value)
    return [unique[key] for key in sorted(unique)]


def chunk_semantic_search_dataset(input_csv, output_csv, chunk_size=4, overlap=1):
    """
    Chunk a line-per-row poem CSV into overlapping multi-line chunks and write them to CSV.

    With the defaults:
    - 4 lines per chunk
    - 1-line overlap between chunks (step=3)
    - Final chunk always contains last 4 lines
    - Strict poem boundaries (no cross-poem chunks)
    - Poems shorter than a chunk produce one 'short_poem' chunk
    - Poem-level columns take the value of the poem's first row
    - List columns are aggregated per chunk without duplicates and stored as JSON strings
    - Any other original column takes its first non-null value in the chunk ('' if none)

    Args:
        input_csv: Path of the CSV to read (utf-8-sig). Must contain 'poem_id', 'Row_ID',
            'Poem_line_cleaned', 'Poem_line_raw' and 'summary'.
        output_csv: Path of the CSV to write (utf-8-sig).
        chunk_size: Number of lines per chunk (default 4).
        overlap: Number of lines shared by consecutive regular chunks (default 1).

    Returns:
        pandas.DataFrame: One row per chunk, identical to what is written to ``output_csv``.

    Raises:
        ValueError: If ``chunk_size`` < 1 or ``overlap`` is not in ``[0, chunk_size)``.
    """
    if chunk_size < 1 or not 0 <= overlap < chunk_size:
        raise ValueError("chunk_size must be >= 1 and overlap must satisfy 0 <= overlap < chunk_size")
    step = chunk_size - overlap

    df = pd.read_csv(input_csv, encoding='utf-8-sig')
    df = df.sort_values(['poem_id', 'Row_ID']).reset_index(drop=True)

    # Identify poem-level columns (assuming same value across poem)
    poem_level_cols = ['Title_raw', 'Title_cleaned', 'قافية', 'روي', 'البحر', 'وصل', 'حركة', 'شخص', 'تصنيف', 'مواضيع']

    # Per-line aggregatable list columns (string reps of lists or JSON)
    list_cols = ['sentiments', 'parsed_places', 'parsed_topics', 'دين', 'أحداث', 'أماكن', 'parsed_entities']

    # Column bookkeeping does not depend on the chunk, so compute it once.
    present_list_cols = [col for col in list_cols if col in df.columns]
    handled_cols = set(
        ['poem_id', 'Row_ID', 'Poem_line_raw', 'Poem_line_cleaned', 'summary']
        + poem_level_cols + list_cols
    )
    passthrough_cols = [col for col in df.columns if col not in handled_cols]

    output_rows = []
    chunk_id_counter = 1

    for poem_id, group in df.groupby('poem_id'):
        # Missing text cells become '' so the joins below cannot fail on NaN.
        lines = _as_text_list(group['Poem_line_cleaned'])
        raw_lines = _as_text_list(group['Poem_line_raw'])
        summaries = _as_text_list(group['summary'])
        row_ids = group['Row_ID'].tolist()
        full_poem = '\n'.join(lines)

        # Poem-level values
        poem_values = {col: group[col].iloc[0] if col in group.columns else '' for col in poem_level_cols}

        n_lines = len(lines)

        for start_idx in _chunk_start_indices(n_lines, chunk_size, step):
            end_idx = min(start_idx + chunk_size, n_lines)
            chunk_group = group.iloc[start_idx:end_idx]

            # Aggregations
            chunk_text = '\n'.join(lines[start_idx:end_idx])
            chunk_raw_text = '\n'.join(raw_lines[start_idx:end_idx])
            summary_chunked = ' '.join(summaries[start_idx:end_idx]) + "---" + chunk_text
            row_id_str = ','.join(map(str, row_ids[start_idx:end_idx]))
            if n_lines < chunk_size:
                chunk_type = 'short_poem'
            elif start_idx == n_lines - chunk_size:
                chunk_type = 'final'
            else:
                chunk_type = 'regular'

            # Aggregate list columns without duplicates; stored as JSON
            # strings with readable (non-escaped) Arabic.
            agg_lists = {
                col: json.dumps(_aggregate_unique(chunk_group[col].dropna()), ensure_ascii=False)
                for col in present_list_cols
            }

            # Build chunk row (key order defines the output column order)
            chunk_row = {
                'poem_id': poem_id,
                'chunk_id': chunk_id_counter,
                'Title_cleaned': poem_values['Title_cleaned'],
                'poem_lines_subset': chunk_text,
                'Summary_chunked': summary_chunked,
                'Title_raw': poem_values['Title_raw'],
                'Poem_line_raw': chunk_raw_text,
                'Full_poem': full_poem,
                'Row_IDs_in_chunk': row_id_str,
                'chunk_type': chunk_type,
                **poem_values,  # Include all poem-level
                **agg_lists,    # Include aggregated lists
            }

            # Add ALL remaining original columns - first non-null value in
            # the chunk, so a leading NaN does not hide later values.
            for col in passthrough_cols:
                non_null = chunk_group[col].dropna()
                chunk_row[col] = non_null.iloc[0] if len(non_null) > 0 else ''

            output_rows.append(chunk_row)
            chunk_id_counter += 1

    output_df = pd.DataFrame(output_rows)
    output_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Successfully created {len(output_df)} chunks from {len(df)} original lines")
    print(f"Output saved to: {output_csv}")
    return output_df

# Direct execution: chunk the CSV written by the parsing cell above
# ("06 - Exact_search.csv") and keep the returned DataFrame in `result`.
input_file = "06 - Exact_search.csv"  # Use your processed CSV file
output_file = "07-chunked for semantic.csv"

# Reads input_file, writes output_file, and prints a short summary.
result = chunk_semantic_search_dataset(input_file, output_file)

Successfully created 1417 chunks from 4213 original lines
Output saved to: 07-chunked for semantic.csv


In [3]:
import pandas as pd
import json

def merge_person_entities(escaped_json_str):
    """Merge person entities by name/relation, combining resolved_from lists.

    Args:
        escaped_json_str: A JSON array of objects with 'name', 'relation' and
            'resolved_from' keys, either as plain JSON text or wrapped once
            more as a JSON string literal (outer quotes, escaped inner quotes).

    Returns:
        A JSON string (non-ASCII kept readable) with one object per distinct
        (name, relation) pair, in first-seen order, whose 'resolved_from' is
        the order-preserving union of the merged lists. The input is returned
        unchanged when it is not a non-empty string or cannot be parsed into
        the expected structure.
    """
    # Non-strings (NaN, None, lists, numbers) pass through untouched. Testing
    # the type first avoids pd.isna() returning an ambiguous array for
    # list-like input.
    if not isinstance(escaped_json_str, str) or escaped_json_str == '':
        return escaped_json_str

    try:
        json_str = escaped_json_str
        if escaped_json_str.startswith('"') and escaped_json_str.endswith('"'):
            # Decode the outer JSON string literal properly so every escape
            # (\" \\ \n \uXXXX) is handled, not just escaped quotes.
            try:
                decoded = json.loads(escaped_json_str)
            except ValueError:
                decoded = None
            if isinstance(decoded, str):
                json_str = decoded
            else:
                # Fallback: remove outer quotes and unescape inner quotes.
                json_str = escaped_json_str[1:-1].replace('\\"', '"')

        items = json.loads(json_str)
        merged_dict = {}

        for item in items:
            key = (item['name'], item['relation'])
            resolved_from = item['resolved_from']
            if key in merged_dict:
                # Combine resolved_from lists, avoiding duplicates while preserving order
                existing_resolved = merged_dict[key]
                for res in resolved_from:
                    if res not in existing_resolved:
                        existing_resolved.append(res)
            else:
                # Copy so the parsed input list is never mutated.
                merged_dict[key] = resolved_from[:]

        merged_list = [
            {'name': name, 'relation': relation, 'resolved_from': resolved}
            for (name, relation), resolved in merged_dict.items()
        ]
        return json.dumps(merged_list, ensure_ascii=False)
    except (ValueError, TypeError, KeyError, IndexError, AttributeError):
        # Best effort: malformed or unexpectedly shaped data is left as is.
        return escaped_json_str

# Load the CSV
df = pd.read_csv("07-chunked for semantic.csv", encoding='utf-8-sig')

# Apply merging to the 'شخص' column
# NOTE(review): merge_person_entities returns its input unchanged when the
# cell is not valid JSON. The chunking cell copies 'شخص' from each poem's
# first row and writes the JSON records to 'parsed_entities', so this step
# may be a no-op here - confirm which column is meant to be merged.
if 'شخص' in df.columns:
    df['شخص'] = df['شخص'].apply(merge_person_entities)

# Save the updated CSV
df.to_csv("07- 02 -chunked for semantic.csv", index=False, encoding='utf-8-sig')
print(f"Processed {len(df)} rows, merged person entities in 'شخص' column")

Processed 1417 rows, merged person entities in 'شخص' column
