In [None]:
import pandas as pd
import json
from collections import Counter

# --- 1. CONFIGURATION: YOUR DICTIONARIES ---
# Topic label -> keywords. calculate_priority() adds one point to a topic that
# is already tagged on a poem for every keyword here that occurs in the poem
# text. Matching is a plain substring test, so short keywords (e.g. "هم", "بر")
# also match inside longer words.
# Note: "مجد" is listed under both "القيادة والزعماء" and "المجد والعز", so it
# can score for both topics.
topic_keywords = {
    "الحب والغزل": ["حب", "غرام", "عشق", "يهواك", "أحبك", "مغرم", "ولهان", "خل", "حبيبي", "الغلا"],
    "الشوق والحنين": ["شوق", "حنين", "اشتياق", "مشتاق", "وينك", "غياب", "فقدتك", "يا طير", "ابطيت"],
    "الفراق والهجر": ["فراق", "هجر", "وداع", "رحيل", "صد", "جفى", "فرقا", "توديع", "غيبتك"],
    "الألم والحزن": ["حزن", "ألم", "جرح", "دمع", "بكى", "هم", "ضيق", "وجع", "مصاب", "عذاب"],
    "الوفاء والصبر": ["وفاء", "صبر", "عهد", "وعد", "صابر", "مخلص", "وافي", "تضحية"],
    "القيادة والزعماء": ["شيخ", "حاكم", "زايد", "محمد", "بوراشد", "بوخالد", "فزاع", "مجد", "شيوخ", "قائد"],
    "المجد والعز": ["عز", "مجد", "فخر", "ناموس", "طناخة", "هيبة", "شموخ", "عالي", "قمة"],
    "الشعر والإبداع": ["شعر", "قصيد", "بيوت", "قوافي", "أبيات", "لحن", "وزن", "معاني", "بوح"],
    "حب الوطن": ["وطن", "إمارات", "دبي", "بلادي", "داري", "أرض", "علم", "اتحاد"],
    "الطبيعة والخيل": ["خيل", "مهر", "صحراء", "بر", "مطر", "غيم", "بحر", "طير", "صيد", "مقناص"],
    "الإيمان والدعاء": ["الله", "رب", "دعاء", "صلاة", "دين", "مؤمن", "حمد", "شكر", "استغفر"],
    "التراث النبطي": ["هجن", "ذلول", "ناقة", "بدو", "شداد", "مركاض", "عزبة", "مقياظ"],
    "الغيرة والعتاب": ["عتاب", "ليش", "ليه", "تلومني", "زعل", "خطا", "مسامح", "غلطان"],
    "الذكريات": ["ذكرى", "أيام", "زمان", "ماضي", "تذكرت", "سنين", "طفولة"],
    "الفخر والشجاعة": ["شجاعة", "إقدام", "سيف", "خوي", "رفيق", "نشاما", "كفو"]
}

# Mood label -> keywords. Passed to calculate_priority() by run_final_polish()
# to re-rank the mood tags of each poem (at most two are kept there).
# Keywords are matched as substrings of the poem text.
sentiment_keywords = {
    "حب": ["حب", "أحبك", "غلا", "عشق"],
    "شوق": ["شوق", "وله", "مشتاق", "حنين"],
    "حزن": ["حزن", "دمع", "بكاء", "ضيق"],
    "فخر": ["فخر", "عز", "مجد", "كفو"],
    "غضب": ["غضب", "زعل", "غيظ", "لوم"],
    "خوف": ["خوف", "رعب", "وجل"],
    "أمل": ["أمل", "تفاؤل", "بكره", "مستقبل"],
    "ندم": ["ندم", "ليت", "حسافة", "توبة"],
    "صبر": ["صبر", "تحمل", "جلد"],
    "فرح": ["فرح", "سعادة", "سرور", "عيد"],
    "حكمة": ["حكمة", "نصيحة", "تجارب", "عقل"]
}

# Place-category label -> keywords. Passed to calculate_priority() by
# run_final_polish() to re-rank the place tags of each poem (at most two are
# kept there). Keywords are matched as substrings of the poem text.
# Note: "زعبيل" appears under both "أراضي_إماراتية" and "قصر زعبيل", so a text
# containing it scores for both categories when both are tagged.
place_keywords = {
    "أراضي_إماراتية": ["دبي", "أبوظبي", "الشارقة", "عجمان", "أم القيوين", "رأس الخيمة", "الفجيرة", "العين", "جميرا", "زعبيل", "مرموم", "ليوا"],
    "معالم_إماراتية": ["برج خليفة", "متحف", "قصر", "حصن", "مطار"],
    "مواقع_دينية": ["مكة", "مدينة", "طيبة", "مسجد", "حرم", "كعبة", "عرفات"],
    "مدن_وأماكن_عربية": ["رياض", "كويت", "بحرين", "عمان", "قطر", "قاهرة", "بغداد", "شام", "بيروت"],
    "طبيعة_عامة": ["بحر", "جبل", "وادي", "نهر", "غابة", "صحراء", "روضة"],
    "مواقع_نبطية": ["عد", "غدير", "رجم", "بيداء", "مراح", "مقيظ"],
    "أماكن_مجردة": ["خيال", "حلم", "ذاكرة", "قلب", "عين", "بال"],
    "قصر زعبيل": ["زعبيل"]
}

# --- 2. HELPERS ---
def find_col(df, candidates):
    """Return the first column of ``df`` whose name matches a candidate, ignoring case.

    Candidates are tried in the order given. The name is returned exactly as
    it is spelled in ``df.columns``; ``None`` is returned when nothing matches.
    """
    # Lower-cased name -> real name; setdefault keeps the left-most column
    # when two columns differ only by case.
    by_lower = {}
    for actual in df.columns:
        by_lower.setdefault(actual.lower(), actual)

    for wanted in candidates:
        key = wanted.lower()
        if key in by_lower:
            return by_lower[key]
    return None

def parse_list(x):
    """Convert a CSV cell holding a stringified list into a Python list.

    Accepts JSON lists (``'["a", "b"]'``) and single-quoted lists
    (``"['a', 'b']"``). The raw text is tried as JSON first so that valid
    JSON containing an apostrophe is not damaged by the quote replacement.

    Args:
        x: The cell value (string, NaN/None, or an already-parsed list).

    Returns:
        The parsed list. An empty list is returned for missing values, blank
        cells, unparseable text, and text that parses to something other than
        a list.
    """
    if isinstance(x, list):
        return x
    if x is None:
        return []
    try:
        if pd.isna(x):
            return []
    except (TypeError, ValueError):
        # pd.isna() on an array-like has no single truth value; such input is
        # not a list cell we know how to read.
        return []

    s = str(x).strip()
    if s in ('', '[]'):
        return []

    # Second candidate handles Python-repr style lists with single quotes.
    for candidate in (s, s.replace("'", '"')):
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        return parsed if isinstance(parsed, list) else []
    return []

def calculate_priority(text, existing_tags, keyword_map, max_count=3):
    """Rank a poem's existing tags by keyword evidence and keep the top ones.

    Every tag in ``existing_tags`` starts with 2 points per occurrence. A tag
    that is also a key of ``keyword_map`` gains 1 more point for each of its
    keywords found (as a substring) in ``text``. Tags are returned by
    descending score, ties keeping first-seen order, cut to ``max_count``.

    Returns ``[]`` when ``existing_tags`` is not a non-empty list, and the
    first ``max_count`` tags unchanged when ``text`` is empty or missing.
    """
    has_tags = isinstance(existing_tags, list) and bool(existing_tags)
    if not has_tags:
        return []
    if not text or pd.isna(text):
        return existing_tags[:max_count]

    tally = Counter()

    # Base points: the tag was already assigned upstream.
    for label in existing_tags:
        tally[label] += 2

    # Density points: one per keyword of that tag present in the text.
    flat_text = str(text).replace('\n', ' ')
    for label, vocabulary in keyword_map.items():
        if label not in existing_tags:
            continue
        tally[label] += sum(1 for term in vocabulary if term in flat_text)

    ranked = [label for label, points in tally.most_common() if points > 0]
    return ranked[:max_count]

# --- 3. MAIN LOGIC ---
def run_final_polish(file_path, output_file='Poetry_App_Database_FINAL.csv'):
    """Re-rank the tag columns of a poetry CSV and save the result.

    Reads ``file_path``, locates the text / topic / mood / place columns by
    name (case-insensitive), and rewrites each tag column found with the
    output of ``calculate_priority`` serialised as a JSON list (topics keep
    up to 3 tags, moods and places up to 2).

    Args:
        file_path: Path of the CSV to read.
        output_file: Path of the CSV to write.

    Returns:
        None. Prints an error and returns early, without writing anything,
        when no text column can be found.
    """
    print(f"Reading {file_path}...")
    df = pd.read_csv(file_path)

    # Auto-detect Columns
    text_col = find_col(df, ['full_text', 'full_poem_text', 'Poem_line_cleaned', 'poem_line_cleaned'])
    topic_col = find_col(df, ['topics', 'mowadee'])
    mood_col = find_col(df, ['mood', 'sentiments', 'sentiment'])
    place_col = find_col(df, ['places', 'place_types', 'amaken'])

    if not text_col:
        print(f"❌ Error: Could not find Text column. Available: {list(df.columns)}")
        return

    print(f"✅ Found Columns: Text='{text_col}', Topics='{topic_col}', Mood='{mood_col}', Places='{place_col}'")
    print("Running Smart Priority Analysis...")

    # (column to rewrite, keyword dictionary, maximum tags to keep)
    targets = [
        (topic_col, topic_keywords, 3),
        (mood_col, sentiment_keywords, 2),
        (place_col, place_keywords, 2),
    ]
    for col, keyword_map, limit in targets:
        if not col:
            continue
        # The raw text value is passed on (not str(value)) so that a missing
        # text reaches calculate_priority() as NaN and triggers its
        # missing-text guard instead of being scanned as the string "nan".
        df[col] = [
            json.dumps(
                calculate_priority(text, parse_list(raw_tags), keyword_map, limit),
                ensure_ascii=False,
            )
            for text, raw_tags in zip(df[text_col], df[col])
        ]

    df.to_csv(output_file, index=False)

    print("\n" + "="*60)
    print("✨ DONE. Database is now Optimized & Clean.")
    print("="*60)
    print(f"Saved to: {output_file}")

# Run the priority re-ranking on the database CSV (path is relative to the
# working directory). The function prints an error and stops early if the CSV
# has no recognised text column.
run_final_polish('Poetry_App_Database.csv')

Reading Poetry_App_Database.csv...
❌ Error: Could not find Text column. Available: ['poem_id', 'entities', 'places', 'topics', 'religion', 'mood', 'category']


In [12]:
import pandas as pd
import json
import ast

# --- 1. APPROVED SCHEMA (The Filter) ---
# Whitelists used by clean_generic(): a tag survives cleaning only if, after
# stripping whitespace, it is exactly equal to one of these labels.

# Topic labels accepted in the 'topics' column.
allowed_topics = [
    "الحب والغزل", "الألم والحزن", "الوفاء والصبر", "الشوق والحنين", 
    "القيادة والزعماء", "المجد والعز", "الشعر والإبداع", "الطموح والنجاح", 
    "الفراق والهجر", "الجمال والعيون", "حب الوطن", "الغيرة والعتاب", 
    "الطبيعة والخيل", "الإيمان والدعاء", "التراث النبطي", "الزمن والقدر", 
    "الصداقة والكرم", "الفخر والشجاعة", "الإمارات ودبي", "الذكريات"
]

# Place-category labels accepted in the 'places' column.
allowed_places = [
    "أراضي_إماراتية", "أماكن_مجردة", "طبيعة_عامة", "مواقع_نبطية", 
    "مدن_وأماكن_عربية", "مواقع_اجتماعية", "مواقع_دينية", "معالم_إماراتية", 
    "قصر زعبيل"
]

# Mood labels accepted in the 'mood' column.
allowed_moods = [
    "حب", "فخر", "حزن", "شوق", "حكمة", "أمل", "غضب", 
    "صبر", "فرح", "حنين", "خوف", "ندم", "تأمل", "غيرة"
]

# --- 2. SYNONYM MAPPING (Consolidating Duplicates) ---
# Variant entity name -> canonical entity name. clean_entities() looks each
# stripped entity up here and falls back to the entity itself when it has no
# entry, so several spellings collapse into one value before de-duplication.
entity_map = {
    "حبيبتي": "الحبيب",
    "الحبيبة": "الحبيب",
    "المحبوبة": "الحبيب",
    "المحبوب": "الحبيب",
    "المتحدث": "الذات",
    "الشاعر": "الذات",
    "self": "الذات"
}

# --- 3. PARSING LOGIC ---
def safe_parse(cell):
    """Safely convert a stringified list / JSON cell into a Python list.

    Parsing is attempted in two steps: JSON (after turning single quotes into
    double quotes and removing one pair of wrapping double quotes), then
    ``ast.literal_eval`` on the untouched text.

    Args:
        cell: The cell value (string, NaN/None, or an already-parsed
            list/tuple).

    Returns:
        Always a list. Missing, blank or unparseable cells give ``[]``, and so
        does text that parses to something that is not a list or tuple (a
        number, a bare string, a dict), so callers can iterate the result
        without type checks.
    """
    if isinstance(cell, (list, tuple)):
        return list(cell)
    try:
        if pd.isna(cell):
            return []
    except (TypeError, ValueError):
        # Array-like input has no single missing/not-missing answer; fall
        # through and try to parse its string form.
        pass

    raw = str(cell)
    if raw.strip() == '':
        return []

    try:
        # Handle double encoding if present
        s = raw.replace("'", '"')
        if s.startswith('"') and s.endswith('"'):
            s = s[1:-1]
        parsed = json.loads(s)
    except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    return []

# --- 4. CLEANING FUNCTIONS ---
def clean_entities(cell):
    """Parse an entities cell, map synonyms to their canonical name, and de-duplicate.

    Each parsed item is converted to a stripped string and replaced by its
    ``entity_map`` entry when one exists. The first occurrence of every
    resulting value is kept, in original order.
    """
    stripped = (str(item).strip() for item in safe_parse(cell))
    canonical = [entity_map.get(name, name) for name in stripped]
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(canonical))

def clean_generic(cell, allowed_list, max_items=4):
    """Keep only approved tags from a cell, de-duplicated and capped.

    Items are parsed from ``cell``, converted to stripped strings, and dropped
    unless present in ``allowed_list``. The first ``max_items`` distinct
    survivors are returned in their original order.
    """
    stripped = (str(item).strip() for item in safe_parse(cell))
    approved = dict.fromkeys(label for label in stripped if label in allowed_list)
    return list(approved)[:max_items]

# --- 5. MAIN EXECUTION ---
def run_column_cleaner(file_path, output_file='Poetry_App_Database_FINAL_CLEAN.csv'):
    """Validate the tag columns of a poetry CSV against the schema and save it.

    Entities are synonym-merged and de-duplicated. Places, topics and moods
    are filtered against their allowed lists and capped (2, 3 and 3 items).
    All list columns, including 'religion' and 'category' when present, are
    written as JSON strings.

    Args:
        file_path: Path of the CSV to read.
        output_file: Path of the CSV to write.

    Returns:
        None. Prints an error and returns early, without writing anything,
        when one of the required columns is missing.
    """
    def _decode_list_cell(cell):
        # Turn a cell that textually looks like a list into a real list so it
        # is re-serialised as JSON. Anything else (NaN, plain text, a list
        # that fails to parse) is returned untouched to avoid losing data.
        if not isinstance(cell, str):
            return cell
        text = cell.strip()
        if not (text.startswith('[') and text.endswith(']')):
            return cell
        parsed = safe_parse(text)
        if isinstance(parsed, list) and (parsed or text[1:-1].strip() == ''):
            return parsed
        return cell

    print(f"Reading {file_path}...")
    df = pd.read_csv(file_path)

    # Verify columns exist
    required_cols = ['entities', 'places', 'topics', 'mood']
    for col in required_cols:
        if col not in df.columns:
            print(f"❌ Error: Column '{col}' not found. Available: {list(df.columns)}")
            return

    print("Cleaning Columns based on Schema...")

    # 1. Clean Entities (Merge synonyms)
    df['entities'] = df['entities'].apply(clean_entities)

    # 2. Clean Places (Filter & Limit)
    df['places'] = df['places'].apply(lambda x: clean_generic(x, allowed_places, 2))

    # 3. Clean Topics (Filter & Limit)
    df['topics'] = df['topics'].apply(lambda x: clean_generic(x, allowed_topics, 3))

    # 4. Clean Mood (Filter & Limit)
    df['mood'] = df['mood'].apply(lambda x: clean_generic(x, allowed_moods, 3))

    # 5. 'religion' / 'category' are not filtered, but they are decoded so the
    #    JSON formatting below also applies to them; otherwise they would stay
    #    in whatever textual form the input used (e.g. single-quoted lists).
    for col in ('religion', 'category'):
        if col in df.columns:
            df[col] = df[col].apply(_decode_list_cell)

    # Formatting as JSON strings for CSV storage
    for col in ['entities', 'places', 'topics', 'mood', 'religion', 'category']:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else x)

    df.to_csv(output_file, index=False)

    print("\n" + "="*50)
    print("✅ DONE. Columns Cleaned & Validated.")
    print("="*50)
    print(f"Saved to: {output_file}")
    # A CSV with a header but no rows has no first row to show.
    if not df.empty:
        print("\nSample Row 1:")
        print(df.iloc[0])

# Run the schema cleaner on the database CSV (path is relative to the working
# directory). The function returns early with an error message if any of the
# 'entities', 'places', 'topics' or 'mood' columns is missing.
run_column_cleaner('Poetry_App_Database.csv')

Reading Poetry_App_Database.csv...
Cleaning Columns based on Schema...

✅ DONE. Columns Cleaned & Validated.
Saved to: Poetry_App_Database_FINAL_CLEAN.csv

Sample Row 1:
poem_id                                                     1
entities                                           ["الحبيب"]
places                                                     []
topics      ["الحب والغزل", "الغيرة والعتاب", "التراث النب...
religion                                                   []
mood                                    ["غضب", "فخر", "حزن"]
category                                            ['معاصر']
Name: 0, dtype: object


In [5]:
import pandas as pd
import re

# Arabic letters that can connect to the letter following them; a tatweel may
# be inserted directly after one of these.
CONNECTING_LETTERS = set('بتثجحخسشصضطظعغفقكلمنهي')
# Arabic letters that cannot connect to the letter following them:
# alef, dal, thal, ra, zay and waw.
NON_CONNECTING_LETTERS = set('ادذرزو')

def format_poem_for_visual_balance(poem_text):
    """Pad the two halves of every verse with tatweel so columns line up.

    A verse line is one that contains the five-space separator; it is split
    at the first separator into two stripped halves. Every first half is
    lengthened to the longest first half in the poem, and likewise for second
    halves, using ``distribute_tatweel_per_word``. Lengths are measured with
    ``len`` (code points).

    Lines without the separator (titles, blank lines, single-half lines) are
    kept unchanged in their original position rather than being dropped, so
    no poem content is lost.

    Args:
        poem_text: The poem as one string, lines separated by ``'\\n'``.

    Returns:
        The reformatted poem, halves re-joined with five spaces.
    """
    separator = ' ' * 5

    # Each entry is either a (first_half, second_half) tuple for a verse line
    # or the raw line string for anything else.
    rows = []
    for line in poem_text.strip().split('\n'):
        if separator in line:
            first, second = line.split(separator, 1)
            rows.append((first.strip(), second.strip()))
        else:
            rows.append(line)

    verses = [row for row in rows if isinstance(row, tuple)]
    # Target widths are computed per column, not across both columns.
    max_left = max((len(first) for first, _ in verses), default=0)
    max_right = max((len(second) for _, second in verses), default=0)

    result_lines = []
    for row in rows:
        if isinstance(row, tuple):
            first, second = row
            left_balanced = distribute_tatweel_per_word(first, max_left)
            right_balanced = distribute_tatweel_per_word(second, max_right)
            result_lines.append(f"{left_balanced}     {right_balanced}")
        else:
            result_lines.append(row)

    return '\n'.join(result_lines)

def distribute_tatweel_per_word(text, target_length):
    """Lengthen ``text`` to ``target_length`` characters by inserting tatweel.

    One tatweel is added per step, always to the currently shortest word
    (the left-most one on ties). It goes at that word's first connection
    point, i.e. right after a letter from ``CONNECTING_LETTERS`` that is
    followed by a character in the range U+0627..U+064A. When no word has a
    connection point, the tatweel is appended to the end of the shortest
    word. Text that is already long enough, or that contains no words, is
    returned unchanged.
    """
    tokens = re.findall(r'\S+|\s+', text)  # alternating words and whitespace runs
    word_slots = [slot for slot, token in enumerate(tokens) if not token.isspace()]

    # Word slot -> ascending offsets at which a tatweel may be inserted.
    joins = {}
    for slot in word_slots:
        token = tokens[slot]
        joins[slot] = [
            offset + 1
            for offset, (lead, follow) in enumerate(zip(token, token[1:]))
            if lead in CONNECTING_LETTERS and '\u0627' <= follow <= '\u064a'
        ]

    def token_length(slot):
        return len(tokens[slot])

    stretched = text
    while len(stretched) < target_length:
        stretchable = [slot for slot in word_slots if joins[slot]]
        if stretchable:
            chosen = min(stretchable, key=token_length)
            at = joins[chosen][0]
            token = tokens[chosen]
            tokens[chosen] = token[:at] + 'ـ' + token[at:]
            # Offsets at or after the insertion move one place to the right.
            joins[chosen] = [p + 1 if p >= at else p for p in joins[chosen]]
        elif word_slots:
            # No usable connection point anywhere: pad the shortest word's end.
            chosen = min(word_slots, key=token_length)
            tokens[chosen] += 'ـ'
        else:
            break
        stretched = ''.join(tokens)

    return stretched

# Load the poems.
df = pd.read_csv('Poetry.csv')

# Reformat the 'Poem_line_raw' column in place; missing values are left as-is.
for row_pos, poem_text in enumerate(df['Poem_line_raw']):
    if pd.isna(poem_text):
        continue
    balanced_text = format_poem_for_visual_balance(poem_text)
    raw_col_pos = df.columns.get_loc('Poem_line_raw')
    df.iloc[row_pos, raw_col_pos] = balanced_text

# Write the result to a new CSV without the index column.
df.to_csv('output_file.csv', index=False)