## UDHR UN Data

In [17]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

def get_ns(tag):
    """Return the namespace URI embedded in an ElementTree tag name.

    ElementTree writes namespaced tags as ``{uri}local``. This returns the
    ``uri`` part, or an empty string when the tag carries no namespace.

    Args:
        tag: An element tag as found on ``Element.tag``.

    Returns:
        The namespace URI, or ``""`` if the tag is empty, not a string, or
        does not start with ``{``.
    """
    # startswith() is safe on an empty string, whereas tag[0] raises IndexError;
    # the isinstance check covers non-string tags (e.g. comment nodes).
    if isinstance(tag, str) and tag.startswith("{"):
        return tag[1:].split("}")[0]
    return ""

def extract_articles(root):
    """Collect one record per <article> found under the <udhr> element.

    Args:
        root: Root ``Element`` of a parsed UDHR XML document. It may be the
            <udhr> element itself or an ancestor of it.

    Returns:
        A list of dicts with keys ``article_num`` (the article's ``number``
        attribute, as a string), ``article_text`` (all paragraph texts joined
        by single spaces), ``lang_display`` (the <udhr> ``n`` attribute) and
        ``lang_code`` (the <udhr> ``key`` attribute).

    Raises:
        ValueError: If no <udhr> element can be located.
        KeyError: If an <article> lacks a ``number`` attribute.
    """
    # The document namespace is taken from the root tag and reused for every lookup.
    namespace = get_ns(root.tag)
    prefix = "{" + namespace + "}" if namespace else ""

    def qualified(name):
        # Tag name in ElementTree's "{uri}local" form (or bare when un-namespaced).
        return f"{prefix}{name}"

    def flatten(node):
        # Concatenate all nested text of a node and trim surrounding whitespace.
        return "".join(node.itertext()).strip()

    udhr_tag = qualified("udhr")
    udhr = root if root.tag == udhr_tag else root.find(f".//{udhr_tag}")
    if udhr is None:
        raise ValueError("Could not find <udhr> element in XML tree.")

    # Language metadata lives on the <udhr> element and is repeated on every row.
    lang_display = udhr.attrib.get("n", "")
    iso_code = udhr.attrib.get("key", "")

    direct_para_path = qualified("para")
    list_para_path = f".//{prefix}orderedlist/{prefix}listitem/{prefix}para"

    rows = []
    for article in udhr.findall(f".//{qualified('article')}"):
        number = article.attrib["number"]

        # Direct <para> children come first, then paragraphs nested in list items.
        direct_texts = [flatten(node) for node in article.findall(direct_para_path)]
        listed_texts = [flatten(node) for node in article.findall(list_para_path)]

        rows.append({
            "article_num": number,
            "article_text": " ".join(direct_texts + listed_texts),
            "lang_display": lang_display,
            "lang_code": iso_code,
        })
    return rows


def build_udhr_df(lang_code):
    """Load one UDHR translation from disk into a DataFrame.

    Reads ``data/udhr_raw/udhr_{lang_code}.xml``, extracts its articles with
    ``extract_articles`` and converts the article numbers to integers.

    Args:
        lang_code: Language code used in the XML file name, e.g. ``"eng"``.

    Returns:
        A DataFrame with columns ``article_num`` (int), ``article_text``,
        ``lang_display`` and ``lang_code``. It is empty (but has these
        columns) when the file contains no articles.

    Raises:
        ValueError: If the XML file does not exist, or if any article number
            cannot be parsed as a number.
    """
    file_path = f"data/udhr_raw/udhr_{lang_code}.xml"
    if not os.path.exists(file_path):
        raise ValueError(f"File not found for language code: {lang_code} at {file_path}")

    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    root = ET.fromstring(data)
    articles = extract_articles(root)

    # Explicit columns keep the frame well-formed even when no article was found;
    # otherwise the 'article_num' lookup below would raise KeyError.
    df = pd.DataFrame(
        articles,
        columns=["article_num", "article_text", "lang_display", "lang_code"],
    )

    # Coerce first so the offending values can be reported, instead of letting
    # astype(int) fail with an opaque "cannot convert NaN to integer" error.
    article_nums = pd.to_numeric(df["article_num"], errors="coerce")
    unparsed = article_nums.isna()
    if unparsed.any():
        bad_values = df.loc[unparsed, "article_num"].tolist()
        raise ValueError(
            f"Non-numeric article numbers in {file_path}: {bad_values}"
        )
    df["article_num"] = article_nums.astype(int)
    return df

# Example usage: load the English and Simplified Mandarin declarations.
en_df, zh_df = (build_udhr_df(code) for code in ("eng", "cmn_hans"))

# Save the DataFrames to CSV files
# en_df.to_csv("data/udhr/udhr_eng.csv", index=False)
# zh_df.to_csv("data/udhr/udhr_cmn_hans.csv", index=False)

# Stack both languages, then order rows by language code and article number.
df = (
    pd.concat([en_df, zh_df], ignore_index=True)
    .sort_values(["lang_code", "article_num"])
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,article_num,article_text,lang_display,lang_code
0,1,"人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以兄弟关系的精神相对待。","Chinese, Mandarin (Simplified)",cmn_hans
1,2,"人人有资格享有本宣言所载的一切权利和自由,不分种族、肤色、性别、语言、宗教、政治或其他见解、...","Chinese, Mandarin (Simplified)",cmn_hans
2,3,人人有权享有生命、自由和人身安全。,"Chinese, Mandarin (Simplified)",cmn_hans
3,4,"任何人不得使为奴隶或奴役;一切形式的奴隶制度和奴隶买卖,均应予以禁止。","Chinese, Mandarin (Simplified)",cmn_hans
4,5,"任何人不得加以酷刑,或施以残忍的、不人道的或侮辱性的待遇或刑罚。","Chinese, Mandarin (Simplified)",cmn_hans


## Create GPT Paraphrases

In [None]:
# Switch for the OpenAI paraphrasing cells below. When False, those cells are
# skipped and the statistics cell loads the previously saved paraphrase CSV.
RUN_PARAPHRASE = False  # Set to True to run the paraphrasing step
# RUN_PARAPHRASE = True

In [None]:
if RUN_PARAPHRASE:

    # These imports stay inside the guard, so they are only executed when the
    # paraphrasing step is enabled.
    from dotenv import load_dotenv
    from openai import OpenAI
    import os

    # Load variables from a .env file (overriding any already set), then read the API key.
    load_dotenv(override=True)
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

    client = OpenAI(
        # base_url="https://openrouter.ai/api/v1",
        api_key=OPENAI_API_KEY,
    )

    system_prompt = "You are a careful legal writer."

    # Filled in with str.format(): {article_num} and {article_text} are the only
    # placeholders; doubled braces render as literal JSON braces.
    # The example JSON must itself be valid, since the model is told to follow it
    # exactly; the "para_1" key previously lacked its closing quote.
    scenario_prompt_template = """
    Your task is to produce paraphrases of articles from the Universal Declaration of Human Rights (UDHR).

    Article:
    {article_num}

    Article text:
    {article_text}

    Instructions:
    1. Produce 5 paraphrases of the article text.
    2. Keep the exact rights and obligations intact.
    3. Preserve legal meaning and intent.
    4. Vary structure and vocabulary.
    5. Output exactly ONE complete sentence per paraphrase (~20-30 words each).
    6. Do not include any additional text or explanations.
    7. Follow the output format EXACTLY.

    Please provide the paraphrases in the following JSON format:
    ```json
    {{
        "para_1": "paraphrase_1",
        "para_2": "paraphrase_2",
        "para_3": "paraphrase_3",
        "para_4": "paraphrase_4",
        "para_5": "paraphrase_5"
    }}
    ```
    """.strip()

In [None]:
if RUN_PARAPHRASE:

    import asyncio
    import pandas as pd
    import re
    import json
    from tqdm.asyncio import tqdm_asyncio

    # Upper bound on paraphrase requests in flight at once; it sizes the
    # asyncio.Semaphore created in main_async_paraphrase.
    CONCURRENCY_LIMIT = 5  # Adjust as needed

    def parse_paraphrase_json(json_str):
        """Extract the five paraphrases from a model reply.

        The reply may be bare JSON or JSON wrapped in a markdown code fence.
        The fence's language tag is optional and case-insensitive, so
        "```json", "```JSON" and a plain "```" are all accepted.

        Args:
            json_str: Raw text returned by the model.

        Returns:
            A list of exactly five items, taken from keys ``para_1`` to
            ``para_5``. A missing key yields ``""``. If the reply cannot be
            parsed as a JSON object, the error is printed and five empty
            strings are returned (best-effort; this never raises).
        """
        try:
            # Unwrap a markdown code fence if present; otherwise parse the text as-is.
            fenced = re.search(
                r'```(?:json)?\s*(.*?)\s*```', json_str, re.DOTALL | re.IGNORECASE
            )
            if fenced:
                json_str = fenced.group(1)

            data = json.loads(json_str)
            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")

            paraphrases = []
            for i in range(1, 6):
                key = f"para_{i}"
                # Tolerate a malformed first key spelled "para_1:" (with a
                # trailing colon), which replies have been seen to contain.
                if key not in data and i == 1:
                    key = "para_1:"
                paraphrases.append(data.get(key, ""))

            return paraphrases
        except Exception as e:
            print(f"Error parsing JSON: {e}")
            return ["", "", "", "", ""]

    async def async_generate_paraphrases(article_num, article_text):
        """Request paraphrases for one article without blocking the event loop.

        The OpenAI client call is synchronous, so it is run in the event
        loop's default executor and awaited from there.

        Args:
            article_num: Article number inserted into the prompt.
            article_text: Article text inserted into the prompt.

        Returns:
            The text content of the first completion choice.
        """
        prompt = scenario_prompt_template.format(
            article_num=article_num,
            article_text=article_text
        )

        # Inside a coroutine, get_running_loop() is the supported way to reach the
        # active loop; get_event_loop() is a legacy accessor.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,  # None selects the loop's default executor
            lambda: client.chat.completions.create(
                model="gpt-4o-2024-11-20",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=1000,
                seed=42,
            )
        )
        return response.choices[0].message.content

    async def process_row(row, semaphore):
        """Produce the long-format paraphrase records for one article row.

        Args:
            row: Mapping-like row providing ``article_num``, ``article_text``,
                ``lang_display`` and ``lang_code``.
            semaphore: ``asyncio.Semaphore`` held for the duration of the request.

        Returns:
            A list of dicts, one per paraphrase, numbered from 1. If the
            request raises, the error is printed and five records with empty
            ``paraphrase_text`` are returned instead.
        """
        def as_records(texts):
            # One output record per paraphrase text, numbered starting at 1.
            return [
                {
                    'article_num': row['article_num'],
                    'original_text': row['article_text'],
                    'paraphrase_num': position,
                    'paraphrase_text': text,
                    'lang_display': row['lang_display'],
                    'lang_code': row['lang_code']
                }
                for position, text in enumerate(texts, 1)
            ]

        async with semaphore:
            try:
                raw_reply = await async_generate_paraphrases(row['article_num'], row['article_text'])
                return as_records(parse_paraphrase_json(raw_reply))
            except Exception as e:
                # Best-effort: a failed article still contributes five placeholder rows.
                print(f"Error processing article {row['article_num']}: {e}")
                return as_records([""] * 5)

    async def main_async_paraphrase(en_df):
        """Paraphrase every row of ``en_df`` and return all records in one flat list.

        At most ``CONCURRENCY_LIMIT`` rows hold the semaphore at a time;
        ``tqdm_asyncio.gather`` is used to await all rows.
        """
        gate = asyncio.Semaphore(CONCURRENCY_LIMIT)
        pending = [process_row(record, gate) for _, record in en_df.iterrows()]
        per_article = await tqdm_asyncio.gather(*pending)
        # Each element is the list of records for one article; flatten them.
        return [entry for batch in per_article for entry in batch]

    # To run:
    all_paraphrases = await main_async_paraphrase(en_df)
    paraphrases_df_long = pd.DataFrame(all_paraphrases)
    # print(f"Generated {len(paraphrases_df_long)} paraphrase records")

    paraphrases_df = paraphrases_df_long.pivot_table(
        index=['article_num', 'original_text'],
        columns='paraphrase_num',
        values='paraphrase_text',
        aggfunc='first'
    ).reset_index()
    paraphrases_df.columns.name = None

    # Rename columns for clarity
    paraphrases_df = paraphrases_df.rename(columns={
        1: 'para_1', 2: 'para_2', 3: 'para_3', 4: 'para_4', 5: 'para_5',
        'original_text': 'article_text'
    })

    # Reorder columns
    paraphrases_df = paraphrases_df[['article_num', 'article_text', 'para_1', 'para_2', 'para_3', 'para_4', 'para_5']]

    paraphrases_df.to_csv("data/udhr_simplified/udhr_eng_gpt-4o-paraphrases.csv", index=False)
    paraphrases_df.head(5)

Unnamed: 0,article_num,article_text,para_1,para_2,para_3,para_4,para_5
0,1,All human beings are born free and equal in di...,All people are born free and equal in rights a...,Every human being is born free and equal in di...,Human beings are born free and equal in rights...,All individuals are born free and equal in dig...,Every person is born with freedom and equality...
1,2,Everyone is entitled to all the rights and fre...,All individuals are entitled to the rights and...,Every person has the right to enjoy the freedo...,No one shall be denied the rights and freedoms...,The rights and freedoms in this Declaration ap...,Everyone is entitled to the rights in this Dec...
2,3,"Everyone has the right to life, liberty and th...",Every individual is entitled to the rights of ...,"All people possess the right to live, to be fr...","The right to life, freedom, and personal safet...","Each human being has the right to life, libert...","Everyone is guaranteed the right to live, to e..."
3,4,No one shall be held in slavery or servitude; ...,No individual shall be subjected to slavery or...,"Slavery and servitude are prohibited, and the ...",No person shall be kept in slavery or servitud...,"Slavery and forced servitude are forbidden, an...",Everyone is protected from slavery or servitud...
4,5,No one shall be subjected to torture or to cru...,Nobody shall be subjected to acts of torture o...,No individual shall endure torture or any trea...,Everyone is protected from being subjected to ...,"Torture, as well as cruel, inhuman, or degradi...",All persons are entitled to freedom from tortu...


In [None]:
# When the paraphrasing step was skipped, fall back to the saved results.
if not RUN_PARAPHRASE:
    paraphrases_df = pd.read_csv("data/udhr_simplified/udhr_eng_gpt-4o-paraphrases.csv")

para_columns = [f"para_{i}" for i in range(1, 6)]

# Whitespace-delimited word counts for the original text and each paraphrase.
stats_df = paraphrases_df.copy()
stats_df['original_word_count'] = stats_df['article_text'].str.split().str.len()
for para_col in para_columns:
    stats_df[f'{para_col}_word_count'] = stats_df[para_col].str.split().str.len()

# Per-article mean over the five paraphrase word counts.
count_columns = [f'{para_col}_word_count' for para_col in para_columns]
stats_df['avg_paraphrase_word_count'] = stats_df[count_columns].mean(axis=1)

# Keep only the summary columns and add paraphrase-minus-original length.
stats_df = stats_df[['article_num', 'original_word_count', 'avg_paraphrase_word_count']].copy()
stats_df['word_count_difference'] = stats_df['avg_paraphrase_word_count'] - stats_df['original_word_count']

original_counts = stats_df['original_word_count']
paraphrase_counts = stats_df['avg_paraphrase_word_count']

print("Descriptive Statistics:")
print(f"Original article word count - Mean: {original_counts.mean():.1f}, Std: {original_counts.std():.1f}")
print(f"Average paraphrase word count - Mean: {paraphrase_counts.mean():.1f}, Std: {paraphrase_counts.std():.1f}")

# Show detailed breakdown by article
stats_df

Descriptive Statistics:
Original article word count - Mean: 45.4, Std: 29.0
Average paraphrase word count - Mean: 30.5, Std: 12.7


Unnamed: 0,article_num,original_word_count,avg_paraphrase_word_count,word_count_difference
0,1,30,26.4,-3.6
1,2,78,33.8,-44.2
2,3,12,15.4,3.4
3,4,21,20.8,-0.2
4,5,16,18.4,2.4
5,6,13,16.0,3.0
6,7,39,26.2,-12.8
7,8,27,25.8,-1.2
8,9,11,13.0,2.0
9,10,33,28.6,-4.4


## Full UDHR Dataset

Amnesty International simplified articles found at:
- (1) https://www.amnesty.org/en/what-we-do/universal-declaration-of-human-rights/
- (2) https://www.amnestyusa.org/wp-content/uploads/2017/11/Simplified-UDHR.pdf

UDHR Meta (Articles structured according to Cassin's portico):
- https://en.wikipedia.org/wiki/Universal_Declaration_of_Human_Rights

In [None]:
# Inputs: article metadata, two Amnesty simplified versions, and the GPT-4o paraphrases.
meta = pd.read_csv("data/udhr_simplified/udhr_eng_meta.csv")
para1 = pd.read_csv("data/udhr_simplified/udhr_eng_simplified1_amnesty_int.csv")
para2 = pd.read_csv("data/udhr_simplified/udhr_eng_simplified2_amnesty_int.csv")
para4 = pd.read_csv("data/udhr_simplified/udhr_eng_gpt-4o-paraphrases.csv")

base_columns = [
    "article_num", "article_name", "cp_sec", "cassins_portico",
    "lang_display", "lang_code", "article_text",
    "amnesty_para1", "amnesty_para2",
]
gpt_column_names = {f"para_{i}": f"gpt_para{i}" for i in range(1, 6)}

# Left-join everything onto the English articles by article number. The
# "paraphrase_x"/"paraphrase_y" names are the suffixed columns produced when
# the second merge meets an existing "paraphrase" column.
en_df_full = (
    en_df
    .merge(para1, on="article_num", how="left")
    .merge(para2, on="article_num", how="left")
    .merge(meta, on="article_num", how="left")
    .rename(columns={"paraphrase_x": "amnesty_para1", "paraphrase_y": "amnesty_para2"})
    .loc[:, base_columns]
    .merge(paraphrases_df.drop(columns="article_text"), on="article_num", how="left")
    .rename(columns=gpt_column_names)
)
en_df_full.to_csv("data/udhr_simplified/udhr_eng_full.csv", index=False)
en_df_full.head()