In [1]:
import json
import pandas as pd
import numpy as np

countries = ["United States", "India", "Pakistan", "Nigeria", "Philippines", "United Kingdom", "Germany", "Uganda", "Canada", "Egypt", "France", "Australia"]
vis_countries = ["US", "India", "Pakistan", "Nigeria", "Philippines", "UK", "Germany", "Uganda", "Canada", "Egypt", "France", "Australia"]

topics = ["Politics", "Social Networks", "Social Inequality", "Family & Changing Gender Roles", "Work Orientation", "Religion", "Environment", "National Identity", "Citizenship", "Leisure Time and Sports", "Health and Health Care"]
vis_topics = ["Politics", "SocialNet", "Inequality", "Family", "Work", "Religion", "Environment", "Identity", "Citizenship", "Leisure", "Health"]


schwartz_values = {
    "Power": ["Social Power", "Authority", "Wealth", "Preserving my Public Image", "Social Recognition"],
    "Achievement": ["Successful", "Capable", "Ambitious", "Influential", "Intelligent", "Self-Respect"],
    "Hedonism": ["Pleasure", "Enjoying Life"],
    "Stimulation": ["Daring", "A Varied Life", "An Exciting Life"],
    "Self-direction": ["Creativity", "Curious", "Freedom", "Choosing Own Goals", "Independent"],
    "Universalism": ["Protecting the Environment", "A World of Beauty", "Broad-Minded", "Social Justice", "Wisdom", "Equality", "A World at Peace", "Inner Harmony", "Unity With Nature"],
    "Benevolence": ["Helpful", "Honest", "Forgiving", "Loyal", "Responsible", "True Friendship", "A Spiritual Life", "Mature Love", "Meaning in Life"],
    "Tradition": ["Devout", "Accepting my Portion in Life", "Humble", "Moderate", "Respect for Tradition", "Detachment"],
    "Conformity": ["Politeness", "Honoring of Parents and Elders", "Obedient", "Self-Discipline"],
    "Security": ["Clean", "National Security", "Social Order", "Family Security", "Reciprocation of Favors", "Healthy", "Sense of Belonging"]
}

def get_value_list(schwartz_values):
    value_list = []
    for key, value in schwartz_values.items():
        value_list.extend([f"{value}" for value in value])
    return value_list
value_list = get_value_list(schwartz_values)
print(len(value_list))

def get_scenario_list(countries, topics):
    scenarios_list = []
    for country in countries:
        for topic in topics:
            scenarios_list.append(f"{country}+{topic}")
    return scenarios_list
scenarios_list = get_scenario_list(countries, topics)
print(len(scenarios_list))



56
132


### Check Task1

In [8]:
def clean_generation(response: str) -> str:
    """Extract the task1's results in json format."""
    if "```" in response:
        sub1 = "```json"
        sub2 = "```"
        response = ''.join(response.split(sub1)[1].split(sub2)[0])
        return response
    else:
        return response
        

def clean_generation_without_json(response: str) -> str:
    """Extract the task1's results in json format."""
    if "```" in response:
        sub1 = "```"
        sub2 = "```"
        response = ''.join(response.split(sub1)[1].split(sub2)[0])
        return response
    else:
        return response


def clean_value_response(response: str) -> str:
    """Replaces the response that only reply string without number. Rules are:
       1: very much like me, 2: like me, 3: not like me, 4: Not like me at all """
    response = response.lower().replace("not like me at all", "4").replace("not like me", "3").replace("very much like me", "1").replace("like me", "2")
    return response


def generate_full_t1_table(t1_measures: pd.DataFrame, value_list: list) -> list[list]:
    """Parses the task1's results into dataframe."""
    full_t1_table_pd = []
    parse_errors, value_errors = 0, 0
    for index, row in t1_measures.iterrows():
        country = row['country']
        topic   = row['topic']
        prompt_index = row['prompt_index']
        try:
            response = json.loads(clean_generation(row['response']))
        except Exception as e:
            try:
                response = json.loads(clean_generation_without_json(row['response']))
            except Exception as e:
                parse_errors += 1
                print(f"====== Couldn't Parse Response; Error={e} =======\n")
                print(f"Index={index}; Response={row['response']}")
        value_response_list = []
        for value in value_list:
            try:
                if value in response.keys():
                    value_response_list.append(int(clean_value_response(response[value])[0]))
            except Exception as e:
                print(f"====== Couldn't Extract Values; Error={e} =======\n")
                print(f"Index={index}; value={value} Cleaned Response: {clean_value_response(response[value])}")
                value_errors += 1

        pd_row = [country, topic, prompt_index] + value_response_list
        full_t1_table_pd.append(pd_row)
        
    print(f"Total Parsing Error Count = {parse_errors} (out of 132*8 = 1056); Value Errors Count={value_errors}")
    return full_t1_table_pd

### Check Task2

In [9]:
def generate_full_t2_table(t2_measures: pd.DataFrame, value_list: list, model = 'gpt4o-mini') -> pd.DataFrame:
    """Parses the task2's results into dataframe."""
    full_value_dict = {}
    for index, row in t2_measures.iterrows():
        if row['model_choice'] == True:
            country = row['country']
            topic   = row['topic']
            prompt_index = row['prompt_index']
            key = f"{country}+{topic}+{prompt_index}"
            value = row['value']
            if model == 'gpt4o-mini':
                polarity = 1 if row['polarity'] == 'positive' else 0 ### GPT4o-mini saved the flipped reponses 
            else:
                polarity = 0 if row['polarity'] == 'positive' else 1

            if key in full_value_dict.keys():
                full_value_dict[key][value] = polarity
            else:
                full_value_dict[key] = {value: polarity}

    ### Get the full dictionary
    full_t2_table_pd = []
    missing_value_count = 0
    for key, value_dict in full_value_dict.items():
        country, topic, prompt_index = key.split('+')
        value_response_list = []
        for value in value_list:
            if value in value_dict.keys():
                value_response_list.append(int(value_dict[value]))
            else:
                missing_value_count += 1
                value_response_list.append(0)
                print(f"=== Warning: Missing Value country={country}, topic={topic}, value={value}")
        # value_response_list = [int(value_dict[value]) if value in value_dict.keys() else 0 for value in value_list]
        pd_row = [country, topic, prompt_index] + value_response_list
        full_t2_table_pd.append(pd_row)

    ### TODO: Haven't check the country and topic with full list yet.
    print(f"=== Total Missing Contextual Values = {missing_value_count} out of 14784")
    return full_t2_table_pd


### Start Sanity Check on Data

### Gemma's results:

Task1 errors:

- Total Parsing Error Count = 24 (out of 132*8 = 1056); Value Errors Count=0

Task2 errors:
- Total Missing Contextual Values = 63 out of 14784

In [10]:
t1_measures = pd.read_csv("../../outputs/evaluation/gemma-2-9b-it_t1.csv")
full_t1_responses = pd.DataFrame(generate_full_t1_table(t1_measures, value_list), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t1_measures))

t2_measures = pd.read_csv("../../outputs/evaluation/gemma-2-9b-it_t2.csv")
full_t2_responses = pd.DataFrame(generate_full_t2_table(t2_measures, value_list, "gemma"), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t2_measures))


Index=154; Response=Here's a possible response reflecting Indian cultural values, presented in JSON format:

```json
{
  "Equality": "2", 
  "Inner Harmony": "1",
  "Social Power": "3",
  "Pleasure": "2",
  "Freedom": "2",
  "A Spiritual Life": "1",
  "Sense of Belonging": "1",
  "Social Order": "1",
  "An Exciting Life": "2",
  "Meaning in Life": "1",
  "Politeness": "1",
  "Wealth": "3",
  "National Security": "1",
  "Self-Respect": "1",
  "Reciprocation of Favors": "1",
  "Creativity": "2",
  "A World at Peace": "1", 
  "Respect for Tradition": "1",
  "Mature Love": "1",
  "Self-Discipline": "1",
  "Detachment": "3",
  "Family Security": "1",
  "Social Recognition": "2",
  "Unity With Nature": "2",
  "A Varied Life": "2",
  "Wisdom": "1",
  "Authority": "3",
  "True Friendship": "1",
  "A World of Beauty": "1",  
  "Social Justice": "1",
  "Independent": "2",
  "Moderate": "1", 
  "Loyal": "1", 
  "Ambitious": "2", 
  "Broad-Minded": "2",
  "Humble": "1", 
  "Daring": "2",
  "Prote

### ChatGPT's results:

Task1 errors:

- Total Parsing Error Count = 0 (out of 132*8 = 1056); Value Errors Count=0

Task2 errors:
- Total Missing Contextual Values = 63 out of 14784

In [11]:
t1_measures = pd.read_csv("../../outputs/evaluation/gpt-3.5-turbo_t1.csv")
full_t1_responses = pd.DataFrame(generate_full_t1_table(t1_measures, value_list), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t1_measures))

t2_measures = pd.read_csv("../../outputs/evaluation/gpt-3.5-turbo_t2.csv")
full_t2_responses = pd.DataFrame(generate_full_t2_table(t2_measures, value_list, "gemma"), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t2_measures))

Total Parsing Error Count = 0 (out of 132*8 = 1056); Value Errors Count=0
1056
=== Total Missing Contextual Values = 63 out of 14784
14658


### GPT4o's results:

Task1 errors:

- Total Parsing Error Count = 1 (out of 132*8 = 1056); Value Errors Count=0

Task2 errors:
- Total Missing Contextual Values = 198 out of 14784

In [12]:
t1_measures = pd.read_csv("../../outputs/evaluation/gpt-4o-mini_t1.csv")
full_t1_responses = pd.DataFrame(generate_full_t1_table(t1_measures, value_list), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t1_measures))

t2_measures = pd.read_csv("../../outputs/evaluation/gpt-4o-mini_t2.csv")
full_t2_responses = pd.DataFrame(generate_full_t2_table(t2_measures, value_list, "gemma"), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t2_measures))


Index=1039; Response=```json
{
    "Equality": "1",
    "Inner Harmony": "1",
    "Social Power": "4",
    "Pleasure": "2",
    "Freedom": "1",
    "A Spiritual Life": "2",
    "Sense of Belonging": "1",
    "
Total Parsing Error Count = 1 (out of 132*8 = 1056); Value Errors Count=0
1056
=== Total Missing Contextual Values = 198 out of 14784
14388


### Llama-3.3's results:

Task1 errors:

- Total Parsing Error Count = 0 (out of 132*8 = 1056); Value Errors Count=0

Task2 errors:
- Total Missing Contextual Values = 63 out of 14784

In [13]:
t1_measures = pd.read_csv("../../outputs/evaluation/Llama-3.3-70B-Instruct_t1.csv")
full_t1_responses = pd.DataFrame(generate_full_t1_table(t1_measures, value_list), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t1_measures))

t2_measures = pd.read_csv("../../outputs/evaluation/Llama-3.3-70B-Instruct_t2.csv")
full_t2_responses = pd.DataFrame(generate_full_t2_table(t2_measures, value_list, "gemma"), columns=['country', 'topic', 'prompt_index'] + [f"value_{value}" for value in value_list])
print(len(t2_measures))

Total Parsing Error Count = 0 (out of 132*8 = 1056); Value Errors Count=0
1056
=== Total Missing Contextual Values = 63 out of 14784
14658
