In [2]:
import pandas as pd
import re

# ---------------------------
# 1. Parse the profile section
# ---------------------------
def parse_profile(prompt):
    """Parse the respondent-profile lines of a prompt into a flat dict.

    The prompt is scanned line by line (each line stripped of surrounding
    whitespace). Bullet lines such as ``- Age: 18`` and four attitude
    statements are recognised; every other line is ignored. Keys whose line
    never appears keep the value ``None``.

    Parameters
    ----------
    prompt : str
        Full prompt text containing the profile section.

    Returns
    -------
    dict
        Keys: age, gender, income, education, political_leaning,
        religious_level, donated_before, donation_frequency, trust_charities,
        charities_rated_by_effectiveness, objective_measures_help,
        charities_match_my_values, country, province. Numeric fields are
        floats, the others are strings.

    Raises
    ------
    ValueError
        If the text of a numeric field cannot be converted with ``float``.
    IndexError
        If a "- Political leaning" / "- Religious level" line has no colon.
    """
    profile = {
        "age": None,
        "gender": None,
        "income": None,
        "education": None,
        "political_leaning": None,
        "religious_level": None,
        "donated_before": None,
        "donation_frequency": None,
        "trust_charities": None,
        "charities_rated_by_effectiveness": None,
        "objective_measures_help": None,
        "charities_match_my_values": None,
        "country": None,
        "province": None
    }

    def _text_value(text_line):
        # Split on the FIRST colon only, so a free-text value that itself
        # contains ':' (e.g. "Other: trade certificate") is kept whole.
        return text_line.split(":", 1)[1].strip()

    # Line-wise extraction
    for line in prompt.split("\n"):
        line = line.strip()

        if line.startswith("- Age:"):
            profile["age"] = float(line.split(":")[1].strip())
        elif line.startswith("- Gender:"):
            profile["gender"] = _text_value(line)
        elif line.startswith("- Income:"):
            profile["income"] = float(line.split(":")[1].strip())
        elif line.startswith("- Education:"):
            profile["education"] = _text_value(line)
        elif line.startswith("- Political leaning"):
            # The label is matched without its colon, so any suffix between the
            # label and the colon is tolerated.
            profile["political_leaning"] = float(line.split(":")[1].strip())
        elif line.startswith("- Religious level"):
            profile["religious_level"] = float(line.split(":")[1].strip())
        elif line.startswith("- Donated before:"):
            profile["donated_before"] = _text_value(line)
        elif line.startswith("- Donation frequency:"):
            profile["donation_frequency"] = _text_value(line)
        # Attitude statements: the score is whatever follows the LAST colon.
        elif "I trust charitable organizations" in line:
            profile["trust_charities"] = float(line.split(":")[-1].strip())
        elif "Charities can be rated by effectiveness" in line:
            profile["charities_rated_by_effectiveness"] = float(line.split(":")[-1].strip())
        elif "Objective measures help choose charities" in line:
            profile["objective_measures_help"] = float(line.split(":")[-1].strip())
        elif "I choose charities matching my values" in line:
            profile["charities_match_my_values"] = float(line.split(":")[-1].strip())
        elif "- Country:" in line:
            # Greedy country group: a country name containing commas stays
            # intact because the split happens at the last ", Province:".
            match = re.search(r'Country:(.*), Province:(.*)', line)
            if match:
                profile["country"] = match.group(1).strip()
                profile["province"] = match.group(2).strip() or None
            else:
                # No province on the line: still keep the country instead of
                # discarding the whole line.
                match = re.search(r'Country:(.*)', line)
                profile["country"] = match.group(1).strip() or None

    return profile

# ---------------------------
# 2. Parse donation options
# ---------------------------
def parse_donation_prompt(prompt):
    """Extract the structured attributes of one donation option from its text.

    Parameters
    ----------
    prompt : str
        Free-text description of a single donation option.

    Returns
    -------
    dict
        Keys: gender, age, identifiability, relatedness, num_recipients,
        cause, brand, location. ``num_recipients`` is an int, the rest are
        strings; any attribute not found in the text stays ``None``.

    Notes
    -----
    * ``cause`` / ``num_recipients`` come from the first
      "provides <cause> to <number>" phrase.
    * Keyword attributes are matched per whitespace-separated token,
      case-insensitively, after stripping surrounding punctuation; when
      several tokens match the same attribute the last one wins.
    * ``location`` is the first entry of the region list (in list order) that
      occurs in the text as a whole phrase, case-sensitively.
    """
    parsed = {
        "gender": None,
        "age": None,
        "identifiability": None,
        "relatedness": None,
        "num_recipients": None,
        "cause": None,
        "brand": None,
        "location": None
    }

    ages = ["child", "adult", "senior"]
    relatedness = ["self", "relative", "stranger"]
    identifiabilities = ["named", "unnamed"]
    genders = ["male", "female"]
    brands = ["unnamed", "low-recognition", "high-recognition"]
    locations = [
        "North America", "South America", "Central America", "Western Europe",
        "Eastern Europe", "North Africa", "South Africa", "Central Africa",
        "East Asia", "SouthEast Asia"
    ]

    # The cause is captured verbatim from the sentence rather than matched
    # against a fixed vocabulary.
    match = re.search(r'provides (.*?) to (\d+)', prompt)
    if match:
        parsed["cause"] = match.group(1).strip()
        parsed["num_recipients"] = int(match.group(2))

    for word in prompt.split():
        # Strip surrounding punctuation so tokens like "child," or "female."
        # are still recognised; inner hyphens ("low-recognition") are kept.
        word = word.strip(".,;:!?()[]{}\"'").lower()
        if word in ages:
            parsed["age"] = word
        elif word in relatedness:
            parsed["relatedness"] = word
        elif word in identifiabilities:
            parsed["identifiability"] = word
        elif word in genders:
            parsed["gender"] = word
        elif word in brands:
            # NOTE(review): "unnamed" is also listed under identifiabilities,
            # which is tested first, so brand can never become "unnamed" here.
            # Telling the two apart needs the prompt wording — confirm and
            # disambiguate by context if an unnamed brand must be captured.
            parsed["brand"] = word

    for loc in locations:
        # Whole-phrase match: "East Asia" is a substring of "SouthEast Asia",
        # so a plain `in` test would mislabel every SouthEast Asia prompt.
        if re.search(r'\b' + re.escape(loc) + r'\b', prompt):
            parsed["location"] = loc
            break

    return parsed

# ---------------------------
# 3. Load the dataset and process
# ---------------------------
# Raw records: each row holds the profile prompt, the two option texts,
# their region labels and the option that was picked.
df = pd.read_csv("/content/donation_choices_llama3.csv")  # Replace with your CSV file path

# Flatten every record into one dict. Column order in the output follows the
# key order here: profile fields, A_* option fields + region, B_* option
# fields + region, then the chosen option.
parsed_data = [
    {
        **parse_profile(record["prompt"]),
        **{
            f"A_{field}": value
            for field, value in parse_donation_prompt(record["option_A"]).items()
        },
        "option_A_region": record["option_A_region"],
        **{
            f"B_{field}": value
            for field, value in parse_donation_prompt(record["option_B"]).items()
        },
        "option_B_region": record["option_B_region"],
        "chosen_option": record["chosen_option"],
    }
    for _, record in df.iterrows()
]

# Assemble the structured table from the flattened records.
df_final = pd.DataFrame(parsed_data)

# Persist the result (index column omitted) and confirm where it went.
df_final.to_csv("structured_donation_choices_llama3_full.csv", index=False)
print("Saved parsed output to 'structured_donation_choices_llama3_full.csv'")


Saved parsed output to 'structured_donation_choices_llama3_full.csv'


In [3]:
# Read the structured CSV back from disk (same filename the parsing cell writes).
data = pd.read_csv("structured_donation_choices_llama3_full.csv")


In [None]:
# Preview the first 10 rows of the reloaded table.
data.head(10)

Unnamed: 0,age,gender,income,education,political_leaning,religious_level,donated_before,donation_frequency,trust_charities,charities_rated_by_effectiveness,...,B_gender,B_age,B_identifiability,B_relatedness,B_num_recipients,B_cause,B_brand,B_location,option_B_region,chosen_option
0,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,child,unnamed,,46,medication,,Central Africa,far,Option A
1,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,adult,,,96,assault victim support,high-recognition,North America,far,Option A
2,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,senior,unnamed,,59,nutritious meals,,Western Europe,far,Option A
3,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,senior,,,285,assault victim support,high-recognition,Central America,far,Option A
4,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,senior,,,32,medication,low-recognition,North America,far,Option A
5,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,female,,unnamed,,197,medication,,South America,far,Option A
6,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,,unnamed,,180,assault victim support,high-recognition,Western Europe,far,Option A
7,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,child,unnamed,,137,clean water,,North America,far,Option A
8,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,,adult,,,115,assault victim support,low-recognition,North Africa,far,Option A
9,18.0,Male,119157.81,High school diploma,5.0,0.0,Yes,Less than once a month,0.73,0.6,...,male,,,,191,clean water,low-recognition,Central America,far,Option A
