## Parse data


In [1]:
import math
import os
import warnings
from glob import glob
from typing import List

import pandas as pd

# import re
from bs4 import BeautifulSoup

# Silence DeprecationWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Every .xlsx workbook in the experiment folder, in sorted path order.
files = glob("../typedb-schema/CPs exp 2a data/*.xlsx")
files.sort()

# Filled by the parsing loop: participant id -> list of parsed rows.
data = dict()


def change_class_name(class_name: str) -> str:
    """Translate a plural class name into its singular type name.

    Args:
        class_name (str): One of "agents", "objects", "actions" or
            "locations".

    Returns:
        str: "actor", "object", "action" or "location" respectively.

    Raises:
        KeyError: If ``class_name`` is not one of the four known names.

    """
    singular_by_class = dict(
        agents="actor",
        objects="object",
        actions="action",
        locations="location",
    )
    return singular_by_class[class_name]


def _is_missing(cell) -> bool:
    """Return True when a spreadsheet cell holds a float NaN."""
    return isinstance(cell, float) and math.isnan(cell)


def _parse_items(container) -> list:
    """Collect ``[type, text]`` pairs for every ``div.item`` below *container*.

    Args:
        container: A BeautifulSoup document or tag to search in.

    Returns:
        list: One ``[type_name, lower-cased text]`` pair per item, in document
            order; items whose class is "counters" are left out.

    """
    parsed = []
    for item in container.find_all("div", class_="item"):
        classes = item.get("class")
        # The second class is used when there is one, otherwise the only one.
        class_name = classes[1] if len(classes) > 1 else classes[0]
        # Process the text content more carefully to add spaces around buttons
        # or special divs. (`text=` is the legacy spelling of `string=`; it is
        # kept so the behaviour does not depend on the installed bs4 version.)
        texts = item.find_all(text=True)
        text_content = " ".join(text.strip() for text in texts).strip().lower()
        if class_name != "counters":  # we don't use this anymore
            parsed.append([change_class_name(class_name), text_content])
    return parsed


print(f"number of files (participants): {len(files)}\n")
# Loop through the list of files and read each file
for file in files:
    print(f"processing `{file}` ...")

    # basename() honours the platform's path separator, so the participant id
    # is still the leading part of the file name when glob returns backslashes.
    participant = os.path.basename(file).split("_")[0]

    dataframe = pd.read_excel(file)
    # Only the last row recorded for each `name` is kept.
    dataframe.drop_duplicates(subset="name", keep="last", inplace=True)
    assert len(dataframe.situation) == len(dataframe.actions)
    print(
        f"There are in total of {len(dataframe.situation)} situation-action pairs "
        f"in the file {file}, \nparticipant number: {participant}\n"
    )

    situations = []
    actions = []
    data[participant] = []
    for situation, action in zip(dataframe.situation, dataframe.actions):
        # An empty situation cell is recorded as None rather than parsed.
        if _is_missing(situation):
            situation_parsed = None
        else:
            situation_parsed = _parse_items(BeautifulSoup(situation, "html.parser"))
        situations.append(situation_parsed)

        # An empty action cell yields no boxes instead of crashing the parser.
        action_parsed = {}
        if not _is_missing(action):
            soup = BeautifulSoup(action, "html.parser")
            # One entry per action box, keyed by the box's id attribute.
            for box in soup.find_all("div", class_="box box_action"):
                action_parsed[box.get("id")] = _parse_items(box)

        actions.append(action_parsed)
        assert len(situations) == len(actions)
        data[participant].append(
            {"situation": situation_parsed, "what_we_do": action_parsed}
        )

number of files (participants): 0



## Make the data a bit more explainable


In [2]:
# Regroup each parsed row into a situation graph plus separate lists of
# human and robot action boxes.
data_better = {}
weird = []  # action boxes that name neither a robot nor a human actor
participants = sorted(data)

for participant_id, data_list in data.items():
    regrouped = []
    data_better[participant_id] = regrouped

    for data_dict in data_list:
        human_macro_actions = []
        robot_macro_actions = []

        for list_of_tuples in data_dict["what_we_do"].values():
            # The robot check comes first, so a box naming both actors is
            # filed under the robot.
            if ["actor", "robot"] in list_of_tuples:
                bucket = robot_macro_actions
            elif ["actor", "human"] in list_of_tuples:
                bucket = human_macro_actions
            else:
                bucket = weird
            bucket.append(list_of_tuples)

        regrouped.append(
            {
                "situation_graph": data_dict["situation"],
                "human_action_graph": human_macro_actions,
                "robot_action_graph": robot_macro_actions,
            }
        )

# From here on `data` refers to the regrouped structure.
data = data_better

In [3]:
# Dump every participant's situations and action graphs for inspection.
for participant_id, list_ in data.items():
    print(f"Participant {participant_id}")
    for i, data_dict in enumerate(list_):
        # One print call per situation; the trailing "" yields the blank line.
        print(
            f"Situation {i:02}",
            f"Situation graph: {data_dict['situation_graph']}",
            f"Human action graphs: {data_dict['human_action_graph']}",
            f"Robot action graphs: {data_dict['robot_action_graph']}",
            "",
            sep="\n",
        )

## Convert the Python dict to RDF/Turtle data


In [4]:
from typing import List
from collections import defaultdict


def order_types(list_of_type_and_literal: List[List[str]]) -> List[List[str]]:
    """Return the pairs sorted as actor, action, object, location.

    Pairs that share a type keep their original relative order.

    Args:
        list_of_type_and_literal (List[List[str]]): ``[type, literal]`` pairs

    Returns:
        List[List[str]]: A new list holding the same pairs in canonical order

    """
    # Canonical position of each type
    order = ["actor", "action", "object", "location"]

    assert all(
        pair[0] in order for pair in list_of_type_and_literal
    ), f"Unknown type: {list_of_type_and_literal}"

    def position(pair):
        return order.index(pair[0])

    ordered = list(list_of_type_and_literal)
    ordered.sort(key=position)
    return ordered


def type_and_literal_to_turtle(type_: str, literal: str) -> str:
    """Convert a type and literal to a Turtle predicate-object line.

    The literal is escaped so that quotes, backslashes and control characters
    in it cannot terminate or corrupt the Turtle string.

    Args:
        type_ (str): The type; it becomes the ``colearn:<type>Type`` predicate
        literal (str): The literal value

    Returns:
        str: The indented Turtle line, ending in `` ;``

    """
    # Backslashes go first so the escapes added afterwards are not re-escaped.
    escaped = (
        str(literal)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f'    colearn:{type_}Type "{escaped}" ;'


def is_valid_actions_in_graph(graph: List[List[str]]) -> bool:
    """Tell whether the graph holds at least one non-empty action.

    Args:
        graph (List[List[str]]): A list of actions, each itself a list

    Returns:
        bool: True if any action has one or more entries, False otherwise
            (including for an empty graph)

    """
    return any(len(action) > 0 for action in graph)


def add_human_or_robot_action_to_turtle(
    list_of_actions: list,
    participant_id: str,
    situation_id: str,
    human_or_robot: str,
) -> List[str]:
    """Render one actor's chain of actions as Turtle lines.

    Each action becomes a ``colearn:<participant>_<situation>_<h|r>NN``
    resource that points back at its situation and is linked to its neighbours
    with ``hasNextAction`` / ``hasPreviousAction``.

    Args:
        list_of_actions (list): The actions, each a list of [type, literal] pairs
        participant_id (str): The participant ID
        situation_id (str): The situation ID
        human_or_robot (str): "human" (case-insensitive) selects the human
            actor; any other value selects the robot

    Returns:
        List[str]: The Turtle lines, starting with a header comment

    """
    last_index = len(list_of_actions) - 1

    is_human = human_or_robot.lower() == "human"
    actor, prefix = ("Human", "h") if is_human else ("Robot", "r")
    situation_iri = f"colearn:{participant_id}_{situation_id}"

    lines = [f"# Define the {actor} actions."]

    for idx_a, action in enumerate(list_of_actions):
        action_id = f"{prefix}{idx_a:02}"

        lines.append(f"{situation_iri}_{action_id} a colearn:{actor}Action ;")
        lines.append(
            f'    rdfs:label "Participant {participant_id}, Situation {situation_id}, {actor}-Action {action_id}" ;'
        )
        lines.extend(
            type_and_literal_to_turtle(type_, literal)
            for type_, literal in order_types(action)
        )

        # Every action belongs to the situation; neighbours are linked only
        # where they exist (no "previous" on the first, no "next" on the last).
        links = [f"    colearn:isActionOf {situation_iri}"]
        if idx_a < last_index:
            links.append(
                f"    colearn:hasNextAction {situation_iri}_{prefix}{idx_a + 1:02}"
            )
        if idx_a > 0:
            links.append(
                f"    colearn:hasPreviousAction {situation_iri}_{prefix}{idx_a - 1:02}"
            )

        # All links but the last continue the statement; the last one closes it.
        lines.extend(link + " ;" for link in links[:-1])
        lines.append(links[-1] + " .\n")

    return lines


def create_turtle(participant_id: str, data: List[dict]) -> List[str]:
    """Build the Turtle document for one participant.

    Situations whose situation graph is None, or that have neither a human nor
    a robot action, are skipped; their index is still consumed, so situation
    ids follow the position in ``data``.

    Args:
        participant_id (str): The participant ID
        data (List[dict]): One dict per situation with the keys
            "situation_graph", "human_action_graph" and "robot_action_graph"

    Returns:
        List[str]: The lines of the Turtle document

    """
    participant_iri = f"colearn:{participant_id}"

    # Namespace header followed by the participant itself
    lines = [
        "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .",
        "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .",
        "@prefix owl: <http://www.w3.org/2002/07/owl#> .",
        "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .",
        "@prefix colearn: <http://example.org/colearn#> .",
        "",
        "# Define a participant",
        f"{participant_iri} a colearn:Participant ;",
        f'    rdfs:label "Participant {participant_id}" .',
        "",
    ]

    for idx_s, situation in enumerate(data):
        graph = situation["situation_graph"]
        has_human = is_valid_actions_in_graph(situation["human_action_graph"])
        has_robot = is_valid_actions_in_graph(situation["robot_action_graph"])

        if graph is None or not (has_human or has_robot):
            continue

        situation_id = f"s{idx_s:02}"
        situation_iri = f"{participant_iri}_{situation_id}"

        lines += [
            f"# A new situation {participant_id}_{situation_id} starts",
            f"{participant_iri} colearn:hasSituation {situation_iri} .\n",
            "# Define a situation.",
            f"{situation_iri} a colearn:Situation ;",
            f'    rdfs:label "Participant {participant_id}, Situation {situation_id}" ;',
            f"    colearn:isSituationOf {participant_iri} ;",
        ]
        lines.extend(
            type_and_literal_to_turtle(type_, literal)
            for type_, literal in order_types(graph)
        )

        # Point at the first action of each actor that has any; the last link
        # closes the situation statement.
        first_actions = []
        if has_human:
            first_actions.append(f"    colearn:hasHumanAction {situation_iri}_h00")
        if has_robot:
            first_actions.append(f"    colearn:hasRobotAction {situation_iri}_r00")
        lines.extend(link + " ;" for link in first_actions[:-1])
        lines.append(first_actions[-1] + " .\n")

        # Human actions are emitted before robot actions.
        for present, graph_key, who in (
            (has_human, "human_action_graph", "Human"),
            (has_robot, "robot_action_graph", "Robot"),
        ):
            if present:
                lines.extend(
                    add_human_or_robot_action_to_turtle(
                        situation[graph_key],
                        participant_id,
                        situation_id,
                        who,
                    )
                )

    return lines


def save_turtle_file(filename: str, content: List[str]):
    """Write the Turtle lines to a UTF-8 file and report where it went.

    Each entry of ``content`` is written followed by a newline; an existing
    file at ``filename`` is overwritten.

    Args:
        filename (str): The path of the file to write
        content (List[str]): The lines to be saved

    """
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(line + "\n" for line in content)
    print(f"Turtle data saved to {filename}")


# Write one Turtle file per participant into ./data, creating the directory
# first so the export also works on a fresh checkout.
output_dir = "./data"
os.makedirs(output_dir, exist_ok=True)

for participant_id, list_of_situations in data.items():
    # Generate Turtle content from data
    turtle_content = create_turtle(participant_id, list_of_situations)

    # Save to a Turtle file
    save_turtle_file(f"{output_dir}/{participant_id}.ttl", turtle_content)