In [6]:
import pandas as pd
import json

# Tockler activity export: semicolon-separated, one row per tracked window.
csv_path = "tockler-export.csv"
df_csv = pd.read_csv(csv_path, sep=';')

# Pipeline results stored as JSON; read the text and decode it in one go.
json_path = "academic_staff_results_20250509_075632.json"
with open(json_path, "r", encoding="utf-8") as f:
    data_json = json.loads(f.read())

# The manually reviewed title sample lives under step 4 of the results.
step4_results = data_json["step4"]
reviewed_sample = step4_results["reviewed_sample"]

# Preview both inputs side by side: the first CSV rows and one reviewed
# entry, so the column layout and the entry keys are visible.
csv_preview = df_csv.head()
first_reviewed = reviewed_sample[0]
(csv_preview, first_reviewed)


(                App          Type  \
 0  Windows Explorer  AppTrackItem   
 1    Microsoft Edge  AppTrackItem   
 2    Microsoft Edge  AppTrackItem   
 3    Microsoft Edge  AppTrackItem   
 4    Microsoft Edge  AppTrackItem   
 
                                                Title                Begin  \
 0                                    UnlockingWindow  2025-01-02 10:03:35   
 1  Editing A Process Analyst’s Guide to Renovatin...  2025-01-02 10:03:38   
 2  Editing A Process Analyst’s Guide to Renovatin...  2025-01-02 10:05:21   
 3  BPMN Editor | bpmn-js modeler Demo | demo.bpmn...  2025-01-02 10:05:30   
 4  Adobe Express and 9 more pages - School - Micr...  2025-01-02 10:05:33   
 
                    End  
 0  2025-01-02 10:03:38  
 1  2025-01-02 10:04:41  
 2  2025-01-02 10:05:30  
 3  2025-01-02 10:05:33  
 4  2025-01-02 10:05:36  ,
 {'title': 'Group sessie (fysiek) Process Science - Hajo Reijers (Software-Informatica) - Meeting - Calendar',
  'activities': ['collaborate wi

In [8]:
import json
from datetime import datetime

# Build an OCEL 2.0-style event log from the Tockler CSV rows.
#
# Pass 1: rows whose title is in the reviewed sample produce one event per
#         reviewed activity, each linked to all reviewed objects of that title.
# Pass 2: remaining rows produce a single "Unknown" event linked to the first
#         confirmed object whose name occurs as a substring of the title.

# Load reviewed sample and confirmed objects from JSON
reviewed_sample = data_json["step4"]["reviewed_sample"]
confirmed_objects = data_json["step3"]["confirmed_objects"]

# Create lookup from title to activity and objects
title_map = {entry["title"]: (entry["activities"], entry["objects"]) for entry in reviewed_sample}

# Object name -> object type, built once instead of re-scanning the list for
# every new object. setdefault keeps the first occurrence, matching a
# first-match linear search.
confirmed_type_by_name = {}
for entry in confirmed_objects:
    confirmed_type_by_name.setdefault(entry["object"], entry["object_type"])

# Initialize OCEL 2.0 structure
ocel = {
    "eventTypes": [],
    "objectTypes": [],
    "events": [],
    "objects": []
}

# Unique sets for defining schema
event_types_set = set()
object_types_dict = {}

# Track object IDs to avoid duplication
object_id_map = {}

# Set to keep track of processed titles
processed_titles = set()


def register_object(name, obj_type, first_seen):
    """Return the OCEL id for ``name``, creating the object on first sight.

    Args:
        name: Object name; used as the deduplication key and stored as the
            value of the object's "name" attribute.
        obj_type: Object type recorded on the object and in the schema.
        first_seen: Timestamp stored on the "name" attribute when the
            object is created (ignored for already-known objects).

    Returns:
        The object's id ("o1", "o2", ... in creation order).
    """
    if name not in object_id_map:
        # Ids are sequential, so the next id follows from the map size.
        object_id_map[name] = f"o{len(object_id_map) + 1}"
        object_types_dict.setdefault(obj_type, set()).add("name")
        ocel["objects"].append({
            "id": object_id_map[name],
            "type": obj_type,
            "attributes": [
                {
                    "name": "name",
                    "time": first_seen,
                    "value": name
                }
            ]
        })
    return object_id_map[name]


def add_event(event_type, timestamp, object_ids):
    """Append one event linked to ``object_ids`` and record its type.

    Args:
        event_type: Value for the event's "type" field.
        timestamp: Value for the event's "time" field.
        object_ids: Ids of the objects the event relates to; each becomes
            one relationship entry with qualifier "name".
    """
    ocel["events"].append({
        "id": f"e{len(ocel['events']) + 1}",
        "type": event_type,
        "time": timestamp,
        "attributes": [],
        "relationships": [
            {"objectId": object_id, "qualifier": "name"} for object_id in object_ids
        ]
    })
    event_types_set.add(event_type)


# --- Process known reviewed titles ---
# Only the Title and Begin columns are needed, so iterate over those two
# columns directly rather than materialising a Series per row.
for title, start_time in zip(df_csv["Title"], df_csv["Begin"]):
    if title not in title_map:
        continue
    processed_titles.add(title)

    activities, objects = title_map[title]
    if not activities:
        # No activity means no event, and objects are only created
        # alongside an event.
        continue

    object_ids = [
        register_object(obj, confirmed_type_by_name.get(obj, "unknown"), start_time)
        for obj in objects
    ]
    for activity in activities:
        add_event(activity, start_time, object_ids)

# --- Process unknown events based on confirmed_objects substrings ---
for raw_title, start_time in zip(df_csv["Title"], df_csv["Begin"]):
    title = str(raw_title) if pd.notnull(raw_title) else ""
    if title in processed_titles:
        continue  # Skip already handled titles

    for obj_entry in confirmed_objects:
        obj_name = obj_entry["object"]
        # An empty name is a substring of every title, so skip it rather
        # than linking every unmatched row to a nameless object.
        if obj_name and obj_name in title:
            object_id = register_object(obj_name, obj_entry["object_type"], start_time)
            add_event("Unknown", start_time, [object_id])
            break  # Only the first matching confirmed object is linked

# Counters kept as module-level names for any later cell that reads them;
# they hold the next unused id number.
event_counter = len(ocel["events"]) + 1
object_counter = len(ocel["objects"]) + 1

# --- Finalize schema ---
ocel["eventTypes"] = [{"name": etype, "attributes": []} for etype in sorted(event_types_set)]
ocel["objectTypes"] = [
    {"name": obj_type, "attributes": [{"name": a, "type": "string"} for a in sorted(attrs)]}
    for obj_type, attrs in object_types_dict.items()
]

# --- Save OCEL log ---
with open("ocel_log.json", "w", encoding="utf-8") as f:
    json.dump(ocel, f, indent=2)

print("OCEL 2.0 log saved to 'ocel_log.json'")


OCEL 2.0 log saved to 'ocel_log.json'


In [9]:
import pandas as pd
from collections import Counter

# --- Event-type frequencies ---
# Tally every event under its "type" field, then tabulate with the most
# frequent type first.
event_type_counts = Counter()
for ev in ocel["events"]:
    event_type_counts[ev["type"]] += 1

df_event_types = pd.DataFrame(event_type_counts.items(), columns=["Event Type", "Count"])
df_event_types = df_event_types.sort_values(by="Count", ascending=False)

# --- Object-type frequencies over event relationships ---
# Resolve each relationship's object id to that object's type.
object_id_to_type = {}
for ocel_object in ocel["objects"]:
    object_id_to_type[ocel_object["id"]] = ocel_object["type"]

linked_types = (
    object_id_to_type.get(rel["objectId"])
    for ev in ocel["events"]
    for rel in ev["relationships"]
)
# Relationships whose id is unknown (or whose type is empty) are not counted.
object_type_counts = Counter(linked_type for linked_type in linked_types if linked_type)

df_object_types = pd.DataFrame(object_type_counts.items(), columns=["Object Type", "Count"])
df_object_types = df_object_types.sort_values(by="Count", ascending=False)

# Print the two summary tables
print("Event Type Frequencies:")
print(df_event_types)

print("\nObject Type Frequencies in Relationships:")
print(df_object_types)


Event Type Frequencies:
                     Event Type  Count
11                      Unknown   6467
5   collaborate with colleagues     62
0      manage research projects     55
10        analyze research data     32
2                   grade exams     26
3            attend conferences      8
4        present at conferences      8
9   coordinate with departments      6
7     participate in committees      5
1                  design exams      4
8           review publications      4
6    attend department meetings      3

Object Type Frequencies in Relationships:
         Object Type  Count
3  research_projects   2228
4         colleagues   1565
2        conferences   1136
0       publications    928
7           students    430
6         committees    368
8            courses     59
1              exams     30
5        departments     12
