First, create a new conda environment named BI2025 and install the required packages from requirements.txt


In [1]:
# !conda create -n BI2025 python=3.11 -y
# !conda activate BI2025
!pip install -r requirements.txt

Collecting starvers@ git+https://github.com/AllStarsAT/starvers.git (from -r requirements.txt (line 4))
  Cloning https://github.com/AllStarsAT/starvers.git to c:\users\ibrar\appdata\local\temp\pip-install-tvbhsyd3\starvers_276e6140e53b4a8186f618d459c79f2d
  Resolved https://github.com/AllStarsAT/starvers.git to commit 5e6e112e2b37cb5a27af9585bd572d27187ef735
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/AllStarsAT/starvers.git 'C:\Users\ibrar\AppData\Local\Temp\pip-install-tvbhsyd3\starvers_276e6140e53b4a8186f618d459c79f2d'


In [2]:
# DO NOT MODIFY OR COPY THIS CELL!! 
# Note: The only imports allowed are Python's standard library, pandas, numpy, scipy, matplotlib, seaborn and scikit-learn
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import plotly.express as px
import datetime
import typing
import requests
import time
import shutil
import json
from starvers.starvers import TripleStoreEngine

## Graph-based documentation preliminaries

**!!!IMPORTANT!!!**

Every time you work on this notebook, enter your student ID in the `executed_by` variable so that the cell executions are attributed to you.

In [3]:
# Agent recorded as executor of every cell; replace the digits after "id_" with your own student ID.
executed_by = "stud-id_12350094"

Set your group and student IDs. Do this only once.

In [4]:
# Project group identifier (replace the digits with your group id).
group_id = "16"

# The two students working on this notebook
# (replace the digits after "id_" with the respective student IDs).
student_a, student_b = "stud-id_12350094", "stud-id_11826186"

In [5]:
# Local names of the prov:Role nodes used in associations. Don't change these values.
code_writer_role, code_executor_role = "code_writer", "code_executor"

Set up the starvers API for logging your steps into our server-side graph database.

In [6]:
# Both endpoints live under the same repository URL: the repository itself is
# passed as the first (get) endpoint, its /statements resource as the second (post).
_starvers_base = "https://starvers.ec.tuwien.ac.at/BI2025"
get_endpoint = _starvers_base
post_endpoint = f"{_starvers_base}/statements"

# Engine used by every documentation cell to insert triples.
engine = TripleStoreEngine(get_endpoint, post_endpoint, skip_connection_test=True)

Use these prefixes in your notebooks. You can extend this dict with your prefixes of additional ontologies that you use in this notebook. Replace 00 with your group id

In [7]:
# Namespace prefixes shared by all inserted triples and queries.
# Extend this dict with the prefixes of any additional ontology you use.
prefixes = {
    # Declared explicitly because every cell writes `rdf:type`; without it the
    # triples only resolve if the server happens to predefine the prefix.
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'prov': 'http://www.w3.org/ns/prov#',
    'sc': 'https://schema.org/',
    'cr': 'http://mlcommons.org/croissant/',
    'mls': 'http://www.w3.org/ns/mls#',
    # Trailing slash is required: without it `mlso:Term` expands to
    # `http://w3id.org/mlsoTerm` instead of `http://w3id.org/mlso/Term`.
    'mlso': 'http://w3id.org/mlso/',
    'siu': 'https://si-digital-framework.org/SI/units/',
    'siq': 'https://si-digital-framework.org/SI/quantities/',
    'qudt': 'http://qudt.org/schema/qudt/',
    # Default (empty) prefix: the group's own namespace on the course server.
    '': f'https://starvers.ec.tuwien.ac.at/BI2025/{group_id}/',
}

# SPARQL PREFIX header (one declaration per line, followed by a blank line)
# that can be prepended to hand-written queries.
prefix_header = '\n'.join([f'PREFIX {k}: <{v}>' for k, v in prefixes.items()]) + '\n\n'

Ontologies to use
* Provenance of the experiment process
    * PROV-O: 
        * doc: https://www.w3.org/TR/prov-o/
        * serialization: https://www.w3.org/ns/prov-o
* Data used and created
    * schema.org - Dataset: 
        * doc: https://schema.org/Dataset
        * serialization: https://schema.org/version/latest/schemaorg-current-https.ttl
    * Croissant
        * doc: https://docs.mlcommons.org/croissant/docs/croissant-spec.html
        * serialization: https://github.com/mlcommons/croissant/blob/main/docs/croissant.ttl
* ML experiments performed
    * MLSO: 
        * doc: https://github.com/dtai-kg/MLSO
        * doc: https://dtai-kg.github.io/MLSO/#http://w3id.org/
        * serialization: https://dtai-kg.github.io/MLSO/ontology.ttl
* Measurements, Metrics, Units
    * QUDT
        * doc: https://qudt.org/
        * doc: https://github.com/qudt/qudt-public-repo
        * serialization: https://github.com/qudt/qudt-public-repo/blob/main/src/main/rdf/schema/SCHEMA_QUDT.ttl
    * SI Digital Framework
        * doc: https://github.com/TheBIPM/SI_Digital_Framework/blob/main/SI_Reference_Point/docs/README.md
        * doc: https://si-digital-framework.org/
        * doc: https://si-digital-framework.org/SI
        * serialization: https://github.com/TheBIPM/SI_Digital_Framework/blob/main/SI_Reference_Point/TTL/si.ttl
    * Quantities and Units
        * doc: https://www.omg.org/spec/Commons
        * serialization: https://www.omg.org/spec/Commons/QuantitiesAndUnits.ttl

Use this function to record execution times.

In [8]:
def now() -> str:
    """
    Return the current UTC time as an ISO 8601 string with millisecond precision.

    Format: YYYY-MM-DDTHH:MM:SS.sssZ
    """
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    # isoformat() renders the UTC offset as "+00:00"; keep only the first 23
    # characters (date, time and truncated milliseconds) and mark UTC with "Z".
    return utc_now.isoformat(timespec="milliseconds")[:23] + "Z"

Register yourself in the Knowledge Graph using ProvO. Change the given name, family name and immatriculation number to reflect your own data.

In [9]:
# Ontologies used: foaf, prov, IAO
# Registers both students as foaf:Person / prov:Agent and declares the two roles.

# The matriculation number is the part of the student identifier after "id_".
# Deriving it here keeps the rdfs:label and the IAO_0000219 literal in sync;
# the IAO_0000219 values were previously the template placeholders
# ("01234567" / "76543210") instead of the students' own numbers.
matriculation_number_a = student_a.rsplit('_', 1)[-1]
matriculation_number_b = student_b.rsplit('_', 1)[-1]

reigstration_triples_a = [
f':{student_a} rdf:type foaf:Person .',
f':{student_a} rdf:type prov:Agent .',
f':{student_a} foaf:givenName "Muhammad" .',
f':{student_a} foaf:familyName "Ibrar" .',
f':{student_a} <http://vivoweb.org/ontology/core#identifier> :{student_a} .',
f':{student_a} rdf:type <http://purl.obolibrary.org/obo/IAO_0000578> .',
f':{student_a} <http://www.w3.org/2000/01/rdf-schema#label> "{matriculation_number_a}" .',
f':{student_a} <http://purl.obolibrary.org/obo/IAO_0000219> "{matriculation_number_a}"^^xsd:string .',
]

reigstration_triples_b = [
f':{student_b} rdf:type foaf:Person .',
f':{student_b} rdf:type prov:Agent .',
f':{student_b} foaf:givenName "Ahmad" .',
f':{student_b} foaf:familyName "Ibrahim" .',
f':{student_b} <http://vivoweb.org/ontology/core#identifier> :{student_b} .',
f':{student_b} rdf:type <http://purl.obolibrary.org/obo/IAO_0000578> .',
f':{student_b} <http://www.w3.org/2000/01/rdf-schema#label> "{matriculation_number_b}" .',
f':{student_b} <http://purl.obolibrary.org/obo/IAO_0000219> "{matriculation_number_b}"^^xsd:string .',
]

# The two roles an agent can take in an association with an activity.
role_triples = [
    f':{code_writer_role} rdf:type prov:Role .',
    f':{code_executor_role} rdf:type prov:Role .',
]


# Each insert is a separate request to the server.
engine.insert(reigstration_triples_a, prefixes=prefixes)
engine.insert(reigstration_triples_b, prefixes=prefixes)
engine.insert(role_triples, prefixes=prefixes)

HTTPError: HTTP Error 504: Gateway Time-out

**What not to do**

Do not use [blank nodes](https://www.w3.org/wiki/BlankNodes).

PROV-O uses blank nodes to connect multiple elements with each other.
Such blank nodes (such as _:association) should not be used.
Instead, assign a fixed node ID such as
:5119fcd7-b571-41e0-9464-a37c7be0f574 by generating them outside of the
notebook.
We suggest that, for each setting where such a blank node is needed to
connect multiple elements, you create a unique hash (using uuid.uuid4())
and keep this as hard-coded identifier for the blank node. The template
notebook contains examples of this. Do *not* use these provided values,
as otherwise, your provenance documentations will all be connected via
these identifiers!
Also, do not generate them dynamically in every cell execution, e.g. by
using uuid.uuid4() in a cell. This would generate many new linking nodes
for connecting the same elements.
Compute one for each node (cell) where you need them and make sure to
use the same one on each re-execution of the notebook.

In [10]:
# Directory of the mobile price dataset, relative to the notebook's working directory.
_datasets_root = os.path.join("data", "datasets")
mobile_price_data_path = os.path.join(_datasets_root, "mobile_price")


## Business Understanding 

In [11]:
## Each Activity that follows is part of the Business Understanding Phase

# Phase 1: Business Understanding. Declares the parent activity that the
# individual business-understanding activities attach to via sc:isPartOf.
_bu_phase = ':business_understanding_phase'
business_understanding_phase_executor = [
    f'{_bu_phase} rdf:type prov:Activity .',
    f'{_bu_phase} rdfs:label "Business Understanding Phase" .',
]
engine.insert(business_understanding_phase_executor, prefixes=prefixes)


HTTPError: HTTP Error 504: Gateway Time-out

In [None]:
#############################################
# Documentation
#############################################
# Free-text answers for the Business Understanding phase. Each string below is
# stored in the knowledge graph as the rdfs:comment of one prov:Entity
# (sections 1a-1f) by the insert statements further down in this cell.

# 1a: where the data comes from and the business scenario it is used in.
data_src_and_scenario_comment = """
Data Source:
The dataset is the Kaggle “Mobile Price Classification” dataset containing 2,000 mobile phones. Each phone is described by 20 technical features (battery power, RAM, internal memory, camera specs, connectivity options, screen dimensions, etc.) and one target variable price_range with four classes (0 = low, 3 = very high).

Scenario:
A new mobile company wants to price upcoming phone models competitively against major brands. Instead of relying only on expert judgment, the company wants to analyze historical specifications of phones and their corresponding price ranges to support pricing decisions.
"""

# 1b: what the business wants to achieve.
business_objectives_comment = """
1. Support pricing decisions by predicting the most suitable price range for new phone models.
2. Reduce manual effort and time required to estimate price categories.
3. Improve product positioning in budget, mid-range, high-end, and flagship segments.
4. Increase transparency on how technical features influence pricing decisions.
"""

# 1c: how the business judges whether the objectives were met.
business_success_criteria_comment = """
1. The ML system is regularly used by product and pricing teams.
2. At least a 30% reduction in time needed for initial price-range estimation.
3. Most new models (>80%) remain in the initially selected price band after launch.
4. Pricing decisions become more consistent and data-driven across phone segments.
"""

# 1d: the business objectives translated into data mining goals.
data_mining_goals_comment = """
1. Build a multi-class classifier predicting the price_range (0–3) from 20 phone features.
2. Achieve robust accuracy on unseen data and generalize well to new configurations.
3. Identify important features (e.g., RAM, pixel resolution) influencing the price range.
4. Provide probability outputs to support uncertainty-aware pricing decisions.
"""

# 1e: measurable thresholds for the data mining goals.
data_mining_success_criteria_comment = """
1. Achieve ≥90% accuracy on the validation/test set.
2. Macro F1-score ≥0.88 with no class having recall <0.80.
3. Model performance remains stable across different random splits.
4. Probabilities are reasonably calibrated for business decision use.
"""

# 1f: risks of using an AI model for this decision.
ai_risk_aspects_comment = """
1. Misclassification may lead to wrong pricing decisions, affecting revenue or sales.
2. The dataset may not reflect future devices; model drift is a risk.
3. Over-reliance on the model could cause poor decisions without expert review.
4. Although no personal data is used, systematic bias across device categories is possible.
5. Pricing logic leakage is a business security risk if the model is exposed externally.
"""


# Fixed node ID of the executor association (generated once, never regenerated,
# so re-running the cell does not create additional linking nodes).
bu_ass_uuid_executor = "79fd346c-c772-4e8c-92d8-583c5a1666ad"

# The activity itself, its parent phase, and who executed it in which role.
business_understanding_executor = [
    ':business_understanding rdf:type prov:Activity .',
    # Connect Activity to Parent Business Understanding Phase Activity
    ':business_understanding sc:isPartOf :business_understanding_phase .',
    f':business_understanding prov:qualifiedAssociation :{bu_ass_uuid_executor} .',
] + [
    f':{bu_ass_uuid_executor} {predicate} {obj} .'
    for predicate, obj in (
        ('prov:agent', f':{executed_by}'),
        ('rdf:type', 'prov:Association'),
        ('prov:hadRole', f':{code_executor_role}'),
    )
]
engine.insert(business_understanding_executor, prefixes=prefixes)


# One documentation entity per Business Understanding section (1a-1f):
# (entity local name, rdfs:label, rdfs:comment text).
_bu_sections = (
    ('bu_data_source_and_scenario', '1a Data Source and Scenario', data_src_and_scenario_comment),
    ('bu_business_objectives', '1b Business Objectives', business_objectives_comment),
    ('bu_business_success_criteria', '1c Business Success Criteria', business_success_criteria_comment),
    ('bu_data_mining_goals', '1d Data Mining Goals', data_mining_goals_comment),
    ('bu_data_mining_success_criteria', '1e Data Mining Success Criteria', data_mining_success_criteria_comment),
    ('bu_ai_risk_aspects', '1f AI risk aspects', ai_risk_aspects_comment),
)

# Every section becomes a prov:Entity generated by :business_understanding,
# carrying a label and the free text as a triple-quoted (multi-line) literal.
business_understanding_data_executor = [
    triple
    for entity, label, comment in _bu_sections
    for triple in (
        f':{entity} rdf:type prov:Entity .',
        f':{entity} prov:wasGeneratedBy :business_understanding .',
        f':{entity} rdfs:label "{label}" .',
        f':{entity} rdfs:comment """{comment}""" .',
    )
]
engine.insert(business_understanding_data_executor, prefixes=prefixes)

## Data Understanding

The following pseudo-code & pseudo-documentation may be used as a hint.

In [None]:
## Each Activity that follows is part of the Data Understanding Phase

# Phase 2: Data Understanding. Declares the parent activity that the
# data-understanding activities attach to via sc:isPartOf.
# Uses its own variable name so the list does not overwrite the
# Business Understanding phase triples defined earlier in the notebook.
data_understanding_phase_executor = [
':data_understanding_phase rdf:type prov:Activity .',
':data_understanding_phase rdfs:label "Data Understanding Phase" .',
]
engine.insert(data_understanding_phase_executor, prefixes=prefixes)


In [None]:
data_path  = os.path.join("data", "datasets", "mobile_price")
load_mobile_data_code_writer = student_a
def load_mobile_data(file_name: str = 'train.csv') -> pd.DataFrame:
    """
    Load one CSV file of the mobile price dataset into a DataFrame.

    Parameters
    ----------
    file_name : str, default 'train.csv'
        Name of the CSV file inside `data_path`. The default keeps the
        original behaviour of loading the training data.

    Returns
    -------
    pd.DataFrame
        The file contents as parsed by `pd.read_csv` with default options.
    """
    input_file = os.path.join(data_path, file_name)
    return pd.read_csv(input_file)

# Timestamps bracket the load so they can be recorded as
# prov:startedAtTime / prov:endedAtTime of the activity.
start_time_ld = now()
data = load_mobile_data()
end_time_ld = now()

display(data.head())

#############################################
# Documentation
#############################################

# Now document the raw data and the loaded data using appropriate ontologies.

# Always add these triples for every activity to define the executor!
# Fixed association node ID (generated once, never regenerated).
ld_ass_uuid_executor = "ceb5c10f-749f-486b-b3c0-19ce82a2e393"

# Link the activity to the association, then describe the association:
# which agent acted, and in which role.
load_mobile_data_executor = [
    f':load_mobile_data prov:qualifiedAssociation :{ld_ass_uuid_executor} .',
] + [
    f':{ld_ass_uuid_executor} {predicate} {obj} .'
    for predicate, obj in (
        ('prov:agent', f':{executed_by}'),
        ('rdf:type', 'prov:Association'),
        ('prov:hadRole', f':{code_executor_role}'),
    )
]
engine.insert(load_mobile_data_executor, prefixes=prefixes)

# Fixed node ID of the code-writer association (generated once, never regenerated).
ld_ass_uuid_writer = "b231c4fc-9670-4126-88aa-2b051acea6ac"
ld_report = """
Load the mobile price classification training data from train.csv and create
a pandas DataFrame with 2000 rows and 21 columns (20 features + 1 target price_range).
"""

# Subjects that recur in the triples below.
_ld_activity = ':load_mobile_data'
_ld_writer_assoc = f':{ld_ass_uuid_writer}'

load_mobile_data_activity = [
    # Activity itself: type, parent phase, description and measured run time
    f'{_ld_activity} rdf:type prov:Activity .',
    f'{_ld_activity} sc:isPartOf :data_understanding_phase .',
    f'{_ld_activity} rdfs:comment "Data Understanding" .',
    f'{_ld_activity} rdfs:comment """{ld_report}""" .',
    f'{_ld_activity} prov:startedAtTime "{start_time_ld}"^^xsd:dateTime .',
    f'{_ld_activity} prov:endedAtTime "{end_time_ld}"^^xsd:dateTime .',

    # Code writer association
    f'{_ld_activity} prov:qualifiedAssociation {_ld_writer_assoc} .',
    f'{_ld_writer_assoc} prov:agent :{load_mobile_data_code_writer} .',
    f'{_ld_writer_assoc} rdf:type prov:Association .',
    f'{_ld_writer_assoc} prov:hadRole :{code_writer_role} .',

    # INPUT of activity: the raw data and the path it was read from
    f'{_ld_activity} prov:used :raw_data .',
    f'{_ld_activity} prov:used :raw_data_path .',
    ':raw_data rdf:type prov:Entity .',
    ':raw_data_path rdf:type prov:Entity .',
    ':raw_data prov:wasDerivedFrom :raw_data_path .',

    # OUTPUT of activity: the loaded data
    ':data rdf:type prov:Entity .',
    f':data prov:wasGeneratedBy {_ld_activity} .',
    ':data prov:wasDerivedFrom :raw_data .',
]
engine.insert(load_mobile_data_activity, prefixes=prefixes)

# Further describe the raw data using Croissant

# Column catalogue of train.csv, in file order (one entry per column in df.info()):
# (column name, description, XSD datatype local name).
_raw_fields = (
    ('battery_power', 'Total energy the battery can store in one charge (mAh).', 'integer'),
    ('blue', 'Binary indicator (0/1) whether the phone has Bluetooth support.', 'integer'),
    ('clock_speed', 'Maximum clock speed of the microprocessor (GHz).', 'double'),
    ('dual_sim', 'Binary indicator (0/1) whether the phone supports dual SIM.', 'integer'),
    ('fc', 'Front camera resolution in megapixels.', 'integer'),
    ('four_g', 'Binary indicator (0/1) whether the phone supports 4G.', 'integer'),
    ('int_memory', 'Internal memory size of the phone (in GB).', 'integer'),
    ('m_dep', 'Mobile depth (thickness) in cm.', 'double'),
    ('mobile_wt', 'Weight of the mobile phone in grams.', 'integer'),
    ('n_cores', 'Number of cores of the processor (1–8).', 'integer'),
    ('pc', 'Primary camera resolution in megapixels.', 'integer'),
    ('px_height', 'Pixel resolution height of the mobile display.', 'integer'),
    ('px_width', 'Pixel resolution width of the mobile display.', 'integer'),
    ('ram', 'Random Access Memory size in MB.', 'integer'),
    ('sc_h', 'Screen height of the mobile in cm.', 'integer'),
    ('sc_w', 'Screen width of the mobile in cm.', 'integer'),
    ('talk_time', 'Longest time that a single battery charge will last during continuous calls (hours).', 'integer'),
    ('three_g', 'Binary indicator (0/1) whether the phone supports 3G.', 'integer'),
    ('touch_screen', 'Binary indicator (0/1) whether the phone has a touch screen.', 'integer'),
    ('wifi', 'Binary indicator (0/1) whether the phone supports WiFi.', 'integer'),
    # Target column
    ('price_range', 'Target: price category of the mobile (0=low, 1=medium, 2=high, 3=very high).', 'integer'),
)

raw_data_triples = [
    # Dataset level
    ':raw_data rdf:type sc:Dataset .',
    ':raw_data sc:name "Mobile Price Classification dataset (raw)" .',
    ':raw_data sc:description "Kaggle mobile phone specifications with price_range labels (0–3) for 2000 phones." .',

    # File / distribution
    ':mobile_price_csv rdf:type cr:FileObject .',
    ':mobile_price_csv sc:name "train.csv" .',
    ':mobile_price_csv sc:encodingFormat "text/csv" .',
    ':raw_data sc:distribution :mobile_price_csv .',

    # RecordSet describing the tabular structure
    ':raw_recordset rdf:type cr:RecordSet .',
    ':raw_recordset sc:name "Table of mobile phone specifications and price range" .',
    ':raw_recordset cr:source :mobile_price_csv .',
    ':raw_data cr:recordSet :raw_recordset .',
] + [
    # Five triples per column: membership in the record set, type, name,
    # description and datatype of the cr:Field individual `:field_<column>`.
    triple
    for column, description, xsd_type in _raw_fields
    for triple in (
        f':raw_recordset cr:field :field_{column} .',
        f':field_{column} rdf:type cr:Field .',
        f':field_{column} sc:name "{column}" .',
        f':field_{column} sc:description "{description}" .',
        f':field_{column} cr:dataType xsd:{xsd_type} .',
    )
]
engine.insert(raw_data_triples, prefixes=prefixes)

# Also the output of the load activity is a dataset that can be described with Croissant

# Columns of the loaded DataFrame, in file order.
_loaded_columns = (
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
    'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
    'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
    'touch_screen', 'wifi', 'price_range',
)

data_triples = [
    ':data rdf:type sc:Dataset .',
    ':data sc:name "Loaded mobile price classification data" .',
    ':data sc:description "In-memory pandas DataFrame with 2000 rows and 21 columns (20 features + 1 target price_range)." .',

    ':recordset rdf:type cr:RecordSet .',
    ':recordset sc:name "Mobile price DataFrame recordset" .',
    ':data cr:recordSet :recordset .',
] + [
    # Reuse the same Field individuals (`:field_<column>`) for the loaded data
    f':recordset cr:field :field_{column} .'
    for column in _loaded_columns
]
engine.insert(data_triples, prefixes=prefixes)

# Also add the units to some fields (example usage of QUDT/SI units)

# (unit term, fields measured in it), listed in the order the triples are emitted.
# NOTE(review): the qudt:* objects below live in the QUDT *schema* namespace and
# look like class names rather than unit individuals — confirm they are the
# intended objects of qudt:unit.
# NOTE(review): verify that siu:centiMeter, siu:gram and siu:hour exist in the
# SI Digital Framework units vocabulary.
_unit_assignments = (
    # Battery power in mAh – treated as a kind of counting/energy-related unit
    ('qudt:CountingUnit', ('battery_power',)),
    # RAM and internal memory – information capacity
    ('qudt:InformationUnit', ('ram', 'int_memory')),
    # Screen dimensions and depth – lengths in cm
    ('siu:centiMeter', ('sc_h', 'sc_w', 'm_dep')),
    # Weight in grams
    ('siu:gram', ('mobile_wt',)),
    # Talk time in hours
    ('siu:hour', ('talk_time',)),
    # Pixel resolution, camera megapixels and number of cores – treated as counts
    ('qudt:CountingUnit', ('px_height', 'px_width', 'fc', 'pc', 'n_cores')),
    # Binary flags and class label – dimensionless
    ('qudt:DimensionlessUnit', ('blue', 'dual_sim', 'four_g', 'three_g',
                                'touch_screen', 'wifi', 'price_range')),
)

units_triples = [
    f':field_{column} qudt:unit {unit} .'
    for unit, columns in _unit_assignments
    for column in columns
]
engine.insert(units_triples, prefixes=prefixes)

**Continue with other tasks of the Data Understanding phase such as checking the distribution, skewness, plausibility of values, etc...**

In [None]:
#############################################
# Data Understanding – Summary of Variables
#############################################

du_summary_code_writer = student_a

# --- Compute categorical and numerical summaries ---
start_time_du = now()

# Columns treated as categorical: the target, the core count and the 0/1 flags.
# They are cast to str so they are summarised as categories, not numbers.
_categorical_columns = ['price_range', 'n_cores', 'blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
df_categorical = data[_categorical_columns].astype(str)
# Every remaining column is treated as numerical.
df_numerical = data.drop(columns=df_categorical.columns)

# Per categorical column: number of distinct values and the values themselves.
_unique_counts = df_categorical.nunique()
_unique_values = df_categorical.apply(lambda column: column.unique())
categorical_summary = pd.DataFrame({
    'Number of Unique Values': _unique_counts,
    'Unique Values': _unique_values
})

# describe() statistics, one row per numerical column, rounded to 2 decimals.
numerical_summary = df_numerical.describe().transpose().round(2)

end_time_du = now()

display(categorical_summary)
display(numerical_summary)

#############################################
# Provenance Documentation
#############################################

# Fixed node ID of the executor association (generated once, never regenerated).
du_ass_uuid_executor = "75572bcb-4210-499f-9b34-783657b43f2c"

# Link the activity to the association, then describe the association:
# which agent acted, and in which role.
du_summary_executor = [
    f':du_summary prov:qualifiedAssociation :{du_ass_uuid_executor} .',
] + [
    f':{du_ass_uuid_executor} {predicate} {obj} .'
    for predicate, obj in (
        ('prov:agent', f':{executed_by}'),
        ('rdf:type', 'prov:Association'),
        ('prov:hadRole', f':{code_executor_role}'),
    )
]
engine.insert(du_summary_executor, prefixes=prefixes)

# Fixed node ID of the code-writer association (generated once, never regenerated).
du_ass_uuid_writer = "031f9295-0680-462e-8e37-4e096b5512f2"

du_summary_comment = """
We examined the dataset structure by separating categorical and numerical variables.
Categorical features (blue, dual_sim, three_g, four_g, etc.) show mostly binary distributions.
The target price_range is evenly distributed across its four classes (0-3), confirming a balanced dataset.
Numerical variables (battery_power, ram, px_width, px_height, etc.) show plausible ranges and no missing values.
This summary provides an initial understanding of feature types and their variability.
"""

du_summary_activity = [
    # Activity itself: type, parent phase, description and measured run time
    ':du_summary rdf:type prov:Activity .',
    ':du_summary sc:isPartOf :data_understanding_phase .',
    f':du_summary rdfs:comment """{du_summary_comment}""" .',
    f':du_summary prov:startedAtTime "{start_time_du}"^^xsd:dateTime .',
    f':du_summary prov:endedAtTime "{end_time_du}"^^xsd:dateTime .',

    # Code writer association
    f':du_summary prov:qualifiedAssociation :{du_ass_uuid_writer} .',
    f':{du_ass_uuid_writer} prov:agent :{du_summary_code_writer} .',
    f':{du_ass_uuid_writer} rdf:type prov:Association .',
    f':{du_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Input: the summaries are computed from the loaded DataFrame (:data).
    # Without this link the outputs cannot be traced back to their source.
    ':du_summary prov:used :data .',

    # Outputs: each summary table is stored as plain text in rdfs:comment
    ':categorical_summary rdf:type prov:Entity .',
    f':categorical_summary rdfs:comment """{categorical_summary.to_string()}""" .',
    ':categorical_summary prov:wasGeneratedBy :du_summary .',

    ':numerical_summary rdf:type prov:Entity .',
    f':numerical_summary rdfs:comment """{numerical_summary.to_string()}""" .',
    ':numerical_summary prov:wasGeneratedBy :du_summary .',
]
engine.insert(du_summary_activity, prefixes=prefixes)


In [None]:
pip install --upgrade nbformat

In [None]:
#############################################
# Data Understanding – Categorical Distributions
#############################################

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from matplotlib import colors

catdist_code_writer = student_a
start_time_catdist = now()

# --- Create Figure ---
# Grid of pie charts ('domain' subplots), one slot per categorical feature.
n_grid_rows, n_grid_cols = 3, 3
fig = make_subplots(rows=n_grid_rows, cols=n_grid_cols,
                    specs=[[{'type':'domain'}]*n_grid_cols]*n_grid_rows,
                    vertical_spacing=0.05, horizontal_spacing=0.01)

# Purple-to-white gradient shared by all charts; it does not depend on the
# feature, so it is built once instead of once per loop iteration.
cmap = colors.LinearSegmentedColormap.from_list("purple_contrast", ["#6A0DAD", "white"])

for i, feature in enumerate(df_categorical.columns):
    value_counts = df_categorical[feature].value_counts()
    labels = value_counts.index.tolist()
    values = value_counts.values.tolist()

    # One shade per category, spread over the gradient. The comprehension uses
    # its own index name so it is not confused with the feature index `i`.
    norm = colors.Normalize(vmin=0, vmax=len(labels))
    color_list = [colors.rgb2hex(cmap(norm(shade))) for shade in range(len(labels))]

    pie_chart = go.Pie(labels=labels, values=values,
                       marker=dict(colors=color_list, line=dict(color='white', width=2)),
                       textinfo='percent+label', title=feature)

    # Only place as many charts as the grid has slots. The former `i < 8`
    # guard would have dropped a ninth feature although a ninth slot exists.
    if i < n_grid_rows * n_grid_cols:
        fig.add_trace(pie_chart, row=i // n_grid_cols + 1, col=i % n_grid_cols + 1)

fig.update_layout(title="Distribution of Categorical Variables", height=900, width=900, showlegend=False)
fig.show()

end_time_catdist = now()

#############################################
# Provenance Documentation
#############################################

# Fixed node ID of the executor association (generated once, never regenerated).
catdist_ass_uuid_executor = "f9fb94db-c9c1-42b0-995b-6addac491af3"

# Link the activity to the association, then describe the association:
# which agent acted, and in which role.
catdist_executor = [
    f':categorical_distribution prov:qualifiedAssociation :{catdist_ass_uuid_executor} .',
] + [
    f':{catdist_ass_uuid_executor} {predicate} {obj} .'
    for predicate, obj in (
        ('prov:agent', f':{executed_by}'),
        ('rdf:type', 'prov:Association'),
        ('prov:hadRole', f':{code_executor_role}'),
    )
]
engine.insert(catdist_executor, prefixes=prefixes)

# Fixed node ID of the code-writer association (generated once, never regenerated).
catdist_ass_uuid_writer = "a2007472-d950-437f-8608-9e6ee79b8816"

catdist_comment = """
Plotted pie-chart distributions for all categorical variables.
The price_range classes are evenly distributed (500 samples each).
Binary features such as blue, dual_sim, four_g, three_g, wifi, and touch_screen
display expected near-even splits, except three_g which shows ~76% phones supporting 3G.
These visualizations help assess class balance and detect unusual category frequencies.
"""

catdist_activity = [
    # Activity itself: type, parent phase, description and measured run time
    ':categorical_distribution rdf:type prov:Activity .',
    ':categorical_distribution sc:isPartOf :data_understanding_phase .',
    f':categorical_distribution rdfs:comment """{catdist_comment}""" .',
    f':categorical_distribution prov:startedAtTime "{start_time_catdist}"^^xsd:dateTime .',
    f':categorical_distribution prov:endedAtTime "{end_time_catdist}"^^xsd:dateTime .',

    # Code writer association
    f':categorical_distribution prov:qualifiedAssociation :{catdist_ass_uuid_writer} .',
    f':{catdist_ass_uuid_writer} prov:agent :{catdist_code_writer} .',
    f':{catdist_ass_uuid_writer} rdf:type prov:Association .',
    f':{catdist_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Input: the charts are drawn from columns of the loaded DataFrame (:data).
    # Without this link the plot cannot be traced back to its source.
    ':categorical_distribution prov:used :data .',

    # Output entity
    ':categorical_distribution_output rdf:type prov:Entity .',
    ':categorical_distribution_output rdfs:comment "Plot showing categorical variable distribution." .',
    ':categorical_distribution_output prov:wasGeneratedBy :categorical_distribution .',
]
engine.insert(catdist_activity, prefixes=prefixes)


In [None]:
import seaborn as sns

In [None]:
#############################################
# Data Understanding – Numerical Distributions & Skewness
#############################################

# Agent credited as the code writer in the provenance triples inserted at the end of this cell.
numdist_code_writer = student_a
# Start of the activity; written to the graph as prov:startedAtTime.
start_time_numdist = now()

# Two-stop gradient from purple (#6A0DAD) to white; get_color() samples it so that
# every histogram gets its own shade.
# NOTE(review): `colors` is not part of the visible import cell — presumably
# `from matplotlib import colors` happens in an earlier cell; confirm.
cmap = colors.LinearSegmentedColormap.from_list("purple_contrast", ["#6A0DAD", "white"])

def get_color(index, total=20):
    """Return the hex colour found at position ``index`` of the notebook-level ``cmap``.

    ``index`` is rescaled linearly from ``[0, total]`` onto the colormap domain, so
    consecutive indices produce gradually shifting shades of the same gradient.
    """
    scale = colors.Normalize(vmin=0, vmax=total)
    rgba = cmap(scale(index))
    return colors.rgb2hex(rgba)

# Grid layout: three histograms per row and as many rows as the data needs, so the
# figure no longer assumes exactly 13 numerical columns (a fixed 5x3 grid whose last
# two cells were switched off by hand).
n_grid_cols = 3
n_numeric = len(df_numerical.columns)
n_grid_rows = max(1, -(-n_numeric // n_grid_cols))  # ceiling division, at least one row

fig, ax = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(15, 22 * n_grid_rows / 5),  # same 15x22 in canvas as before when there are 5 rows
    squeeze=False,  # keep `ax` two-dimensional even for a single row
)

# Column name -> skewness; stored later as the :numerical_skewness_report entity.
skewness_report = {}

for i, col in enumerate(df_numerical.columns):
    x, y = divmod(i, n_grid_cols)  # row / column of this histogram in the grid

    # Ten equal-width bins spanning the integer-rounded range of the column
    # (only the edges are needed, so the counts are not computed here).
    bins = np.histogram_bin_edges(
        df_numerical[col],
        range=(np.floor(df_numerical[col].min()), np.ceil(df_numerical[col].max()))
    )

    # Plot histogram with a KDE overlay
    graph = sns.histplot(
        df_numerical[col],
        bins=bins,
        kde=True,
        ax=ax[x, y],
        color=get_color(i),
        alpha=0.7,
        edgecolor='white',
        line_kws={'lw': 3, 'color': '#6A0DAD'}
    )

    # Add count labels on top of the bars
    for container in graph.containers:
        ax[x, y].bar_label(container, fmt='%d', padding=3, fontsize=8)  # fmt='%d' ensures integer counts

    ax[x, y].set_xlabel(col, fontsize=13)
    ax[x, y].set_ylabel('Count', fontsize=11)
    ax[x, y].grid(color='lightgrey')

    # Add skewness to report (plain float so the report stays JSON-serialisable)
    skewness_report[col] = float(df_numerical[col].skew())

# Hide every grid cell that did not receive a histogram
for spare_ax in ax.ravel()[n_numeric:]:
    spare_ax.axis('off')

plt.suptitle('Distribution of Numerical Variables', fontsize=20)
plt.tight_layout()
plt.subplots_adjust(top=0.95)  # leave room for the suptitle after tight_layout
plt.show()

end_time_numdist = now()

#############################################
# Provenance Documentation
#############################################

# Fixed identifier of the association node that links the activity to the executing agent.
numdist_ass_uuid_executor = "b3893fea-29a0-4efb-b0b1-455c93627c9c"

# Qualified association: :numerical_distribution was run by `executed_by` in the executor role.
numdist_executor = [
    f':numerical_distribution prov:qualifiedAssociation :{numdist_ass_uuid_executor} .',
    f':{numdist_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{numdist_ass_uuid_executor} rdf:type prov:Association .',
    f':{numdist_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(numdist_executor, prefixes=prefixes)

# Fixed identifier of the association node that links the activity to the code writer.
numdist_ass_uuid_writer = "9cfdac02-de31-4588-819b-76f4ae6a6a5b"

# Hand-written interpretation of the histograms; stored verbatim as rdfs:comment of the activity.
numdist_comment = """
Histogram analysis shows the distribution of all numerical variables.
Most features appear unimodal with realistic ranges. Some variables such as px_height
and sc_w contain many values near zero, indicating potential noise.
Skewness was computed to identify asymmetry and assess preprocessing needs.
"""

# Activity description (type, phase, comment, timestamps), the code-writer association,
# and the skewness report as the entity generated by the activity.
numdist_activity = [
    ':numerical_distribution rdf:type prov:Activity .',
    ':numerical_distribution sc:isPartOf :data_understanding_phase .',
    f':numerical_distribution rdfs:comment """{numdist_comment}""" .',
    f':numerical_distribution prov:startedAtTime "{start_time_numdist}"^^xsd:dateTime .',
    f':numerical_distribution prov:endedAtTime "{end_time_numdist}"^^xsd:dateTime .',
    f':numerical_distribution prov:qualifiedAssociation :{numdist_ass_uuid_writer} .',
    f':{numdist_ass_uuid_writer} prov:agent :{numdist_code_writer} .',
    f':{numdist_ass_uuid_writer} rdf:type prov:Association .',
    f':{numdist_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity with skewness report (serialised as indented JSON inside the literal)
    ':numerical_skewness_report rdf:type prov:Entity .',
    f':numerical_skewness_report rdfs:comment """{json.dumps(skewness_report, indent=2)}""" .',
    ':numerical_skewness_report prov:wasGeneratedBy :numerical_distribution .',
]
engine.insert(numdist_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Understanding – Outlier Detection (Z-score)
#############################################

# Agent credited as the code writer in the provenance triples inserted at the end of this cell.
outlier_code_writer = student_a

def detect_outliers(df: pd.DataFrame, threshold=3.0):
    """
    Detect outliers in all numeric columns using a z-score threshold.

    Every numeric column — any integer or float width, not only int64/float64 — is
    standardised with its own mean and sample standard deviation. A row is flagged
    when the absolute z-score is strictly greater than ``threshold``. Non-numeric,
    boolean and timedelta columns are ignored.

    Parameters
    ----------
    df : pd.DataFrame
        Data to scan; it is not modified.
    threshold : float, default 3.0
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    dict
        ``{column: {"count": int, "indices": list}}`` where ``indices`` holds the
        index labels of the flagged rows. Columns without spread (constant, all-NaN
        or single-value) are reported with a count of 0.
    """
    results = {}
    # "number" matches every numeric dtype. pandas also counts timedeltas as numbers,
    # but they cannot be cast to float below, so they are excluded explicitly.
    df_num = df.select_dtypes(include=["number"], exclude=["timedelta"])

    for col in df_num.columns:
        values = df_num[col].astype(float)
        std = values.std()

        # A zero or undefined standard deviation would make the z-score meaningless.
        if std == 0 or np.isnan(std):
            results[col] = {"count": 0, "indices": []}
            continue

        z_scores = (values - values.mean()) / std
        mask = z_scores.abs() > threshold  # NaN z-scores compare False and are never flagged
        outlier_indices = values[mask].index.tolist()

        results[col] = {
            "count": len(outlier_indices),
            "indices": outlier_indices
        }

    return results

# Run the z-score scan on the full dataset and time it for the provenance record.
start_time_out = now()
outlier_report = detect_outliers(data, threshold=3.0)
end_time_out = now()

print("Outlier Detection Report:")
print(json.dumps(outlier_report, indent=2))

#############################################
# Provenance Documentation
#############################################

# Fixed identifier of the association node that links the activity to the executing agent.
out_ass_uuid_executor = "0194402e-c08c-4d6f-bf29-3dd0fd549aff"

# Qualified association: :outlier_detection was run by `executed_by` in the executor role.
outlier_executor = [
    f':outlier_detection prov:qualifiedAssociation :{out_ass_uuid_executor} .',
    f':{out_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{out_ass_uuid_executor} rdf:type prov:Association .',
    f':{out_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(outlier_executor, prefixes=prefixes)

# Fixed identifier of the association node that links the activity to the code writer.
out_ass_uuid_writer = "f70ae005-5427-4857-8152-1f9bec395815"

# Hand-written summary of the report, stored verbatim as rdfs:comment of the activity.
# NOTE(review): the figures in this text (12 flagged `fc` rows, 0 elsewhere) are typed in
# by hand and are not read from `outlier_report` — re-check them whenever the data or
# the threshold changes.
outlier_summary_text = """
Outlier detection was performed using a z-score threshold of 3.0 on all numeric variables.
The resulting report shows that no outliers were detected for most features; only 'fc'
(front camera megapixels) has 12 observations flagged as potential outliers.
All other variables have count = 0 outliers. This suggests the dataset is generally clean,
with a small number of unusually high front camera values that can be considered in the
data preparation phase.
"""

# Activity description (type, phase, comment, timestamps), the code-writer association,
# and the full outlier report as the entity generated by the activity.
outlier_activity = [
    ':outlier_detection rdf:type prov:Activity .',
    ':outlier_detection sc:isPartOf :data_understanding_phase .',
    f':outlier_detection rdfs:comment """{outlier_summary_text}""" .',
    f':outlier_detection prov:startedAtTime "{start_time_out}"^^xsd:dateTime .',
    f':outlier_detection prov:endedAtTime "{end_time_out}"^^xsd:dateTime .',
    f':outlier_detection prov:qualifiedAssociation :{out_ass_uuid_writer} .',
    f':{out_ass_uuid_writer} prov:agent :{outlier_code_writer} .',
    f':{out_ass_uuid_writer} rdf:type prov:Association .',
    f':{out_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # JSON report as provenance entity (per-column counts and row indices)
    ':outlier_report rdf:type prov:Entity .',
    f':outlier_report rdfs:comment """{json.dumps(outlier_report, indent=2)}""" .',
    ':outlier_report prov:wasGeneratedBy :outlier_detection .',
]
engine.insert(outlier_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Understanding – Correlation Analysis
#############################################

# Agent credited as the code writer in the provenance triples inserted at the end of this cell.
corr_code_writer = student_a
start_time_corr = now()

# Pairwise correlation of all numeric columns (pandas default method)
corr_matrix = data.corr(numeric_only=True)

# Hide only the cells strictly above the diagonal (k=1), so the heatmap shows the
# lower triangle together with the diagonal.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

# White→purple colormap: the lowest values are drawn white, the highest purple.
# NOTE: this rebinds the notebook-level `cmap` that get_color() reads; the histogram
# cell re-creates its own purple→white map before calling get_color().
cmap = colors.LinearSegmentedColormap.from_list("purple_contrast_reversed", ["white", "#6A0DAD"])

plt.figure(figsize=(14,10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", annot_kws={"size": 8}, mask=mask, cmap=cmap, linewidths=0.5)
plt.title("Correlation Heatmap of Numerical Variables", fontsize=18)
plt.show()

end_time_corr = now()

#############################################
# Provenance Documentation
#############################################

# Fixed identifier of the association node that links the activity to the executing agent.
corr_ass_uuid_executor = "0a0709a9-ecaa-4ee3-94c1-2b37b96e1852"

# Qualified association: :correlation_analysis was run by `executed_by` in the executor role.
corr_executor = [
    f':correlation_analysis prov:qualifiedAssociation :{corr_ass_uuid_executor} .',
    f':{corr_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{corr_ass_uuid_executor} rdf:type prov:Association .',
    f':{corr_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(corr_executor, prefixes=prefixes)

# Fixed identifier of the association node that links the activity to the code writer.
corr_ass_uuid_writer = "e57283ec-40ef-4c65-8310-87172f233cf3"

# Hand-written interpretation of the heatmap, stored verbatim as rdfs:comment of the activity.
# NOTE(review): "Most off-diagonal entries are dark" looks stale — the heatmap is drawn with
# a white→purple colormap, where weak correlations appear light. Confirm and reword.
# NOTE(review): the statements about correlation strength are not derived from
# `corr_matrix` in code; verify them against the matrix stored below.
corr_comment = """
A correlation heatmap was computed for all numeric variables including the target price_range.
Most off-diagonal entries are dark, indicating generally weak linear correlations between
features. A few feature pairs (e.g. front vs. primary camera, screen height vs. screen width,
and pixel height vs. pixel width) show slightly higher positive correlations, but no very strong
multicollinearity is visible. Correlation between individual features and price_range appears
moderate at most. Overall, the heatmap suggests that features provide complementary
information without severe redundancy.
"""

# Activity description (type, phase, comment, timestamps), the code-writer association,
# and the correlation matrix as the entity generated by the activity.
corr_activity = [
    ':correlation_analysis rdf:type prov:Activity .',
    ':correlation_analysis sc:isPartOf :data_understanding_phase .',
    f':correlation_analysis rdfs:comment """{corr_comment}""" .',
    f':correlation_analysis prov:startedAtTime "{start_time_corr}"^^xsd:dateTime .',
    f':correlation_analysis prov:endedAtTime "{end_time_corr}"^^xsd:dateTime .',
    f':correlation_analysis prov:qualifiedAssociation :{corr_ass_uuid_writer} .',
    f':{corr_ass_uuid_writer} prov:agent :{corr_code_writer} .',
    f':{corr_ass_uuid_writer} rdf:type prov:Association .',
    f':{corr_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity: the matrix rendered as plain text via DataFrame.to_string()
    ':corr_matrix rdf:type prov:Entity .',
    f':corr_matrix rdfs:comment """{corr_matrix.to_string()}""" .',
    ':corr_matrix prov:wasGeneratedBy :correlation_analysis .',
]
engine.insert(corr_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Understanding – Data Quality Assessment
#############################################

# Agent credited as the code writer in the provenance triples inserted at the end of this cell.
dqa_code_writer = student_a
start_time_dqa = now()

# Programmatic checks
missing_values = data.isnull().sum().to_dict()   # column -> number of missing cells
duplicate_count = int(data.duplicated().sum())   # fully identical rows after their first occurrence
px_height_zero_count = int((data["px_height"] == 0).sum())
sc_w_zero_count = int((data["sc_w"] == 0).sum())

# The remarks are assembled from the measured values instead of hard-coded numbers,
# so the stored report cannot contradict its own figures when the data changes.
total_missing = int(sum(missing_values.values()))
if total_missing == 0 and duplicate_count == 0:
    completeness_remark = (
        f"The dataset is complete: all {data.shape[1]} columns have 0 missing values and there are no "
        "duplicate rows. "
    )
else:
    completeness_remark = (
        f"The dataset is not complete: {total_missing} missing values were found across "
        f"{data.shape[1]} columns, together with {duplicate_count} duplicate rows. "
    )

if px_height_zero_count or sc_w_zero_count:
    zero_value_remark = (
        f"However, px_height has {px_height_zero_count} zero values and sc_w has "
        f"{sc_w_zero_count} zero values. These zeros are "
        "unlikely for real screens and may represent noisy or atypical records that should be "
        "handled carefully in the data preparation phase."
    )
else:
    zero_value_remark = "Neither px_height nor sc_w contains zero values."

quality_report = {
    "missing_values": missing_values,
    "duplicate_rows": duplicate_count,
    "data_types": data.dtypes.astype(str).to_dict(),
    "value_range_issues": {
        "px_height_zero_count": px_height_zero_count,
        "sc_w_zero_count": sc_w_zero_count
    },
    "remarks": (
        completeness_remark
        + "Numerical data types are consistent with the feature semantics. "
        + zero_value_remark
    )
}

print(json.dumps(quality_report, indent=2))

end_time_dqa = now()

#############################################
# Provenance Documentation
#############################################

# Fixed identifier of the association node that links the activity to the executing agent.
dqa_ass_uuid_executor = "27c4dc86-7147-45a9-b17e-ade53867c2e8"

# Qualified association: :data_quality_assessment was run by `executed_by` in the executor role.
dqa_executor = [
    f':data_quality_assessment prov:qualifiedAssociation :{dqa_ass_uuid_executor} .',
    f':{dqa_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{dqa_ass_uuid_executor} rdf:type prov:Association .',
    f':{dqa_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(dqa_executor, prefixes=prefixes)

# Fixed identifier of the association node that links the activity to the code writer.
dqa_ass_uuid_writer = "5d037fa0-89ed-4ecd-b69b-975d17884658"

# Hand-written assessment, stored verbatim as rdfs:comment of the activity.
# NOTE(review): the figures in this text (21 columns, 2 and 180 zero values) are typed in
# by hand and are not read from `quality_report` — re-check them whenever the data changes.
dqa_comment = """
Data quality checks show that the dataset is internally very clean: there are no missing
values in any of the 21 columns and no duplicate rows. All features have numeric dtypes
consistent with their intended semantics (integers or floats).

A potential issue is that px_height contains 2 zero values and sc_w contains 180 zero values.
Zero screen height or width is implausible for real mobile devices and may indicate noisy,
special, or incorrectly recorded cases. These findings should be considered in the data
preparation phase (e.g., deciding whether to filter, impute, or keep these records).
"""

# Activity description (type, phase, comment, timestamps), the code-writer association,
# and the quality report as the entity generated by the activity.
dqa_activity = [
    ':data_quality_assessment rdf:type prov:Activity .',
    ':data_quality_assessment sc:isPartOf :data_understanding_phase .',
    f':data_quality_assessment rdfs:comment """{dqa_comment}""" .',
    f':data_quality_assessment prov:startedAtTime "{start_time_dqa}"^^xsd:dateTime .',
    f':data_quality_assessment prov:endedAtTime "{end_time_dqa}"^^xsd:dateTime .',
    f':data_quality_assessment prov:qualifiedAssociation :{dqa_ass_uuid_writer} .',
    f':{dqa_ass_uuid_writer} prov:agent :{dqa_code_writer} .',
    f':{dqa_ass_uuid_writer} rdf:type prov:Association .',
    f':{dqa_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity: the quality report serialised as indented JSON
    ':data_quality_report rdf:type prov:Entity .',
    f':data_quality_report rdfs:comment """{json.dumps(quality_report, indent=2)}""" .',
    ':data_quality_report prov:wasGeneratedBy :data_quality_assessment .',
]
engine.insert(dqa_activity, prefixes=prefixes)



In [None]:
#############################################
# Data Understanding – 2e Ethical / Bias Aspects
#############################################

# Agent credited as the code writer of this (text-only) activity.
du_bias_code_writer = student_a
# No computation happens in this cell, so start and end are taken back to back.
start_time_du_bias = now()
end_time_du_bias = now()

du_bias_ass_uuid_executor = "a8c4c0ba-0e4a-4b05-9bf7-0c7d7b3e1a10"

# Activity node, its timestamps, and the association crediting whoever executed the cell.
du_bias_executor = [
    ':du_bias_attributes rdf:type prov:Activity .',
    ':du_bias_attributes sc:isPartOf :data_understanding_phase .',
    f':du_bias_attributes prov:qualifiedAssociation :{du_bias_ass_uuid_executor} .',
    f':{du_bias_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{du_bias_ass_uuid_executor} rdf:type prov:Association .',
    f':{du_bias_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
    f':du_bias_attributes prov:startedAtTime "{start_time_du_bias}"^^xsd:dateTime .',
    f':du_bias_attributes prov:endedAtTime "{end_time_du_bias}"^^xsd:dateTime .',
]
engine.insert(du_bias_executor, prefixes=prefixes)

du_bias_ass_uuid_writer = "039cc929-8117-44a3-abcc-f498b4d8d832"

# Hand-written assessment, stored verbatim as rdfs:comment of the summary entity.
du_bias_comment = """
The dataset contains no personal or sensitive attributes—only technical phone specifications.
The target price_range is perfectly balanced (500 samples per class).
Some device types are less frequent (e.g., rare feature combinations), but this affects model
performance rather than human fairness. No ethical or demographic bias risks are present.
"""

du_bias_activity = [
    # Code-writer association. The writer UUID and agent were defined above but never
    # inserted, so the graph did not record who wrote this section.
    f':du_bias_attributes prov:qualifiedAssociation :{du_bias_ass_uuid_writer} .',
    f':{du_bias_ass_uuid_writer} prov:agent :{du_bias_code_writer} .',
    f':{du_bias_ass_uuid_writer} rdf:type prov:Association .',
    f':{du_bias_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity carrying the written assessment
    ':du_bias_attributes_summary rdf:type prov:Entity .',
    ':du_bias_attributes_summary prov:wasGeneratedBy :du_bias_attributes .',
    ':du_bias_attributes_summary rdfs:label "2e Ethical / Bias Aspects" .',
    f':du_bias_attributes_summary rdfs:comment """{du_bias_comment}""" .',
]
engine.insert(du_bias_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Understanding – 2f Risks & Expert Questions
#############################################

# Agent credited as the code writer of this (text-only) activity.
du_risk_code_writer = student_a
# No computation happens in this cell, so start and end are taken back to back.
start_time_du_risk = now()
end_time_du_risk = now()

du_risk_ass_uuid_executor = "1907fabb-d64e-4b65-9a51-b0232c663d03"

# Activity node, its timestamps, and the association crediting whoever executed the cell.
du_risk_executor = [
    ':du_risk_analysis rdf:type prov:Activity .',
    ':du_risk_analysis sc:isPartOf :data_understanding_phase .',
    f':du_risk_analysis prov:qualifiedAssociation :{du_risk_ass_uuid_executor} .',
    f':{du_risk_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{du_risk_ass_uuid_executor} rdf:type prov:Association .',
    f':{du_risk_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
    f':du_risk_analysis prov:startedAtTime "{start_time_du_risk}"^^xsd:dateTime .',
    f':du_risk_analysis prov:endedAtTime "{end_time_du_risk}"^^xsd:dateTime .',
]
engine.insert(du_risk_executor, prefixes=prefixes)

du_risk_ass_uuid_writer = "31229430-b424-43e5-8a2c-39699767bebf"

# Hand-written risk list and expert questions, stored verbatim as rdfs:comment of the summary entity.
du_risk_comment = """
Potential risks include limited representativeness of devices in the dataset, since some 
feature combinations (e.g., very high camera MP or zero-sized screen dimensions) may not 
reflect real market distribution. Such records may introduce noise or skew model behavior. 
There is also a risk that the dataset does not capture newer technologies, causing future 
model drift.

Questions for an external domain expert include:
• Are zero values for px_height or sc_w technically valid or measurement artifacts?
• Are unusually high fc values real device specifications or outliers?
• Does the dataset represent a realistic mix of budget, mid-range, and high-end devices?
• Are any important device features missing that strongly influence real-world pricing?
"""

du_risk_activity = [
    # Code-writer association. The writer UUID and agent were defined above but never
    # inserted, so the graph did not record who wrote this section.
    f':du_risk_analysis prov:qualifiedAssociation :{du_risk_ass_uuid_writer} .',
    f':{du_risk_ass_uuid_writer} prov:agent :{du_risk_code_writer} .',
    f':{du_risk_ass_uuid_writer} rdf:type prov:Association .',
    f':{du_risk_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity carrying the written analysis
    ':du_risk_analysis_summary rdf:type prov:Entity .',
    ':du_risk_analysis_summary prov:wasGeneratedBy :du_risk_analysis .',
    ':du_risk_analysis_summary rdfs:label "2f Risks & Expert Questions" .',
    f':du_risk_analysis_summary rdfs:comment """{du_risk_comment}""" .',
]
engine.insert(du_risk_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Understanding – 2g Actions Required for Data Preparation
#############################################

# Agent credited as the code writer of this (text-only) activity.
du_actions_code_writer = student_a
# No computation happens in this cell, so start and end are taken back to back.
start_time_du_actions = now()
end_time_du_actions = now()

du_actions_ass_uuid_executor = "be62c375-e85e-4104-9863-6a700774143b"

# Activity node, its timestamps, and the association crediting whoever executed the cell.
du_actions_executor = [
    ':du_preparation_actions rdf:type prov:Activity .',
    ':du_preparation_actions sc:isPartOf :data_understanding_phase .',
    f':du_preparation_actions prov:qualifiedAssociation :{du_actions_ass_uuid_executor} .',
    f':{du_actions_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{du_actions_ass_uuid_executor} rdf:type prov:Association .',
    f':{du_actions_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
    f':du_preparation_actions prov:startedAtTime "{start_time_du_actions}"^^xsd:dateTime .',
    f':du_preparation_actions prov:endedAtTime "{end_time_du_actions}"^^xsd:dateTime .',
]
engine.insert(du_actions_executor, prefixes=prefixes)

du_actions_ass_uuid_writer = "b7b77ece-d7e1-4be6-aeae-590c26c65d3a"

# Hand-written action list, stored verbatim as rdfs:comment of the summary entity.
du_actions_comment = """
Based on the data understanding analysis, the following actions are recommended for the
data preparation phase:

- Handle zero values in px_height (2 cases) and sc_w (180 cases), as such values are 
  unlikely for real devices; consider filtering or imputing them.

- Evaluate whether to treat the 12 outliers in fc (front camera MP) as noise or keep them,
  depending on domain expert clarification.

- Standardize or scale numerical features (e.g., RAM, battery_power, pixel resolution),
  since they vary across different ranges and may affect model performance.

- Convert categorical binary features (e.g., blue, dual_sim, four_g, three_g, wifi) to 
  consistent types if needed, although no encoding is required because they are already numeric.

- Review potential skewness in some numerical variables and apply transformations if needed 
  for algorithms sensitive to non-normality.

- Ensure proper train–validation–test splitting to maintain the balanced distribution 
  of price_range classes.
"""

du_actions_activity = [
    # Code-writer association. The writer UUID and agent were defined above but never
    # inserted, so the graph did not record who wrote this section.
    f':du_preparation_actions prov:qualifiedAssociation :{du_actions_ass_uuid_writer} .',
    f':{du_actions_ass_uuid_writer} prov:agent :{du_actions_code_writer} .',
    f':{du_actions_ass_uuid_writer} rdf:type prov:Association .',
    f':{du_actions_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity carrying the written action list
    ':du_preparation_actions_summary rdf:type prov:Entity .',
    ':du_preparation_actions_summary prov:wasGeneratedBy :du_preparation_actions .',
    ':du_preparation_actions_summary rdfs:label "2g Actions Required for Data Preparation" .',
    f':du_preparation_actions_summary rdfs:comment """{du_actions_comment}""" .',
]
engine.insert(du_actions_activity, prefixes=prefixes)


## Data Preparation

In [None]:
# Root activity of the Data Preparation phase; the activities that follow are part of it.
data_preparation_phase_executor = [
    ':data_preparation_phase rdf:type prov:Activity .',
    ':data_preparation_phase rdfs:label "Data Preparation Phase" .',
]
engine.insert(data_preparation_phase_executor, prefixes=prefixes)

**Continue with other tasks of the Data Preparation phase, such as binning, scaling, etc.**

In [None]:
#############################################
# Data Preparation – 3a Preprocessing Actions
# Duplicate check, missing value check, noise tagging
#############################################

# Agent credited as the code writer in the provenance triples inserted at the end of this cell.
prep_basic_code_writer = student_b

def preprocess_basic(df: pd.DataFrame, sc_w_threshold=2, px_height_threshold=5):
    """
    Perform initial preprocessing checks based on Data Understanding.

    Nothing is removed or modified; the function only reports what it finds:
    - number of duplicate rows,
    - number of missing values per column,
    - rows whose ``sc_w`` / ``px_height`` fall below the given noise thresholds.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to inspect; must contain the columns ``sc_w`` and ``px_height``.
    sc_w_threshold : int or float, default 2
        Rows with ``sc_w`` strictly below this value are tagged as noise-like.
    px_height_threshold : int or float, default 5
        Rows with ``px_height`` strictly below this value are tagged as noise-like.

    Returns
    -------
    dict
        Keys ``duplicate_count``, ``missing_values``, ``noise_sc_w_count``,
        ``noise_px_height_count``, ``noise_sc_w_indices`` and
        ``noise_px_height_indices`` (index labels of the tagged rows).

    Raises
    ------
    KeyError
        If ``sc_w`` or ``px_height`` is missing from ``df``.
    """
    result = {}

    # Duplicate detection (rows identical to an earlier row)
    result["duplicate_count"] = int(df.duplicated().sum())

    # Missing values per column
    result["missing_values"] = df.isnull().sum().to_dict()

    # Noise detection: tag only, the rows stay in the dataset
    noise_sc_w = df.index[df["sc_w"] < sc_w_threshold].tolist()
    noise_px_height = df.index[df["px_height"] < px_height_threshold].tolist()

    result["noise_sc_w_count"] = len(noise_sc_w)
    result["noise_px_height_count"] = len(noise_px_height)
    result["noise_sc_w_indices"] = noise_sc_w
    result["noise_px_height_indices"] = noise_px_height

    return result


# Run the checks on the full dataset and time them for the provenance record.
start_time_prep_basic = now()
prep_basic_report = preprocess_basic(data)
end_time_prep_basic = now()

print(json.dumps(prep_basic_report, indent=2))

#############################################
# Provenance Documentation
#############################################

# Fixed identifier of the association node that links the activity to the executing agent.
prep_basic_ass_uuid_executor = "d6e9783f-f036-41c8-8076-400664a30721"

# Qualified association: :prep_basic was run by `executed_by` in the executor role.
prep_basic_executor = [
    f':prep_basic prov:qualifiedAssociation :{prep_basic_ass_uuid_executor} .',
    f':{prep_basic_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{prep_basic_ass_uuid_executor} rdf:type prov:Association .',
    f':{prep_basic_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(prep_basic_executor, prefixes=prefixes)

# Fixed identifier of the association node that links the activity to the code writer.
prep_basic_ass_uuid_writer = "16e9ce73-e036-4e57-a6be-11d35b0cc868"

# Hand-written summary, stored verbatim as rdfs:comment of the activity.
# NOTE(review): the figures in this text (0 duplicates, 21 attributes, 390 and 9 rows) are
# typed in by hand and are not read from `prep_basic_report` — re-check them whenever
# the data or the thresholds change.
prep_basic_comment = """
Initial preprocessing actions based on the Data Understanding phase:
• duplicate_count = 0 → no duplicate rows in the dataset.
• All 21 attributes have 0 missing values.
• Noise-like values: sc_w < 2 occurs in 390 rows; px_height < 5 occurs in 9 rows.
  These cases were identified and recorded but not removed at this stage, because
  their interpretation is unclear. We decided to keep them as some models such as SVM are relatively
  robust to such noise. Final handling can be decided later based on model behavior.
"""

# Activity description (type, phase, comment, timestamps), the code-writer association,
# and the preprocessing report as the entity generated by the activity.
prep_basic_activity = [
    ':prep_basic rdf:type prov:Activity .',
    ':prep_basic sc:isPartOf :data_preparation_phase .',
    f':prep_basic rdfs:comment """{prep_basic_comment}""" .',
    f':prep_basic prov:startedAtTime "{start_time_prep_basic}"^^xsd:dateTime .',
    f':prep_basic prov:endedAtTime "{end_time_prep_basic}"^^xsd:dateTime .',
    f':prep_basic prov:qualifiedAssociation :{prep_basic_ass_uuid_writer} .',
    f':{prep_basic_ass_uuid_writer} prov:agent :{prep_basic_code_writer} .',
    f':{prep_basic_ass_uuid_writer} rdf:type prov:Association .',
    f':{prep_basic_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity: the report serialised as indented JSON (includes all tagged row indices)
    ':prep_basic_report rdf:type prov:Entity .',
    f':prep_basic_report rdfs:comment """{json.dumps(prep_basic_report, indent=2)}""" .',
    ':prep_basic_report prov:wasGeneratedBy :prep_basic .',
]
engine.insert(prep_basic_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Preparation – 3b Steps Considered but Not Applied
#############################################

# Agent credited as the code writer of this (text-only) activity.
prep_considered_code_writer = student_b
# No computation happens in this cell, so start and end are taken back to back.
start_time_prep_considered = now()
end_time_prep_considered = now()

prep_considered_ass_uuid_executor = "5edaccd8-5083-4757-ad7a-f79c0ae532af"

# Activity node, its timestamps, and the association crediting whoever executed the cell.
prep_considered_executor = [
    ':prep_considered rdf:type prov:Activity .',
    ':prep_considered sc:isPartOf :data_preparation_phase .',
    f':prep_considered prov:qualifiedAssociation :{prep_considered_ass_uuid_executor} .',
    f':{prep_considered_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{prep_considered_ass_uuid_executor} rdf:type prov:Association .',
    f':{prep_considered_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
    f':prep_considered prov:startedAtTime "{start_time_prep_considered}"^^xsd:dateTime .',
    f':prep_considered prov:endedAtTime "{end_time_prep_considered}"^^xsd:dateTime .',
]
engine.insert(prep_considered_executor, prefixes=prefixes)

prep_considered_ass_uuid_writer = "740fd7bf-4a32-4b46-8f9b-f05a17bb81a7"

# Hand-written rationale, stored verbatim as rdfs:comment of the summary entity.
prep_considered_comment = """
Preprocessing steps that were considered during the Data Preparation phase but not applied:

• Outlier removal:
  Outliers in the front-camera field (fc) were identified earlier. Since their domain
  validity is uncertain and they may represent legitimate device variations, they were
  not removed at this stage. Their impact will be assessed during modeling.

• Noise cleaning:
  Values identified as noise-like (sc_w < 2, px_height < 5) were retained for now.
  These may correspond to early or atypical devices. Noise handling will be reconsidered
  if models show sensitivity to these records.

• Feature removal based on low correlation:
  Pearson correlation was already evaluated in the Data Understanding phase. Although
  some features showed weak linear correlation with the target, correlation alone is not
  a reliable criterion for removal, especially when non-linear relationships may exist.
  Therefore, no features were dropped solely based on correlation, and model-based
  feature importance techniques will be used instead.

• Scaling and normalization:
  Scaling was considered but postponed. Only algorithms sensitive to feature magnitudes
  (e.g., SVM) will require scaling, and it will be applied at the model-specific stage (if required).

• Additional encoding:
  Binary technical attributes (blue, dual_sim, three_g, four_g, touch_screen, wifi) 
  are already coded as 0/1. We considered one-hot encoding, but since these are 
  simple binary flags, no further encoding was applied.
"""

prep_considered_activity = [
    # Code-writer association. The writer UUID and agent were defined above but never
    # inserted, so the graph did not record who wrote this section.
    f':prep_considered prov:qualifiedAssociation :{prep_considered_ass_uuid_writer} .',
    f':{prep_considered_ass_uuid_writer} prov:agent :{prep_considered_code_writer} .',
    f':{prep_considered_ass_uuid_writer} rdf:type prov:Association .',
    f':{prep_considered_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity carrying the written rationale
    ':prep_considered_summary rdf:type prov:Entity .',
    ':prep_considered_summary prov:wasGeneratedBy :prep_considered .',
    ':prep_considered_summary rdfs:label "3b Preprocessing Steps Considered but Not Applied" .',
    f':prep_considered_summary rdfs:comment """{prep_considered_comment}""" .',
]
engine.insert(prep_considered_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Preparation – 3c Derived Attributes (Options & Potential)
#############################################

# Agent credited as the code writer of this (text-only) activity.
prep_derived_code_writer = student_b
# No computation happens in this cell, so start and end are taken back to back.
start_time_prep_derived = now()
end_time_prep_derived = now()

prep_derived_ass_uuid_executor = "88847c41-7e7b-4354-a4e5-40950944c7b0"

# Activity node, its timestamps, and the association crediting whoever executed the cell.
prep_derived_executor = [
    ':prep_derived_attributes rdf:type prov:Activity .',
    ':prep_derived_attributes sc:isPartOf :data_preparation_phase .',
    f':prep_derived_attributes prov:qualifiedAssociation :{prep_derived_ass_uuid_executor} .',
    f':{prep_derived_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{prep_derived_ass_uuid_executor} rdf:type prov:Association .',
    f':{prep_derived_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
    f':prep_derived_attributes prov:startedAtTime "{start_time_prep_derived}"^^xsd:dateTime .',
    f':prep_derived_attributes prov:endedAtTime "{end_time_prep_derived}"^^xsd:dateTime .',
]
engine.insert(prep_derived_executor, prefixes=prefixes)

prep_derived_ass_uuid_writer = "59fb3895-fa42-4b28-bc84-f09abc0cc18f"

# Hand-written list of candidate features, stored verbatim as rdfs:comment of the summary entity.
prep_derived_comment = """
Data Preparation 3c - Options and potential for derived attributes:

Several derived attributes could potentially improve model performance or interpretability:

• Screen-related features:
  - pixel_area = px_height * px_width (proxy for screen resolution / sharpness)
  - screen_area = sc_h * sc_w (approximate physical display size)
  - pixel_density_ratio = pixel_area / screen_area (if sc_w and sc_h are reliable)

• Performance / capacity ratios:
  - ram_per_internal_memory = ram / int_memory (relative memory configuration)
  - battery_per_weight = battery_power / mobile_wt (capacity relative to device weight)
  - camera_total_mp = fc + pc (overall camera capability)

• Connectivity and feature counts:
  - connectivity_score = blue + three_g + four_g + wifi (simple feature count)
  - feature_richness = connectivity_score + touch_screen + dual_sim

These attributes could better capture interactions between existing variables and might help
models distinguish between devices within the same price_range. For now, they are documented
as options; actual creation will be decided based on model needs and complexity trade-offs.
"""

prep_derived_activity = [
    # Code-writer association. The writer UUID and agent were defined above but never
    # inserted, so the graph did not record who wrote this section.
    f':prep_derived_attributes prov:qualifiedAssociation :{prep_derived_ass_uuid_writer} .',
    f':{prep_derived_ass_uuid_writer} prov:agent :{prep_derived_code_writer} .',
    f':{prep_derived_ass_uuid_writer} rdf:type prov:Association .',
    f':{prep_derived_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity carrying the written options
    ':prep_derived_attributes_summary rdf:type prov:Entity .',
    ':prep_derived_attributes_summary prov:wasGeneratedBy :prep_derived_attributes .',
    ':prep_derived_attributes_summary rdfs:label "3c Derived Attribute Options" .',
    f':prep_derived_attributes_summary rdfs:comment """{prep_derived_comment}""" .',
]
engine.insert(prep_derived_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Preparation – 3d External Data & Additional Attributes
#############################################

# Agent credited as the code writer of this (text-only) activity.
prep_external_code_writer = student_b
# No computation happens in this cell, so start and end are taken back to back.
start_time_prep_external = now()
end_time_prep_external = now()

prep_external_ass_uuid_executor = "3c1838a8-f928-444d-870c-520536819ae2"

# Activity node, its timestamps, and the association crediting whoever executed the cell.
prep_external_executor = [
    ':prep_external_data rdf:type prov:Activity .',
    ':prep_external_data sc:isPartOf :data_preparation_phase .',
    f':prep_external_data prov:qualifiedAssociation :{prep_external_ass_uuid_executor} .',
    f':{prep_external_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{prep_external_ass_uuid_executor} rdf:type prov:Association .',
    f':{prep_external_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
    f':prep_external_data prov:startedAtTime "{start_time_prep_external}"^^xsd:dateTime .',
    f':prep_external_data prov:endedAtTime "{end_time_prep_external}"^^xsd:dateTime .',
]
engine.insert(prep_external_executor, prefixes=prefixes)

prep_external_ass_uuid_writer = "648e4f19-635c-4b92-bbb4-b356285a91d3"

# Hand-written list of possible external sources, stored verbatim as rdfs:comment of the summary entity.
prep_external_comment = """
Data Preparation 3d - Options for additional external data sources and attributes:

The current dataset contains only technical specifications and an abstract price_range label.
Several external data sources could improve the alignment with the business goal of realistic
pricing and market positioning:

• Real retail prices:
  Link each device (or representative configurations) to historical market prices from
  online shops or price comparison portals. This would allow:
  - Training a regression model for actual price
  - Validating whether the price_range labels reflect realistic price bands

• Brand and model metadata:
  Add manufacturer brand, model family, and release year from public product catalogs.
  These attributes could capture brand effects and generation effects (newer vs. older
  technology) that strongly influence perceived value.

• Market segment and region:
  If available, add information about target market segment (e.g., budget, mid-range,
  flagship) or region (e.g., EU, US, Asia). This would allow more fine-grained analysis
  of price expectations across markets.

• User / expert ratings:
  External quality ratings (camera score, battery score, display rating) aggregated from
  review sites could help connect technical specs to perceived quality and justify
  differences within the same price_range.

These sources are hypothetical and not integrated in this project, but documenting them
clarifies how the dataset could be extended to support richer pricing and marketing analysis.
"""

prep_external_activity = [
    # Code-writer association. The writer UUID and agent were defined above but never
    # inserted, so the graph did not record who wrote this section.
    f':prep_external_data prov:qualifiedAssociation :{prep_external_ass_uuid_writer} .',
    f':{prep_external_ass_uuid_writer} prov:agent :{prep_external_code_writer} .',
    f':{prep_external_ass_uuid_writer} rdf:type prov:Association .',
    f':{prep_external_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Output entity carrying the written options
    ':prep_external_data_summary rdf:type prov:Entity .',
    ':prep_external_data_summary prov:wasGeneratedBy :prep_external_data .',
    ':prep_external_data_summary rdfs:label "3d External Data & Additional Attributes" .',
    f':prep_external_data_summary rdfs:comment """{prep_external_comment}""" .',
]
engine.insert(prep_external_activity, prefixes=prefixes)


In [None]:
#############################################
# Data Preparation – Summary of Preprocessing Decisions
#############################################

prep_summary_code_writer = student_b
# Start and end are taken back to back: this step only records documentation.
start_time_prep_summary = now()
end_time_prep_summary = now()

prep_summary_ass_uuid_executor = "a89dc405-1428-4840-b75f-c25d9f0ceacb"

# 1) The activity itself, attached to the data preparation phase.
prep_summary_executor = [
    ':prep_summary rdf:type prov:Activity .',
    ':prep_summary sc:isPartOf :data_preparation_phase .',
]
# 2) Qualified association: who executed the cell and in which role.
prep_summary_executor += [
    f':prep_summary prov:qualifiedAssociation :{prep_summary_ass_uuid_executor} .',
    f':{prep_summary_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{prep_summary_ass_uuid_executor} rdf:type prov:Association .',
    f':{prep_summary_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
# 3) Execution timestamps.
prep_summary_executor += [
    f':prep_summary prov:startedAtTime "{start_time_prep_summary}"^^xsd:dateTime .',
    f':prep_summary prov:endedAtTime "{end_time_prep_summary}"^^xsd:dateTime .',
]
engine.insert(prep_summary_executor, prefixes=prefixes)

prep_summary_ass_uuid_writer = "bf36fde6-d754-4ae6-9d0e-5a272074f34a"

# Narrative summary of the preprocessing decisions; stored verbatim as rdfs:comment below.
prep_summary_comment = """
Summary of Data Preparation Decisions:

Based on the Data Understanding phase, additional preprocessing steps such as global scaling,
binning, categorical encoding, and outlier removal were considered but intentionally not applied.

• Scaling:
  Scaling is not applied globally because only certain models (e.g., SVM) require it. Scaling
  will be performed inside model-specific pipelines rather than altering the original dataset.

• Binning:
  No binning was applied because numerical attributes already show meaningful continuous ranges
  and the target price_range classes are balanced. There is no business or modeling justification
  for discretization.

• Outlier Removal:
  Although outliers were detected in the fc feature, they were retained. Their domain validity is
  uncertain, and premature removal could discard meaningful variation. These will be revisited
  only if models show sensitivity.

• Noise Values:
  Noise-like values in sc_w and px_height were identified but not cleaned, as these may represent
  early-generation devices. Models planned for later (e.g., SVM, Decision Tree, RF) are robust to
  such noise.

• Encoding:
  No encoding was necessary because all categorical features are already numeric (0/1 or ordinal).

• Feature Removal:
  Although Pearson correlation was analyzed in the Data Understanding phase, features were not
  removed solely based on low linear correlation. Non-linear relationships will be captured by
  model-based feature importance.

Conclusion:
Only essential checks (duplicates, missing values, noise tagging) were performed. All other
transformations were deferred to the modeling phase or deemed unnecessary given the dataset’s
clean and structured nature.
"""

# Entity carrying the summary text, generated by the :prep_summary activity.
prep_summary_activity = [
    f':prep_summary_entity {predicate_object} .'
    for predicate_object in (
        'rdf:type prov:Entity',
        'rdfs:label "Data Preparation Summary"',
        f'rdfs:comment """{prep_summary_comment}"""',
        'prov:wasGeneratedBy :prep_summary',
    )
]
engine.insert(prep_summary_activity, prefixes=prefixes)


In [None]:
# Your final transformed dataset should also be documented appropriately using Croissant, SI, etc.

#############################################
# Documentation – Final Prepared Dataset
#############################################

# The prepared dataset is the same as the original loaded dataset, because no modifying
# preprocessing (e.g., removal, imputation) was applied; it is therefore only typed and
# linked back to :data.
# You may add fields, recordsets, etc., if needed:
# ':prepared_recordset rdf:type cr:RecordSet .',
# ':prepared_data cr:recordSet :prepared_recordset .',
# ':prepared_recordset cr:field :field_x .',
prepared_data_triples = [
    f':prepared_data {predicate_object} .'
    for predicate_object in (
        'rdf:type prov:Entity',
        'prov:wasDerivedFrom :data',
        'rdf:type sc:Dataset',
    )
]
engine.insert(prepared_data_triples, prefixes=prefixes)

## Modeling

In [None]:
## Each Activity that follows is part of the Modeling Phase

# Declare the phase node that the modeling activities attach to via sc:isPartOf.
# The label must sit on :modeling_phase itself (the same subject as the type triple);
# it was previously attached to an unrelated :modeling node, leaving the phase unlabeled.
modeling_phase_executor = [
    ':modeling_phase rdf:type prov:Activity .',
    ':modeling_phase rdfs:label "Modeling Phase" .',
]
engine.insert(modeling_phase_executor, prefixes=prefixes)


In [None]:
model_data_code_writer = student_a

#############################################
# Documentation 4a
#############################################

dma_ass_uuid_writer = "561e678f-c355-431d-a956-74d642cdd8c4"
dma_comment = """
...
"""

identify_data_mining_algorithm_activity = (
    # The activity and the association crediting its code writer
    [
        ':define_algorithm rdf:type prov:Activity .',
        ':define_algorithm sc:isPartOf :modeling_phase .',
        f':define_algorithm rdfs:comment """{dma_comment}""" .',
        f':define_algorithm prov:qualifiedAssociation :{dma_ass_uuid_writer} .',
        f':{dma_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
        f':{dma_ass_uuid_writer} rdf:type prov:Association .',
        f':{dma_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    ]
    # example algorithm definition
    + [
        ':random_forest_algorithm rdf:type mls:Algorithm .',
        ':random_forest_algorithm rdfs:label "Random Forest Algorithm" .',
    ]
    # example implementation
    + [
        f':random_forrest_classifier_implementation {predicate_object} .'
        for predicate_object in (
            'rdf:type mls:Implementation',
            'rdfs:label "Scikit-learn RandomForestClassifier"',
            'mls:implements :random_forest_algorithm',
            'prov:wasGeneratedBy :define_algorithm',
        )
    ]
    # you can also define your Evaluation Measures here
    # example evaluation
    + [
        f':r2_score_measure {predicate_object} .'
        for predicate_object in (
            'rdf:type mls:EvaluationMeasure',
            'rdfs:label "R-squared Score"',
            'rdfs:comment "xxx"',
            'prov:wasGeneratedBy :define_algorithm',
        )
    ]
)
engine.insert(identify_data_mining_algorithm_activity, prefixes=prefixes)

In [None]:
#############################################
# Documentation 4b
#############################################

hp_ass_uuid_writer = "7e5b5cb5-3755-4a7d-8409-611b11da0bdf"
hp_comment = """
...
"""
identify_hp_activity = (
    # The activity and the association crediting its code writer
    [
        ':identify_hyperparameters rdf:type prov:Activity .',
        ':identify_hyperparameters sc:isPartOf :modeling_phase .',
        f':identify_hyperparameters rdfs:comment """{hp_comment}""" .',
        f':identify_hyperparameters prov:qualifiedAssociation :{hp_ass_uuid_writer} .',
        f':{hp_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
        f':{hp_ass_uuid_writer} rdf:type prov:Association .',
        f':{hp_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    ]
    # example parameter, linked to the implementation declared in step 4a
    + [
        ':hp_learning_rate rdf:type mls:HyperParameter .',
        ':hp_learning_rate rdfs:label "Learning Rate" .',
        ':hp_learning_rate rdfs:comment "..." .',
        ':random_forrest_classifier_implementation mls:hasHyperParameter :hp_learning_rate .',
        ':hp_learning_rate prov:wasGeneratedBy :identify_hyperparameters .',
    ]
    # continue with your identified hyperparameters
)
engine.insert(identify_hp_activity, prefixes=prefixes)

In [None]:
def split_data(df: pd.DataFrame):
    """Placeholder for the train/validation/test split.

    The body is not implemented yet: ``df`` is ignored and three literal
    placeholder names are returned in the order train, validation, test.
    """
    # TODO: replace the placeholders with the actual split of `df`
    placeholder_splits = ('train_set', 'validation_set', 'test_set')
    return placeholder_splits

#############################################
# Documentation 4c
#############################################

### Define Train/Validation/Test splits
split_ass_uuid_writer = "2de4d933-67e9-4f29-8db5-500c3e27c782"
split_comment = """
...
"""
## Use your prepared dataset
input_dataset = ":prepared_data" 

define_split_activity = [
    ':define_data_split rdf:type prov:Activity .',
    ':define_data_split sc:isPartOf :modeling_phase .',
    # The short title is a label, not a second comment; rdfs:comment is reserved for
    # the free-text description that follows (same convention as :evaluate_final_model).
    ':define_data_split rdfs:label "Train/Validation/Test Split Definition" .',
    f':define_data_split rdfs:comment """{split_comment}""" .',
    f':define_data_split prov:qualifiedAssociation :{split_ass_uuid_writer} .',
    f':{split_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
    f':{split_ass_uuid_writer} rdf:type prov:Association .',
    f':{split_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    f':define_data_split prov:used {input_dataset} .',
    
    # Training Set
    ':training_set rdf:type sc:Dataset .',
    ':training_set rdfs:label "Training Set" .',
    ':training_set prov:wasGeneratedBy :define_data_split .',
    f':training_set prov:wasDerivedFrom {input_dataset} .',
    ':training_set rdfs:comment "Contains xx samples" .',  # TODO: fill in the real sample count

    # Validation Set
    ':validation_set rdf:type sc:Dataset .',
    ':validation_set rdfs:label "Validation Set" .',
    ':validation_set prov:wasGeneratedBy :define_data_split .',
    f':validation_set prov:wasDerivedFrom {input_dataset} .',
    ':validation_set rdfs:comment "Contains xx samples" .',  # TODO: fill in the real sample count

    # Test Set
    ':test_set rdf:type sc:Dataset .',
    ':test_set rdfs:label "Test Set" .',
    ':test_set prov:wasGeneratedBy :define_data_split .',
    f':test_set prov:wasDerivedFrom {input_dataset} .',
    ':test_set rdfs:comment "Contains xx samples" .',  # TODO: fill in the real sample count
]
engine.insert(define_split_activity, prefixes=prefixes)

In [None]:
def train_and_finetune_model(training_set, validation_set):
    """Placeholder for model training and hyperparameter tuning.

    Not implemented yet: both arguments are ignored and a fixed placeholder
    string is returned.

    Notes for the implementation:
    - Try to automate as much documentation work as possible.
    - Define your training runs with their respective hyperparameter settings, etc.
    - Document each time a training run, model, its hp_settings, evaluations, ...
    - Create performance figures/graphs
    """
    # do something here
    placeholder_result = 'Find most suitable model'
    return placeholder_result


start_time_tafm = now()
# train_and_finetune_model()
end_time_tafm = now() 


#############################################
# Documentation 4d & e & f
#############################################

tafm_ass_uuid_writer = "4b46db41-df33-4105-8596-bf26b3be56c7"
tafm_comment = """
...
"""

# EXAMPLE output from your training
training_run1 = "run_1" 
model_run1 = "model_run1"
hp1_setting_run1 = "hp_setting_run1"
eval_train_run1 = "metric_train_run1"
eval_validation_run1 = "metric_validation_run1"


train_model_activity = [
    # Activity 
    ':train_and_finetune_model rdf:type prov:Activity .',
    ':train_and_finetune_model sc:isPartOf :modeling_phase .',
    f':train_and_finetune_model rdfs:comment """{tafm_comment}""" .',
    f':train_and_finetune_model prov:startedAtTime "{start_time_tafm}"^^xsd:dateTime .',
    f':train_and_finetune_model prov:endedAtTime "{end_time_tafm}"^^xsd:dateTime .',
    f':train_and_finetune_model prov:qualifiedAssociation :{tafm_ass_uuid_writer} .',
    f':{tafm_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
    f':{tafm_ass_uuid_writer} rdf:type prov:Association .',
    f':{tafm_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    
    ########################################
    # ONE model run - automate everything below!

    # Parameter settings
    f':{hp1_setting_run1} rdf:type mls:HyperParameterSetting .',
    f':{hp1_setting_run1} mls:specifiedBy :hp_learning_rate .',
    f':{hp1_setting_run1} mls:hasValue "1.23"^^xsd:double .',
    f':{hp1_setting_run1} prov:wasGeneratedBy :train_and_finetune_model .',
    # add your further parameters

    # Describe your Run
    f':{training_run1} rdf:type mls:Run .',
    f':{training_run1} sc:isPartOf :train_and_finetune_model .',
    f':{training_run1} mls:realizes :random_forest_algorithm .',
    # Labels live in the RDFS vocabulary; the RDF namespace defines no `label` term.
    f':{training_run1} rdfs:label "Training Run 1 with..." .',
    # Point at the implementation node declared in step 4a (identifier spelled
    # `random_forrest...` there) instead of an undeclared placeholder node.
    f':{training_run1} mls:executes :random_forrest_classifier_implementation .',
    f':{training_run1} mls:hasInput :training_set .',
    f':{training_run1} mls:hasInput :validation_set .',
    f':{training_run1} mls:hasInput :{hp1_setting_run1} .',     
    # list all your used parameters here
    f':{training_run1} mls:hasOutput :{model_run1} .',
    f':{training_run1} mls:hasOutput :{eval_train_run1} .',
    f':{training_run1} mls:hasOutput :{eval_validation_run1} .',

    # Describe your Model
    f':{model_run1} rdf:type mls:Model .',
    # PROV-O has no `prov:label` property; human-readable names use rdfs:label.
    f':{model_run1} rdfs:label "xxx" .',
    f':{model_run1} prov:wasGeneratedBy :{training_run1} .',
    f':{model_run1} mlso:trainedOn :training_set .',
    f':{model_run1} mlso:hasAlgorithmType :random_forest_algorithm .',

    # Describe your evaluations
    # You can have multiple evaluations per model 
    f':{eval_train_run1} rdf:type mls:ModelEvaluation .',
    f':{eval_train_run1} prov:wasGeneratedBy :{training_run1} .',
    f':{eval_train_run1} mls:hasValue "1.23"^^xsd:double .',
    f':{eval_train_run1} mls:specifiedBy :r2_score_measure .',
    f':{eval_train_run1} prov:used :training_set .',

    f':{eval_validation_run1} rdf:type mls:ModelEvaluation .',
    f':{eval_validation_run1} prov:wasGeneratedBy :{training_run1} .',
    f':{eval_validation_run1} mls:hasValue "1.23"^^xsd:double .',
    f':{eval_validation_run1} mls:specifiedBy :r2_score_measure .',
    f':{eval_validation_run1} prov:used :validation_set .',

    # Don't forget to document any visualizations

]
engine.insert(train_model_activity, prefixes=prefixes)


In [None]:
def retrain_model_full_data(training_set, validation_set):
    """Placeholder for retraining the selected model.

    Not implemented yet: both arguments are ignored and a fixed placeholder
    string is returned.
    """
    # TODO: create the final model here
    placeholder_model = "Final Trained Model"
    return placeholder_model


# Time the retraining step with its own variables: reusing start_time_tafm/end_time_tafm
# here would overwrite the timestamps measured for :train_and_finetune_model.
start_time_retrain = now()
# retrain_model_full_data()
end_time_retrain = now() 


#############################################
# Documentation 4g
#############################################

retrain_ass_uuid_writer = "19f16231-e2f1-4d97-9e63-2be319d0f66f" # Generate once

final_training_activity = ":retrain_final_model"
final_model = ":final_model_entity"

# Document the retraining activity.
# Hint: This activity is still part of the :modeling_phase

retrain_documentation = [
    # Activity, attached to the modeling phase and timed like the other activities
    f'{final_training_activity} rdf:type prov:Activity .',
    f'{final_training_activity} sc:isPartOf :modeling_phase .',
    f'{final_training_activity} rdfs:label "Retrain Final Model" .',
    f'{final_training_activity} prov:startedAtTime "{start_time_retrain}"^^xsd:dateTime .',
    f'{final_training_activity} prov:endedAtTime "{end_time_retrain}"^^xsd:dateTime .',

    # Association with the modeling code writer (same pattern as steps 4a-4d)
    f'{final_training_activity} prov:qualifiedAssociation :{retrain_ass_uuid_writer} .',
    f':{retrain_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
    f':{retrain_ass_uuid_writer} rdf:type prov:Association .',
    f':{retrain_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # Inputs mirror the parameters of retrain_model_full_data()
    f'{final_training_activity} prov:used :training_set .',
    f'{final_training_activity} prov:used :validation_set .',

    # Declare the final model so that the evaluation phase references a described node
    f'{final_model} rdf:type mls:Model .',
    f'{final_model} prov:wasGeneratedBy {final_training_activity} .',

    # TODO: add a comment, the chosen hyperparameter settings and the algorithm of the final model
]
engine.insert(retrain_documentation, prefixes=prefixes)


## Evaluation

In [None]:
## Each Activity that follows is part of the Evaluation Phase

# Phase node that the evaluation activities attach to via sc:isPartOf.
evaluation_phase_executor = [
    f':evaluation_phase {predicate_object} .'
    for predicate_object in (
        'rdf:type prov:Activity',
        'rdfs:label "Evaluation Phase"',
    )
]
engine.insert(evaluation_phase_executor, prefixes=prefixes)

In [None]:
# Agent recorded with the code-writer role in the evaluation association further down in this cell.
eval_code_writer = student_b
def evaluate_on_test_data(final_model, test_set):
    """Placeholder for the evaluation of the final model on the test set.

    Not implemented yet: both arguments are ignored and a fixed placeholder
    string is returned.
    """
    # TODO: predict and evaluate on test data
    placeholder_performance = 'Performance'
    return placeholder_performance

start_time_eval = now()
#evaluate_on_test_data()
end_time_eval = now() 

#############################################
# Documentation
#############################################

eval_ass_uuid = "f4e6b5f2-8529-434a-8d83-be38d9ef9ee1" # Generate once
# Graph identifiers (already prefixed with ':') of the evaluated model and its input data
final_model = ":final_model_entity" 
test_set = ":test_set" 

eval_comment = """
...
"""

evaluate_activity = (
    # The evaluation activity with its timing
    [
        ':evaluate_final_model rdf:type prov:Activity .',
        ':evaluate_final_model sc:isPartOf :evaluation_phase .',
        ':evaluate_final_model rdfs:label "Final Model Evaluation on Test Set" .',
        f':evaluate_final_model rdfs:comment """{eval_comment}""" .',
        f':evaluate_final_model prov:startedAtTime "{start_time_eval}"^^xsd:dateTime .',
        f':evaluate_final_model prov:endedAtTime "{end_time_eval}"^^xsd:dateTime .',
        f':evaluate_final_model prov:qualifiedAssociation :{eval_ass_uuid} .',
    ]
    # Association crediting the evaluation code writer
    + [
        f':{eval_ass_uuid} prov:agent :{eval_code_writer} .',
        f':{eval_ass_uuid} rdf:type prov:Association .',
        f':{eval_ass_uuid} prov:hadRole :{code_writer_role} .',
    ]
    # Inputs
    + [
        f':evaluate_final_model prov:used {final_model} .',
        f':evaluate_final_model prov:used {test_set} .',
        # Reference to Data Mining Success Criteria from Phase 1
        ':evaluate_final_model prov:used :bu_data_mining_success_criteria .',
    ]
    # Document your final model performance

    # Hint: you evaluate bias in this way:
    + [
        f':bias_evaluation_result {predicate_object} .'
        for predicate_object in (
            'rdf:type mls:ModelEvaluation',
            'prov:wasGeneratedBy :evaluate_final_model',
            'rdfs:label "Bias Analysis"',
            'rdfs:comment "..."',
        )
    ]
)
engine.insert(evaluate_activity, prefixes=prefixes)

## Deployment

In [None]:
## Each Activity that follows is part of the Deployment Phase

# Phase node that the deployment activities attach to via sc:isPartOf.
deployment_phase_executor = [
    f':deployment_phase {predicate_object} .'
    for predicate_object in (
        'rdf:type prov:Activity',
        'rdfs:label "Deployment Phase"',
    )
]
engine.insert(deployment_phase_executor, prefixes=prefixes)

In [None]:
#############################################
# Documentation
#############################################

comparison_and_recommendations_comment = """
...
"""

ethical_aspects_comment = """
...
"""

monitoring_plan_comment = """
...
"""

reproducibility_reflection_comment = """
...
"""

dep_ass_uuid_executor = "08a1554a-5c38-4eb1-bb25-499b068c14ab" # Generate once
deployment_executor = (
    # The activity, connected to its parent phase
    [
        ':plan_deployment rdf:type prov:Activity .',
        ':plan_deployment sc:isPartOf :deployment_phase .',
        ':plan_deployment rdfs:label "Plan Deployment"@en .',
    ]
    # Association with whoever executed the cell
    + [
        f':plan_deployment prov:qualifiedAssociation :{dep_ass_uuid_executor} .',
        f':{dep_ass_uuid_executor} prov:agent :{executed_by} .',
        f':{dep_ass_uuid_executor} rdf:type prov:Association .',
        f':{dep_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
    ]
)
engine.insert(deployment_executor, prefixes=prefixes)


# One entity per deployment sub-task (6a-6d); each gets the same four statements:
# type, generating activity, label and the free-text comment defined above.
deployment_data_executor = [
    triple
    for entity, label, comment in (
        ("dep_recommendations",
         "6a Business Objectives Reflection and Deployment Recommendations",
         comparison_and_recommendations_comment),
        ("dep_ethical_risks",
         "6b Ethical Aspects and Risks",
         ethical_aspects_comment),
        ("dep_monitoring_plan",
         "6c Monitoring Plan",
         monitoring_plan_comment),
        ("dep_reproducibility_reflection",
         "6d Reproducibility Reflection",
         reproducibility_reflection_comment),
    )
    for triple in (
        f':{entity} rdf:type prov:Entity .',
        f':{entity} prov:wasGeneratedBy :plan_deployment .',
        f':{entity} rdfs:label "{label}" .',
        f':{entity} rdfs:comment """{comment}""" .',
    )
]
engine.insert(deployment_data_executor, prefixes=prefixes)

# Generate Latex Report

The following cells show an example of how to automatically create a LaTeX report from your provenance documentation.

Feel free to use the example provided. If you use it, you should adapt and extend it with relevant sections/tables/plots/... 

In [None]:
# Group-specific base IRI, built from the group id set at the top of the notebook.
base_iri = "https://starvers.ec.tuwien.ac.at/BI2025/{}/".format(group_id)

In [None]:
# This cell includes cleaning functions

from datetime import datetime

def latex_escape(text: str | None) -> str:
    if text is None: return ""
    text = str(text)
    text = text.replace("\\", r"\textbackslash{}")
    pairs = [
        ("&", r"\&"), ("%", r"\%"), ("$", r"\$"), ("#", r"\#"), 
        ("_", r"\_"), ("{", r"\{"), ("}", r"\}"), 
        ("~", r"\textasciitilde{}"), ("^", r"\textasciicircum{}")
    ]
    for k, v in pairs:
        text = text.replace(k, v)
    return text

def clean_rdf(x) -> str:
    if hasattr(x, "toPython"): return str(x.toPython())
    if x is None: return ""
    s = str(x).strip()
    s = s.strip('"').strip("'")
    s = s.strip()
    if "^^" in s:
        s = s.split("^^")[0].strip('"')
        
    return s

def fmt_iso(ts: str) -> str:
    if not ts: return ""
    try:
        clean_ts = ts.split("^^")[0].strip('"')
        clean_ts = clean_ts.replace("Z", "+00:00") if clean_ts.endswith("Z") else clean_ts
        return datetime.fromisoformat(clean_ts).strftime("%Y-%m-%d %H:%M:%S")
    except:
        return latex_escape(str(ts))

In [None]:
# This cell includes exemplary queries for different phases


### Author Block
# Fetch name and matriculation number of both students from the graph.
author_query = f"""
{prefix_header}
PREFIX iao: <http://purl.obolibrary.org/obo/>

SELECT DISTINCT ?uri ?given ?family ?matr WHERE {{
  VALUES ?uri {{ :{student_a} :{student_b} }}
  
  ?uri a foaf:Person .
  ?uri foaf:givenName ?given .
  ?uri foaf:familyName ?family .
  ?uri iao:IAO_0000219 ?matr .
}}
"""

res_authors = engine.query(author_query)

# One \author block per result row, concatenated at the end.
author_blocks = []
if not res_authors.empty: # type:ignore
    for _, author in res_authors.iterrows(): # type:ignore
        author_uri = str(author['uri'])
        given, family, matr = (
            latex_escape(clean_rdf(author[column]))
            for column in ('given', 'family', 'matr')
        )
        # The student id inside the URI decides which role is printed
        responsibility = (
            "Student A" if student_a in author_uri
            else "Student B" if student_b in author_uri
            else "Student"
        )
        author_blocks.append(rf"""
          \author{{{given} {family}}}
          \authornote{{{responsibility}, Matr.Nr.: {matr}}}
          \affiliation{{
            \institution{{TU Wien}}
            \country{{Austria}}
          }}
          """)
author_block_latex = "".join(author_blocks)

### Business Understanding example
# Both comments are optional, so a single (possibly partly empty) row is enough.
bu_query = f"""
{prefix_header}

SELECT ?ds_comment ?bo_comment WHERE {{
  OPTIONAL {{ :bu_data_source_and_scenario rdfs:comment ?ds_comment . }}
  OPTIONAL {{ :bu_business_objectives rdfs:comment ?bo_comment . }}
}} LIMIT 1
"""
res_bu = engine.query(bu_query)
row_bu = {} if res_bu.empty else res_bu.iloc[0] # type:ignore
bu_data_source, bu_objectives = (
    latex_escape(clean_rdf(row_bu.get(column, "")))
    for column in ("ds_comment", "bo_comment")
)


### Data Understanding examples
# Example Dataset Description
du_desc_query = f"""
{prefix_header}
SELECT ?desc WHERE {{ :raw_data sc:description ?desc . }} LIMIT 1
"""
res_du_desc = engine.query(du_desc_query)
row_du_desc = {} if res_du_desc.empty else res_du_desc.iloc[0] # type:ignore
du_description = latex_escape(clean_rdf(row_du_desc.get("desc", "")))

# Example Feature Columns Table
du_query = f"""
{prefix_header}

SELECT ?name (SAMPLE(?dtypeRaw) as ?dtype) (SAMPLE(?descRaw) as ?desc) WHERE {{
  :raw_data cr:recordSet ?rs .
  ?rs cr:field ?field .
  ?field sc:name ?name .
  ?field sc:description ?descRaw .
  ?field cr:dataType ?dtypeRaw .
}} 
GROUP BY ?name
ORDER BY ?name
"""
res_du = engine.query(du_query)
du_rows = []
if not res_du.empty: # type:ignore
    for _, field_row in res_du.iterrows(): # type:ignore
        dtype_raw = clean_rdf(field_row.get("dtype", ""))
        # Keep only the local name of the datatype IRI: text after '#' if present,
        # otherwise text after the last '/'.
        separator = next((char for char in ('#', '/') if char in dtype_raw), None)
        dtype = dtype_raw.split(separator)[-1] if separator else dtype_raw

        desc = clean_rdf(field_row.get("desc", ""))
        cells = (
            latex_escape(clean_rdf(field_row['name'])),
            latex_escape(dtype),
            latex_escape(desc),
        )
        # Table row terminated by the LaTeX line break \\
        du_rows.append(" & ".join(cells) + " \\\\")
du_table_rows = "\n    ".join(du_rows)

### Modeling example
# Hyperparameters
hp_query = f"""
{prefix_header}

SELECT ?hpName (SAMPLE(?hpValRaw) as ?hpVal) (MAX(?hpDescRaw) as ?hpDesc) WHERE {{
  ?run sc:isPartOf :train_and_finetune_model .
  ?run mls:hasInput ?setting .
  ?setting a mls:HyperParameterSetting .
  ?setting mls:hasValue ?hpValRaw .
  ?setting mls:specifiedBy ?hpDef .
  ?hpDef rdfs:label ?hpName .
  OPTIONAL {{ ?hpDef rdfs:comment ?hpDescRaw . }}
}} 
GROUP BY ?hpName
ORDER BY ?hpName
"""
res_hp = engine.query(hp_query)
hp_rows = []
if not res_hp.empty: #type:ignore
    for _, hp_row in res_hp.iterrows(): #type:ignore
        # Column order of the report table: parameter, description, value
        cells = (
            latex_escape(clean_rdf(hp_row['hpName'])),
            latex_escape(clean_rdf(hp_row.get('hpDesc', ''))),
            latex_escape(clean_rdf(hp_row['hpVal'])),
        )
        hp_rows.append(" & ".join(cells) + r" \\")

hp_table_rows = "\n    ".join(hp_rows)

# Run Info
# All parts are OPTIONAL so that a row is returned even when only some of the
# information has been documented; only the first combination is used.
run_query = f"""
{prefix_header}

SELECT ?algoLabel ?start ?end ?metricLabel ?metricVal WHERE {{
  OPTIONAL {{ :train_and_finetune_model prov:startedAtTime ?start ; prov:endedAtTime ?end . }}
  OPTIONAL {{
      ?run sc:isPartOf :train_and_finetune_model .
      ?run mls:realizes ?algo .
      ?algo rdfs:label ?algoLabel .
  }}
  OPTIONAL {{
    ?run sc:isPartOf :train_and_finetune_model .
    ?run mls:hasOutput ?eval .
    ?eval a mls:ModelEvaluation ; mls:hasValue ?metricVal .
    OPTIONAL {{ ?eval mls:specifiedBy ?m . ?m rdfs:label ?metricLabel . }}
  }}
}} LIMIT 1
"""
res_run = engine.query(run_query)
row_run = res_run.iloc[0] if not res_run.empty else {} #type:ignore
mod_algo  = latex_escape(clean_rdf(row_run.get("algoLabel", "")))
# fmt_iso returns either a formatted timestamp (no LaTeX specials) or the already
# escaped raw value, so escaping its result again would corrupt the fallback text.
mod_start = fmt_iso(clean_rdf(row_run.get("start")))
mod_end   = fmt_iso(clean_rdf(row_run.get("end")))
mod_m_lbl = latex_escape(clean_rdf(row_run.get("metricLabel", "")))
raw_val = clean_rdf(row_run.get('metricVal', ''))
try:
    mod_m_val = f"{float(raw_val):.4f}" if raw_val else ""
except ValueError:
    # Non-numeric metric literal: show it verbatim instead of aborting the report
    mod_m_val = latex_escape(raw_val)

print("Data extraction done.")

The following cell contains the LaTeX report itself. It fills in the query results from the previous cell. The ACM template is already set up.
Make sure that you update Student A and Student B accordingly.

In [None]:
# Full LaTeX source of the report (acmart `sigconf` document class).
# This is a raw f-string: doubled braces produce literal LaTeX braces, while single-braced
# names (group_id, author_block_latex, bu_*, du_*, hp_table_rows, mod_*) are substituted
# with the values of the corresponding notebook variables.
latex_content = rf"""\documentclass[sigconf]{{acmart}}

\AtBeginDocument{{ \providecommand\BibTeX{{ Bib\TeX }} }}
\setcopyright{{acmlicensed}}
\copyrightyear{{2025}}
\acmYear{{2025}}
\acmDOI{{XXXXXXX.XXXXXXX}}

\acmConference[BI 2025]{{Business Intelligence}}{{-}}{{-}}

\begin{{document}}

\title{{BI2025 Experiment Report - Group {group_id}}}
%% ---Authors: Dynamically added ---
{author_block_latex}

\begin{{abstract}}
  This report documents the machine learning experiment for Group {group_id}, following the CRISP-DM process model.
\end{{abstract}}

\ccsdesc[500]{{Computing methodologies~Machine learning}}
\keywords{{CRISP-DM, Provenance, Knowledge Graph, Machine Learning}}

\maketitle

%% --- 1. Business Understanding ---
\section{{Business Understanding}}

\subsection{{Data Source and Scenario}}
{bu_data_source}

\subsection{{Business Objectives}}
{bu_objectives}

%% --- 2. Data Understanding ---
\section{{Data Understanding}}
\textbf{{Dataset Description:}} {du_description}

The following features were identified in the dataset:

\begin{{table}}[h]
  \caption{{Raw Data Features}}
  \label{{tab:features}}
  \begin{{tabular}}{{lp{{0.2\linewidth}}p{{0.4\linewidth}}}}
    \toprule
    \textbf{{Feature Name}} & \textbf{{Data Type}} & \textbf{{Description}} \\
    \midrule
    {du_table_rows}
    \bottomrule
  \end{{tabular}}
\end{{table}}

%% --- 3. Data Preparation ---
\section{{Data Preparation}}
\subsection{{Data Cleaning}}
Describe your Data preparation steps here and include respective graph data.


%% --- 4. Modeling ---
\section{{Modeling}}

\subsection{{Hyperparameter Configuration}}
The model was trained using the following hyperparameter settings:

\begin{{table}}[h]
  \caption{{Hyperparameter Settings}}
  \label{{tab:hyperparams}}
  \begin{{tabular}}{{lp{{0.4\linewidth}}l}}
    \toprule
    \textbf{{Parameter}} & \textbf{{Description}} & \textbf{{Value}} \\
    \midrule
    {hp_table_rows}
    \bottomrule
  \end{{tabular}}
\end{{table}}

\subsection{{Training Run}}
A training run was executed with the following characteristics:
\begin{{itemize}}
    \item \textbf{{Algorithm:}} {mod_algo}
    \item \textbf{{Start Time:}} {mod_start}
    \item \textbf{{End Time:}} {mod_end}
    \item \textbf{{Result:}} {mod_m_lbl} = {mod_m_val}
\end{{itemize}}

%% --- 5. Evaluation ---
\section{{Evaluation}}

%% --- 6. Deployment ---
\section{{Deployment}}

\section{{Conclusion}}

\end{{document}}
"""

In [None]:
# This cell stores the LaTeX report in the data/report directory

out_dir = os.path.join("data", "report")
out_path = os.path.join(out_dir, "experiment_report.tex")

# Create the target directory on first use; an existing directory is fine.
os.makedirs(out_dir, exist_ok=True)

with open(out_path, mode="w", encoding="utf-8") as f:
    f.write(latex_content)

print("Report written to: " + out_path)