In [1]:
# General Imports

import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import os,sys,datetime,time,math, warnings,itertools

import pandas as pd
from contextlib import contextmanager

from rdflib import Namespace, URIRef
from owlready2 import *
from urllib.parse import quote



In [2]:
@contextmanager
def cwd(path):
    """
    Run a ``with`` block inside another working directory.

    The process-wide current working directory is switched to ``path`` on entry
    and switched back to whatever it was before on exit, whether the block
    finished normally or raised.

    Args:
        path (str): Directory to make current for the duration of the block.

    Usage:
        with cwd('/home/user/documents'):
            file_list = os.listdir()  # lists '/home/user/documents'
        # here the previous working directory is active again
    """
    previous_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Runs on normal exit and on exceptions alike.
        os.chdir(previous_dir)

In [3]:
def create_dqo_uri(name: str) -> URIRef:
    """
    Build a URI for `name` by indexing the module-level `DQO` object.

    Spaces in `name` are replaced with underscores, the result is percent-quoted
    with `urllib.parse.quote`, a leading '#' is prepended, and that fragment is
    used as the key into `DQO`.

    NOTE(review): `DQO` is not defined in the notebook cells shown here; it is
    presumably an rdflib `Namespace` created elsewhere - confirm before use.

    Parameters:
    name (str): The human-readable name to turn into a URI fragment.

    Returns:
    URIRef: Whatever `DQO['#' + quoted_name]` yields (annotated as a URIRef).

    Example:
    >>> create_dqo_uri("John Doe")   # key looked up in DQO is '#John_Doe'
    """
    underscored = name.replace(" ", "_")
    fragment = '#' + quote(underscored)
    return DQO[fragment]

In [4]:
def list_data_properties_per_individual(onto, indname, verbose=True, retval=False):
    """
    Collect the non-empty data property values of one individual.

    Every property yielded by `onto.data_properties()` is read from the
    individual via attribute access; properties whose value is falsy (empty
    list, None, ...) are skipped.

    Parameters:
    - onto: owlready2 ontology instance
        Ontology that is indexed with `indname` and asked for its data properties.
    - indname: str
        Name of the individual, looked up as `onto[indname]`.
    - verbose: bool, optional (default=True)
        If True, print one `name: values` line per non-empty property.
    - retval: bool, optional (default=False)
        If True, return the collected mapping.

    Returns:
    - dict or None
        Mapping of property name to its values when `retval` is True,
        otherwise None.
    """
    target = onto[indname]
    collected = {}
    for prop in onto.data_properties():
        prop_name = prop.name
        found = getattr(target, prop_name)
        if not found:
            # Nothing asserted for this property on the individual.
            continue
        collected[prop_name] = found
        if verbose:
            print('{0}: {1}'.format(prop_name, found))
    return collected if retval else None


In [5]:
def list_properties_per_individual(onto, indname, verbose=True, retval=False):
    """
    Collect the non-empty property values of one individual.

    Every property yielded by `onto.properties()` is read from the individual
    via attribute access; properties whose value is falsy (empty list,
    None, ...) are skipped.

    Parameters:
    - onto: owlready2 ontology instance
        Ontology that is indexed with `indname` and asked for its properties.
    - indname: str
        Name of the individual, looked up as `onto[indname]`.
    - verbose: bool, optional (default=True)
        If True, print one `name: values` line per non-empty property.
    - retval: bool, optional (default=False)
        If True, return the collected mapping.

    Returns:
    - dict or None
        Mapping of property name to its values when `retval` is True,
        otherwise None.
    """
    target = onto[indname]
    collected = {}
    # `prop` rather than `property`, so the builtin is not shadowed.
    for prop in onto.properties():
        prop_name = prop.name
        found = getattr(target, prop_name)
        if not found:
            # Nothing asserted for this property on the individual.
            continue
        collected[prop_name] = found
        if verbose:
            print('{0}: {1}'.format(prop_name, found))
    return collected if retval else None


In [6]:
# Path settings
# These locations are relative and use forward slashes, so they are assigned
# unconditionally. They used to be set only when sys.platform started with
# 'win', which left all three names undefined on any other platform and made
# the first cell that used them fail with a NameError.
# If another machine keeps the data elsewhere, override the values below.
funnelflags_directory = r'../../TF_Data/Dropbox/PhD Prep/DQ Framework - Clustering/Data Products/'
ontology_directory = r'../../TF_Data/Dropbox/PhD Prep/DQ Framework - Clustering/DQ Ontology/'
shsdq_directory = r'../../TF_Data/Dropbox/PhD Prep/SHS DQ/'

In [7]:
# Load the two Excel workbooks used to build the data product columns.

# Sheet 'DQ Flags per Data Product' of the funnel management workbook.
with cwd(funnelflags_directory):
    df = pd.read_excel(
        'Funnel Management Data Product.xlsx',
        sheet_name='DQ Flags per Data Product',
    )

# DQ flag catalogue; its 'Name' column is renamed to 'COLUMN_NAME'.
with cwd(shsdq_directory):
    dq = pd.read_excel('DQ Flags in CDC.xlsx')
dq = dq.rename(columns={'Name': 'COLUMN_NAME'})

In [8]:
# Register the ontology folder as a local search location BEFORE loading:
# entries added to onto_path after .load() has returned cannot influence that
# load. The membership check keeps re-running this cell from adding the same
# directory again.
if ontology_directory not in onto_path:
    onto_path.append(ontology_directory)

# Load the ontology file from the ontology folder.
onto = get_ontology(ontology_directory+'DQ Ontology NTRIPLES.owl').load()

In [87]:
# Name of the data product individual handled by this notebook.
data_product = 'Funnel_Management'

# Distinct values of the two spreadsheet columns, in order of first appearance.
data_elements = pd.unique(df['Asset Name']).tolist()
target_tables = pd.unique(df['TABLE_PATH']).tolist()

In [90]:
# Create the data product individual if the ontology does not have it yet.
# The name comes from `data_product` (the same value the guard checks) instead
# of a second hard-coded copy, and the freshly created individual is used
# directly rather than being looked up again by attribute name.
if onto[data_product] is None:
    product = onto.Data_Product(data_product)
    # Attach Status Active Offering
    product.Data_Product_has_Status.append(onto.Active_Offering)

In [114]:
# Create one Data_Element individual per asset name and attach each new one to
# the data product.
# The underscore form of the name is computed once and used for the existence
# check, the creation and the link. Checking and linking with the raw name
# (which may contain spaces) never matched the created individual.
product = onto[data_product]
for k in data_elements:
    element_name = k.replace(' ','_')
    if onto[element_name] is None:
        element = onto.Data_Element(element_name)
        product.composed_of.append(element)

In [115]:
# Display the individuals returned by Data_Element.instances().
onto.Data_Element.instances()

[DQ Ontology.Pricing_Element,
 DQ Ontology.Accounts,
 DQ Ontology.OLIs,
 DQ Ontology.Opportunities,
 DQ Ontology.ACTIVE_QUOTES_HEADER,
 DQ Ontology.Projects,
 DQ Ontology.IN_VITRO_Contract_KPIs,
 DQ Ontology.Proposals]

In [118]:
# Print the non-empty data properties of the Funnel_Management individual.
list_data_properties_per_individual(onto, indname='Funnel_Management')

In [116]:
# Print the non-empty properties of the Funnel_Management individual.
list_properties_per_individual(onto, indname='Funnel_Management')

Data_Product_has_Status: [DQ Ontology.Active_Offering]
composed_of: [DQ Ontology.Accounts, DQ Ontology.OLIs, DQ Ontology.Opportunities, DQ Ontology.ACTIVE_QUOTES_HEADER, DQ Ontology.Projects, DQ Ontology.IN_VITRO_Contract_KPIs, DQ Ontology.Proposals]


In [125]:
# Enter new target_tables with properties
# The individual name is the table path with spaces replaced by underscores.
# That normalised name is used for the existence check and the new individual
# is kept in `table`; looking it up again with the raw path returned None
# whenever the path contained a space.
for k in target_tables:
    table_name = k.replace(' ','_')
    if onto[table_name] is None:
        table = onto.Technical_Target_Element(table_name)
        table.part_of_Product.append(onto[data_product])
        table.Target_Element_has_Status.append(onto.Active_Offering)
        # The raw (un-normalised) path is stored as the Target_Path value.
        table.Target_Path.append(k)
        # NOTE(review): the literal 'TABLE' is appended to Target_Path as well;
        # it looks like it belongs on a separate "type" property - confirm.
        table.Target_Path.append('TABLE')

In [138]:
# Enter part-of-element relations
# Link every target table to the data element it belongs to.
# The guard used to be `is None`, which only entered the branch when the table
# individual did NOT exist and then dereferenced None, so no relation was ever
# stored. Names are normalised the same way as when the individuals are created
# (spaces -> underscores) and columns are read by label instead of position.
pairs = df[['Asset Name','TABLE_PATH']].dropna().drop_duplicates()
for idx,x in pairs.iterrows():
    table = onto[x['TABLE_PATH'].replace(' ','_')]
    element = onto[x['Asset Name'].replace(' ','_')]
    if table is None or element is None:
        # One side of the relation is not in the ontology; nothing to link.
        continue
    # Keep the cell idempotent: do not add the same relation twice on re-run.
    if element not in table.part_of_Element:
        table.part_of_Element.append(element)

In [118]:
# Distinct (column, table) pairs from the data product sheet, left-joined with
# the DQ flag catalogue after removing rows for three excluded source tables.
tf = (
    df.filter(['COLUMN_NAME','TABLE_PATH'])
    .drop_duplicates()
    .merge(
        dq[~dq['Table'].isin(['Sharepoint_Projects','ALL_QUOTES_HEADER','CS_Quote_Details'])],
        on='COLUMN_NAME',
        how='left',
    )
)

# Metric name: column name without '(Flag)', trimmed, spaces -> underscores.
tf['Metric'] = [
    str(column_name).replace('(Flag)','').strip().replace(' ','_')
    for column_name in tf['COLUMN_NAME']
]

In [119]:
# One row per distinct combination of the metric-describing columns.
ttf = tf.loc[:, ['COLUMN_NAME','Business Definition','Type','Dimension','Metric']].drop_duplicates()

In [105]:
# Walk the distinct metric rows and add missing Data_Quality_Dimension and
# Metric individuals to the ontology.
for _, row in ttf.iterrows():
    # --- Dimension -----------------------------------------------------------
    dimension = row['Dimension']
    if dimension is not None and str(dimension) != 'nan':
        dimension_name = dimension.replace(' ','_')
        # Only single-valued entries (no comma) are turned into an individual.
        if len(dimension_name.split(',')) == 1:
            known_dimensions = [d.name for d in onto.Data_Quality_Dimension.instances()]
            if dimension_name not in known_dimensions:
                onto.Data_Quality_Dimension(dimension_name)

    # --- Metric --------------------------------------------------------------
    metric = row['Metric']
    if metric is None or str(metric) == 'nan':
        continue
    metric_name = metric.replace(' ','_')
    known_metrics = [m.name for m in onto.Metric.instances()]
    if metric_name in known_metrics:
        continue
    onto.Metric(metric_name)
    metric_individual = onto[metric_name]
    # Optional annotations are only stored when the spreadsheet cell is filled.
    if str(row['Type']) != 'nan':
        metric_individual.Metric_Type.append(row['Type'])
    if str(row['Business Definition']) != 'nan':
        metric_individual.Business_Definition.append(row['Business Definition'])

In [134]:
# Prepare scope limitations per metric
# Work on an explicit copy: assigning a column on a plain column-subset of `tf`
# triggers pandas' SettingWithCopyWarning because the subset may be a view.
af = tf[['Metric','Scope Limitations']].copy()

# A cell may list several scopes separated by ', ' - split them and give each
# scope its own row.
af['Scope Limitations'] = af['Scope Limitations'].str.split(', ')
af = af.explode('Scope Limitations')

# Prefix every scope with the root scope name; rows without a scope fall back
# to the root scope itself.
af['Scope Limitations'] = af['Scope Limitations'].apply(lambda x: 'Siemens_Healthineers.'+str(x).replace(' ','_') if str(x)!='nan' else 'Siemens_Healthineers')

In [141]:
# Add scope limitations for metrics
# `af` can hold the same (metric, scope) pair several times and can reference a
# metric that was never created in the ontology, so the loop
#   * creates a Business_Scope individual only when it is missing,
#   * skips (with a warning) rows whose metric individual does not exist
#     instead of failing on None,
#   * appends a scope to a metric at most once.
for idx,x in af.iterrows():
    scope_name = x['Scope Limitations']
    scope = onto[scope_name]
    if scope is None:
        scope = onto.Business_Scope(scope_name)

    metric = onto[x['Metric']]
    if metric is None:
        warnings.warn('Metric {0!r} not found in ontology; scope limit {1!r} not attached'.format(x['Metric'], scope_name))
        continue

    if scope not in metric.has_scope_limit:
        metric.has_scope_limit.append(scope)
    

In [148]:
# Display the column labels of tf.
tf.columns

Index(['COLUMN_NAME', 'TABLE_PATH', 'Domain', 'Table', 'Schema', 'Type',
       'Criterion', 'Reported In (Main Source)',
       'Original Method of Definition', 'First Calculation', 'Dimension',
       'Check Outdated (Flag)', 'Outside of CDC Scope (Flag)', 'Comment',
       'Business Definition', 'Currently Active (Flag)', 'Scope Limitations',
       'Associated Checkpoint ID', 'Data Steward', 'Data Quality Lead',
       'Data Owner', 'Protege Ingested (Flag)', 'Metric'],
      dtype='object')

In [150]:
# Display each metric next to its source column, criterion and first calculation.
tf.loc[:, ['Metric','COLUMN_NAME','Criterion','First Calculation']]

Unnamed: 0,Metric,COLUMN_NAME,Criterion,First Calculation
0,Potential_Test_Account,Potential Test Account (Flag),"(lower(""Account Name"") LIKE '% test %' AND lo...",BOARDING_Accounts
1,Quote_Item_Reference_not_found,Quote Item Reference not found (Flag),"""Quote Item Id"" field filled in the OLIs table...",BOARDING_OLIs
2,Multiple_Quotes_attached,Multiple Quotes attached (Flag),"OLI_ID shows up in multiple different ""Quote N...",BOARDING_OLIs
3,DQ_Issue_In-Vitro_Opportunity_related,DQ Issue In-Vitro Opportunity related (Flag),"(""Decision Date Wrong (Flag)=='Y')||(""Delivery...",knime://knime-server/0014/CRM%20Data%20Cloud_D...
4,Primary_Winning_Vendor_Unknown,Primary Winning Vendor Unknown (Flag),"SELECT \n\tacc.""Account Country"" AS ""Country"",...",
5,Shipment_Date_Wrong,Shipment Date Wrong (Flag),"NOT ""Expected Shipment Date"" IS NULL AND ""Expe...",
6,DQ_Issue_Funnel_Inconsistencies,DQ Issue Funnel Inconsistencies (Flag),"(""FC Inconsistency (Flag)""=='Y')||(""Sale Type ...",knime://knime-server/0014/CRM%20Data%20Cloud_D...
7,FC_Inconsistency,FC Inconsistency (Flag),"SELECT \n\tacc.""Account Country"" AS ""Country"",...",
8,DQ_Issue_In-Vivo_Opportunity_related,DQ Issue In-Vivo Opportunity related (Flag),"(""Decision Date Wrong (Flag)""=='Y')||\n(""Deliv...",knime://knime-server/0014/CRM%20Data%20Cloud_D...
9,DQ_Issue_In-Vivo_Funnel_relevant,DQ Issue In-Vivo Funnel relevant (Flag),"(""Expired (Flag)=='Y')||(""FC Inconsistency (Fl...",knime://knime-server/0014/CRM%20Data%20Cloud_D...


In [160]:
# Write the modified ontology to the ontology folder in N-Triples format
# (same path the ontology was loaded from, so the file is overwritten).
ontology_file = ontology_directory+'DQ Ontology NTRIPLES.owl'
onto.save(file=ontology_file, format='ntriples')