In [None]:
%pip install semantic-link==0.11.1 semantic-link-labs==0.11.2

In [None]:
#initialization of variables 

# These names can be edited as needed.
# Steps are skipped automatically if the corresponding item already exists.
# Lakehouse and notebook names cannot be reused right after those items were deleted, which is one reason to choose a new name.
lakehouse_name = "cms_lakehouse"
semantic_model_name = "cms_semantic_model" 
datafactory_pipeline_name = "cms_pipeline"
report_name = "cms_report"

# names under which the three notebooks are imported into this workspace
download_cmsdata_notebook_import_name = "01-DownloadCMSDataCsvFiles"
create_data_table_notebook_import_name = "02-CreateCMSDataTable"
create_starschema_table_notebook_import_name = "03-CreateCMSStarSchemaTables"

# set to False to skip submitting the Data Factory Pipeline run in the last step of this notebook
invoke_datafactory_pipeline_step = True

In [None]:
import requests
import zipfile
import os
import sempy_labs as labs
import sempy.fabric as semfabric
import base64
import json

from urllib.parse import urlparse

#initialization of additional variables which shouldn't be edited unless you know what you are doing

#base directory (relative to the lakehouse root) which is created in the lakehouse as part of the demo setup
base_dir_relative_path = "Files/cmsdemofiles"

#external links
#artifact zip file from GitHub with definition files for the Data Factory Pipeline, Power BI Semantic Model and Report
artifactzip_github_url = "https://github.com/isinghrana/fabric-samples-healthcare/raw/refs/heads/isr-auto1/analytics-bi-directlake-starschema/demoautomation/artifacts.zip"

#GitHub urls for notebook import
download_cmsdata_notebook_github_url = "https://raw.githubusercontent.com/isinghrana/fabric-samples-healthcare/refs/heads/isr-auto1/analytics-bi-directlake-starschema/01-DownloadCMSDataCsvFiles.ipynb"
create_data_table_notebook_github_url = "https://raw.githubusercontent.com/isinghrana/fabric-samples-healthcare/refs/heads/isr-auto1/analytics-bi-directlake-starschema/02-CreateCMSDataTable.ipynb"
create_starschema_table_notebook_github_url = "https://raw.githubusercontent.com/isinghrana/fabric-samples-healthcare/refs/heads/isr-auto1/analytics-bi-directlake-starschema/03-CreateCMSStarSchemaTables.ipynb"

#data factory definition files are extracted from the artifact zip file; these paths are appended to the local path of the base directory
datafactory_pipeline_jsonfile_relativepath = "/cms_pipeline.DataPipeline/pipeline-content.json"
datafactory_platform_file_relativepath = "/cms_pipeline.DataPipeline/.platform"

#these are fixed constant values (notebook activity names) from the Data Factory Pipeline definition file
#DO NOT UPDATE unless the pipeline definition file is updated
datafactory_pipeline_downloadcmsdataset_notebookactivityname =  "DownloadCMSDataset"
datafactory_pipeline_createcmsdataflattable_notebookactivityname = "CreateCMSDataFlatTable"
datafactory_pipeline_createcmsstarschematables_notebookactivityname = "CreateCMSStarSchemaTables"

#semantic model and report definition folders are extracted from the artifact zip file; these paths are appended to the local path of the base directory
semanticmodel_relative_path = "/CMS_Direct_Lake_Star_Schema.SemanticModel"
report_relative_path = "/CMS Medicare Part D Star Schema.Report"


In [None]:
# look for a lakehouse with the configured display name
lakehouse_exists = any(
    entry['displayName'] == lakehouse_name
    for entry in notebookutils.lakehouse.list()
)

if not lakehouse_exists:
    # no lakehouse with that name yet, so create it and keep the id of the new item
    print(f'Creating lakehouse : {lakehouse_name}')
    lakehouse = notebookutils.lakehouse.create(lakehouse_name)
    lakehouse_id = lakehouse['id']
else:
    # reuse the existing lakehouse and look up its id
    print(f'Lakehouse exists so using the existing lakehouse : {lakehouse_name}')
    lakehouse_id = notebookutils.lakehouse.get(lakehouse_name)['id']

workspace_id = notebookutils.runtime.context["currentWorkspaceId"]

# create the base directory inside the lakehouse
base_dir_full_path = (
    f"abfss://{workspace_id}@onelake.dfs.fabric.microsoft.com/"
    f"{lakehouse_id}/{base_dir_relative_path}"
)
notebookutils.fs.mkdirs(base_dir_full_path)

# mount the base directory so that it can be reached through a local file path
mount_point = f"/mnt/lakehouse/{lakehouse_name}/{base_dir_relative_path}"
print(f'base_dir full: {base_dir_full_path}, mount_point: {mount_point}')

notebookutils.fs.mount(base_dir_full_path, mount_point)
base_dir_local_path = notebookutils.fs.getMountPath(mount_point)

In [None]:
#common utility functions

def get_file_contents(local_file_path):
    """Return the whole text of the file at ``local_file_path``, decoded as UTF-8."""
    with open(local_file_path, encoding="utf-8") as handle:
        return handle.read()

#function is used in the steps which import the semantic model and the report
def get_fabricitemdef_partdict(definitionfiles_local_path) -> dict[str,str]:
    """Read every file below a Fabric item definition folder into a dictionary.

    The folder and all of its subfolders are walked.

    Args:
        definitionfiles_local_path: Local path of the folder with the definition files.

    Returns:
        Dictionary where each key is the part path (path of the file relative to
        the definition folder, with "/" as separator) and each value is the
        UTF-8 decoded content of that file.
    """
    def_dict = {}

    for root, _dirs, files in os.walk(definitionfiles_local_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            # relpath strips only the leading folder prefix; str.replace would also
            # remove later occurrences of the folder path inside nested directory names
            part_key = os.path.relpath(file_path, definitionfiles_local_path).replace(os.sep, "/")

            # separate name for the handle so the loop variable is not shadowed
            with open(file_path, "r", encoding="utf-8") as part_file:
                def_dict[part_key] = part_file.read()

    return def_dict

def fabriclient_post(url, request_body):
    """POST ``request_body`` as JSON to ``url`` through ``semfabric.FabricRestClient``.

    The request is sent with ``lro_wait=True`` (presumably so that long running
    operations are awaited - confirm against the sempy documentation). The status
    code and the response text are printed, then an error is raised for bad
    status codes.
    """
    post_response = semfabric.FabricRestClient().request(
        method="POST",
        path_or_url=url,
        lro_wait=True,
        json=request_body,
    )

    print(post_response.status_code)
    print(post_response.text)
    post_response.raise_for_status()  # Raise an error for bad status codes

# used for the data factory pipeline, semantic model and report:
# each of these artifacts has a .platform file with the name of the artifact
def update_displayname_platformfile(json_str, display_name) -> str:
    """Return the .platform JSON with ``metadata.displayName`` set to ``display_name``.

    ``json_str`` is parsed, the attribute is overwritten and the document is
    serialized again with an indent of 4.
    """
    platform_doc = json.loads(json_str)
    platform_doc['metadata']['displayName'] = display_name
    return json.dumps(platform_doc, indent=4)


def item_exists(item_name, item_type) -> bool:
    """Tell whether ``semfabric.list_items(item_type)`` lists an item named ``item_name``.

    The name is compared against the 'Display Name' column; the outcome is
    printed and returned.
    """
    known_names = semfabric.list_items(item_type)['Display Name'].values

    if item_name not in known_names:
        print(f'{item_name} of type {item_type} does not exist')
        return False

    print(f'{item_name} of type {item_type} exists')
    return True
    

In [None]:
#download the artifacts zip file (Data Factory Pipeline, Semantic Model and Report definition files) from GitHub; these files are used to create the corresponding Fabric items

#function used to download artifact zip file
def download_binary_file(url, output_path, timeout=60):
    """Stream the file at ``url`` into ``output_path``.

    Args:
        url: Address of the file to download.
        output_path: Local path the downloaded bytes are written to.
        timeout: Timeout in seconds handed to ``requests.get`` so that an
            unresponsive server cannot block the notebook forever.

    Raises:
        RuntimeError: When the request fails. A file this call started to
            write is removed first.
    """
    write_started = False
    try:
        # the context manager releases the streamed connection even if writing fails
        with requests.get(url=url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad status codes
            write_started = True
            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"File downloaded successfully to: {output_path}")
    except requests.exceptions.RequestException as e:
        # a truncated file must not survive: callers which skip the download when
        # the file already exists would otherwise never fetch a complete copy
        if write_started and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError as cleanup_error:
                print(f"Could not remove partial file {output_path}: {cleanup_error}")
        print(f"Download failed: {e}")
        raise RuntimeError(f"Failed to download file from {url}") from e


def unzip_file(zip_path, extract_to):
    """Extract every member of the zip archive ``zip_path`` into the folder ``extract_to``."""
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=extract_to)

    print(f"Extracted all contents to '{extract_to}'")

# the file name of the artifact zip is the last segment of the url path
artifact_filename = urlparse(artifactzip_github_url).path.rsplit('/', 1)[-1]

if not notebookutils.fs.exists(f"{base_dir_full_path}/{artifact_filename}"):
    # zip file is not in the lakehouse yet: download it through the mounted path, then unzip it next to itself
    download_path = f"{base_dir_local_path}/{artifact_filename}"
    print(f'downloading artifacts zip from - {artifactzip_github_url} to location {download_path}')
    download_binary_file(artifactzip_github_url, download_path)
    print('artifact file downloaded successfully and now unzipping the artifact file')
    unzip_file(download_path, base_dir_local_path)
else:
    print (f"{base_dir_full_path}/{artifact_filename} already exists so skipping artifact zip file download step")


In [None]:
#import notebooks

#if a notebook was recently deleted, its name is not reusable right away, so either wait for 5 to 10 minutes or set a different import name for the notebook in the settings above
def import_notebook(notebook_import_name, githuburl, workspace_id, lakehouse_name) -> str:
    """Import a notebook from a URL unless it already exists, and return its item id.

    Args:
        notebook_import_name: Name of the notebook item in the workspace.
        githuburl: URL the notebook is imported from when it does not exist yet.
        workspace_id: Workspace used for the notebook definition update and as
            the workspace of the default lakehouse.
        lakehouse_name: Lakehouse set as default lakehouse of a newly imported notebook.

    Returns:
        The notebook id resolved by ``semfabric.resolve_item_id``.
    """
    if item_exists(notebook_import_name, "Notebook"):
        print(f'{notebook_import_name} already exists so skipping import')
    else:
        print(f'{notebook_import_name} does not exist so importing from {githuburl}')
        # the return value of the import call is not needed: the id is resolved by name below
        labs.import_notebook_from_web(notebook_name = notebook_import_name, url = githuburl)

        #update the default lakehouse
        notebookutils.notebook.updateDefinition(name = notebook_import_name, workspaceId = workspace_id, defaultLakehouse = lakehouse_name, defaultLakehouseWorkspace= workspace_id)

    notebook_id = semfabric.resolve_item_id(item_name = notebook_import_name, type = "Notebook")
    print(f"notebookname: {notebook_import_name}, notebook_id: {notebook_id}")
    return notebook_id


#import the 3 notebooks and keep their notebook ids, which are used in subsequent steps
download_cmsdata_notebook_id = import_notebook(
    download_cmsdata_notebook_import_name,
    download_cmsdata_notebook_github_url,
    workspace_id,
    lakehouse_name,
)
create_data_table_notebook_id = import_notebook(
    create_data_table_notebook_import_name,
    create_data_table_notebook_github_url,
    workspace_id,
    lakehouse_name,
)
create_starschema_table_notebook_id = import_notebook(
    create_starschema_table_notebook_import_name,
    create_starschema_table_notebook_github_url,
    workspace_id,
    lakehouse_name,
)

In [None]:
#import data factory pipeline

def update_pipeline_json(json_str, workspace_id, pipelineactivity_notebook_mapping) -> str:
    """Point the notebook activities of a pipeline definition at notebooks of this workspace.

    Args:
        json_str: Pipeline definition as JSON text.
        workspace_id: Value written to ``typeProperties.workspaceId`` of each matched activity.
        pipelineactivity_notebook_mapping: Dictionary of activity name to notebook id;
            the id is written to ``typeProperties.notebookId`` of every activity with that name.

    Returns:
        The updated definition serialized with an indent of 4.
    """
    pipeline_def = json.loads(json_str)

    for notebook_activity_name, notebook_id in pipelineactivity_notebook_mapping.items():
        for activity in pipeline_def['properties']["activities"]:
            if activity.get("name") != notebook_activity_name:
                continue

            print(f'Replacing {notebook_activity_name} with {notebook_id}')

            type_properties = activity["typeProperties"]
            type_properties["workspaceId"] = workspace_id
            type_properties["notebookId"] = notebook_id

    return json.dumps(pipeline_def, indent=4)


if item_exists(datafactory_pipeline_name, "DataPipeline"):
    print(f'{datafactory_pipeline_name} exists so skipping the step')
else:
    print(f'{datafactory_pipeline_name} does not exist so creating the pipeline')

    datafactory_pipeline_jsonfile_local_path = base_dir_local_path + datafactory_pipeline_jsonfile_relativepath
    datafactory_platform_file_local_path = base_dir_local_path + datafactory_platform_file_relativepath

    #read file contents
    platform_file_payload =  get_file_contents(datafactory_platform_file_local_path)
    pipeline_json_payload =  get_file_contents(datafactory_pipeline_jsonfile_local_path)

    #pipeline definition has JSON nodes for Notebook Activities, need to update the JSON with the appropriate NotebookId from this workspace
    notebookmapping_dict = {
        datafactory_pipeline_downloadcmsdataset_notebookactivityname : download_cmsdata_notebook_id,
        datafactory_pipeline_createcmsdataflattable_notebookactivityname : create_data_table_notebook_id,
        datafactory_pipeline_createcmsstarschematables_notebookactivityname: create_starschema_table_notebook_id
    }

    #workspace id and notebook ids need to be updated/replaced from the original pipeline definition json
    pipeline_json_payload = update_pipeline_json(pipeline_json_payload, workspace_id, notebookmapping_dict)

    platform_file_payload = update_displayname_platformfile(platform_file_payload, datafactory_pipeline_name)

    #create post request body
    #b64encode returns bytes, which the standard json encoder cannot serialize, so each payload is decoded to str
    create_datafactory_pipeline_request_body = {
        "displayName": datafactory_pipeline_name,
        "description": "cms_pipeline to ingest and process data",
        "definition" : {
            "parts": [
                {
                    "path": "pipeline-content.json",
                    "payload": base64.b64encode(pipeline_json_payload.encode('utf-8')).decode('ascii'),
                    "payloadType": "InlineBase64"
                },
                {
                    "path": ".platform",
                    "payload": base64.b64encode(platform_file_payload.encode('utf-8')).decode('ascii'),
                    "payloadType": "InlineBase64"
                }
            ]
        }
    }

    create_pipeline_uri = f"v1/workspaces/{workspace_id}/dataPipelines"

    #same helper as the semantic model and report steps: posts the body, prints status code and response text, raises for bad status codes
    fabriclient_post(create_pipeline_uri, create_datafactory_pipeline_request_body)

In [None]:
#import semantic model

if item_exists(semantic_model_name, "SemanticModel"):
    print(f'Semantic Model {semantic_model_name} already exists so skipping the step')
else:

    create_semantic_model_uri = f"v1/workspaces/{workspace_id}/semanticModels"

    #start with a body whose parts list gets populated from the model definition files
    create_semantic_model_request_body = {
        "displayName": semantic_model_name,
        "description": "cms semantic model created using API",
        "definition" : {
            "parts": []
            }
        }

    #read the semantic model definition folder into a dictionary used to populate the request body for the API POST call
    semanticmodel_local_path = base_dir_local_path + semanticmodel_relative_path
    print(f'semantic model definition files path: {semanticmodel_local_path}')

    semantic_model_part_dict = get_fabricitemdef_partdict(semanticmodel_local_path)

    #populate the request body using the dictionary
    for key, value in semantic_model_part_dict.items():

        if ".platform" in key:
            value = update_displayname_platformfile(value, semantic_model_name)

        #b64encode returns bytes, which the standard json encoder cannot serialize, so the payload is decoded to str
        #payloadType uses the same casing as the pipeline step of this notebook
        new_part = {
            "path": key,
            "payload" : base64.b64encode(value.encode('utf-8')).decode('ascii'),
            "payloadType": "InlineBase64"
        }

        create_semantic_model_request_body["definition"]["parts"].append(new_part)

    fabriclient_post(create_semantic_model_uri, create_semantic_model_request_body)

    print('Semantic Model created successfully and updating the semantic model to point to lakehouse in this workspace')

    #update the semantic model to point to lakehouse in this workspace
    labs.directlake.update_direct_lake_model_lakehouse_connection(
        dataset = semantic_model_name,
        lakehouse =  lakehouse_name
    )

In [None]:
#import report

def update_reportdef_semanticmodelid(report_def_str, id) -> str:
    """Return the report definition JSON bound to the semantic model ``id``.

    ``report_def_str`` is parsed, ``datasetReference.byConnection.pbiModelDatabaseName``
    is overwritten with ``id`` and the document is serialized again with an indent of 4.
    """
    report_doc = json.loads(report_def_str)

    connection_node = report_doc["datasetReference"]["byConnection"]
    connection_node["pbiModelDatabaseName"] = id

    return json.dumps(report_doc, indent=4)

if item_exists(report_name, "Report"):
    print(f'Report {report_name} alerady exists so skipping the step')
else:

    #need to get the semantic model id because the report definition.pbir file needs to be updated with the semantic model created as part of the setup
    #in this workspace
    semantic_model_id = semfabric.resolve_item_id(semantic_model_name, type = "SemanticModel")
    create_report_uri = f"v1/workspaces/{workspace_id}/reports"

    #start with a body whose parts list gets populated from the report definition files
    create_report_request_body = {
        "displayName": report_name,
        "description": "report created using API",
        "definition" : {
            "parts": []
            }
        }

    #read the report definition folder into a dictionary used to populate the request body for the API POST call
    report_local_path = base_dir_local_path + report_relative_path
    print(f'report definition files path: {report_local_path}')

    report_part_dict = get_fabricitemdef_partdict(report_local_path)

    #populate the request body using the dictionary
    for key, value in report_part_dict.items():

        if ("definition.pbir" in key):
            value = update_reportdef_semanticmodelid(value, semantic_model_id)
        elif (".platform" in key):
            value = update_displayname_platformfile(value, report_name)

        #b64encode returns bytes, which the standard json encoder cannot serialize, so the payload is decoded to str
        #payloadType uses the same casing as the pipeline step of this notebook
        new_part = {
            "path": key,
            "payload" : base64.b64encode(value.encode('utf-8')).decode('ascii'),
            "payloadType": "InlineBase64"
        }

        create_report_request_body["definition"]["parts"].append(new_part)

    fabriclient_post(create_report_uri, create_report_request_body)
    print('report created successfully')
   

In [None]:
#invoke Data Factory Pipeline to load data to Lakehouse

if invoke_datafactory_pipeline_step:
    # look up the pipeline by name and submit a job instance for it
    datafactory_pipeline_id = semfabric.resolve_item_id(datafactory_pipeline_name, type = "DataPipeline")
    print(datafactory_pipeline_id)

    url = f"v1/workspaces/{workspace_id}/items/{datafactory_pipeline_id}/jobs/instances?jobType=Pipeline"

    client = semfabric.FabricRestClient()
    response = client.request(
        method="POST",
        path_or_url=url,
    )
    print(response.status_code)
    print(response.text)
    response.raise_for_status()  # Raise an error for bad status codes

    print("Data Factory Pipeline Job submitted successfully - monitor Pipeline Run from Monitoring Hub or open the pipeline then use Run > View Run History menu to actively monitor the pipeline. Once pipeline job complete data is available in Lakehouse for querying and reporting")
else:
    # the step is switched off through the configuration flag
    print('Skipping invocation of Data Factory Pipeline setp')