In [None]:
from pyapacheatlas.auth import ServicePrincipalAuthentication
from pyapacheatlas.core import PurviewClient
from pyapacheatlas.core.util import GuidTracker
from pyapacheatlas.core import AtlasEntity, AtlasProcess
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment
# so the service-principal settings below can be read from os.environ.
load_dotenv()

tenant_id = os.environ.get("TENANT_ID")
client_id = os.environ.get("CLIENT_ID")
client_secret = os.environ.get("CLIENT_SECRET")
account_name = os.environ.get("PURVIEW_ACCOUNT")

# Fail fast: os.environ.get returns None for an unset variable, and that None
# would otherwise be handed straight to the authentication/client objects.
# Only the variable names are reported, never their values.
_required_settings = {
    "TENANT_ID": tenant_id,
    "CLIENT_ID": client_id,
    "CLIENT_SECRET": client_secret,
    "PURVIEW_ACCOUNT": account_name,
}
_missing_settings = [key for key, value in _required_settings.items() if not value]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in a .env file."
    )

# Service-principal credentials used by the client below.
auth = ServicePrincipalAuthentication(
    tenant_id=tenant_id,
    client_id=client_id,
    client_secret=client_secret
)

# Create a client to connect to your service.
client = PurviewClient(
    account_name=account_name,
    authentication=auth
)


In [None]:
# Request every type definition from the connected account; as the last
# expression in the cell, the result is displayed as the cell output.
client.get_all_typedefs()

In [None]:
from pyspark.sql import DataFrame, SparkSession


# Obtain a Spark session named "test" (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

# Tiny sample dataset: one (language, users_count) tuple per row.
columns = ["language", "users_count"]
data = [("Java", 20000), ("Python", 100000), ("Scala", 3000)]

# Column names are supplied as the schema; column types are inferred from data.
df = spark.createDataFrame(data, schema=columns)
df.show()

In [None]:
def register_df(df: DataFrame, name: str, qualified_name: str):
    """Upload a DataFrame's schema as Atlas entities and return the result.

    Three kinds of entity are built and uploaded in a single call through the
    module-level ``client``:

    * one ``tabular_schema`` entity, always named ``"demoDFSchema"``, with
      qualified name ``f"{qualified_name}_tabular_schema"``;
    * one ``column`` entity per ``(column, dtype)`` pair in ``df.dtypes``,
      each pointing at the schema through ``composeSchema``;
    * one ``azure_datalake_gen2_resource_set`` entity carrying ``name`` and
      ``qualified_name``, pointing at the schema through ``tabular_schema``.

    Args:
        df: DataFrame whose ``dtypes`` supply the column names and types.
        name: Name given to the resource-set entity.
        qualified_name: Qualified name of the resource-set entity; also the
            prefix for the schema and column qualified names.

    Returns:
        A ``(resource_set_entity, upload_response)`` tuple, where
        ``upload_response`` is whatever ``client.upload_entities`` returns.
    """
    # Placeholder guids are negative and unique within this upload batch:
    # schema = -100, columns = -101, -102, ..., resource set = next one down.
    base_guid = 100

    schema_entity = AtlasEntity(
        name="demoDFSchema",
        typeName="tabular_schema",
        qualified_name=f"{qualified_name}_tabular_schema",
        guid=-base_guid,
    )

    dtypes = df.dtypes
    column_entities = [
        AtlasEntity(
            name=col_name,
            typeName="column",
            qualified_name=f"{qualified_name}_column_{col_name}",
            guid=-(base_guid + position),
            attributes={
                "type": col_type,
                "description": f"Column {col_name} has type {col_type}",
            },
            relationshipAttributes={
                "composeSchema": schema_entity.to_json(minimum=True),
            },
        )
        for position, (col_name, col_type) in enumerate(dtypes, start=1)
    ]

    resource_set = AtlasEntity(
        name=name,
        typeName="azure_datalake_gen2_resource_set",
        qualified_name=qualified_name,
        guid=-(base_guid + len(dtypes) + 1),
        relationshipAttributes={
            "tabular_schema": schema_entity.to_json(minimum=True),
        },
    )

    upload_response = client.upload_entities(
        [schema_entity, resource_set, *column_entities]
    )
    return (resource_set, upload_response)

In [None]:
# Register the sample DataFrame; register_df returns the resource-set entity
# (`rs`) together with the result of the upload call (`response`).
rs, response = register_df(df=df, name="testDF", qualified_name="pyapache://testDF")

In [None]:
from purviewatlaspoc.common.utils import PurviewPOCClient

# Rebinds the module-level `client` to the project's PurviewPOCClient. Any
# function in this notebook that reads the global `client` will use this
# object from now on.
client = PurviewPOCClient(account_name=account_name, authentication=auth)
# NOTE(review): presumably registers the custom typedefs that later cells rely
# on; the implementation lives in purviewatlaspoc and is not visible here.
client.create_delta_table_typedefs()

In [None]:
from purviewatlaspoc.common.utils import PurviewPOCClient

client = PurviewPOCClient(account_name=account_name, authentication=auth)

# Register the same sample frame under three qualified names so the process
# entity below has two inputs and one output to reference.
client.register_df(df, "input1", "pyapache://input1DF")
client.register_df(df.alias("input2"), "input2", "pyapache://input2DF")
client.register_df(df.alias("output1"), "output1", "pyapache://output1DF")

# Look up references to the registered entities, inputs first, then the output.
input_refs = [
    client.get_minimal_rep(input_qualified_name)
    for input_qualified_name in ("pyapache://input1DF", "pyapache://input2DF")
]
output_refs = [client.get_minimal_rep("pyapache://output1DF")]

# Process entity tying the two inputs to the output.
process = AtlasProcess(
    name="test_spark_job",
    typeName="custom_spark_job_process",
    qualified_name="pyapacheatlas://test_spark_job",
    guid=-20,
    attributes={"job_type": "join"},
    inputs=input_refs,
    outputs=output_refs,
)

client.upload_entities([process])

In [None]:
def get_minimal_rep(qualifiedName: str, typeName: str = "azure_datalake_gen2_resource_set"):
    """Return a minimal reference dict for an already-registered entity.

    The entity is looked up through the module-level ``client`` by qualified
    name and type, and the guid of the first entity returned is used.

    Args:
        qualifiedName: Qualified name of the entity to look up.
        typeName: Type of the entity; defaults to
            ``"azure_datalake_gen2_resource_set"``.

    Returns:
        A dict with the keys ``"guid"``, ``"typeName"`` and ``"qualifiedName"``.

    Raises:
        KeyError: The lookup response has no ``"entities"`` key.
        IndexError: The lookup response has an empty ``"entities"`` list.
    """
    response = client.get_entity(qualifiedName=qualifiedName, typeName=typeName)

    # Chained indexing on a failed lookup only yields "KeyError: 'entities'" or
    # "list index out of range"; raise the same exception types but say which
    # entity could not be found.
    not_found = (
        f"No '{typeName}' entity with qualifiedName '{qualifiedName}' was returned "
        "by the lookup; register it before requesting its minimal representation."
    )
    if "entities" not in response:
        raise KeyError(not_found)
    entities = response["entities"]
    if not entities:
        raise IndexError(not_found)

    return {
        "guid": entities[0]["guid"],
        "typeName": typeName,
        "qualifiedName": qualifiedName
    }

In [None]:
# Try the helper on the "pyapache://input1DF" qualified name; the returned
# dict is displayed as the cell output.
get_minimal_rep(qualifiedName="pyapache://input1DF", typeName="azure_datalake_gen2_resource_set")

In [None]:
# List all type definitions again and display them as the cell output.
# NOTE(review): presumably re-run to confirm custom typedefs now exist — confirm.
client.get_all_typedefs()

In [None]:
# Delete the entity identified by this hard-coded GUID.
# NOTE(review): the GUID looks specific to one account/run; on a fresh run it
# may not exist — consider resolving it from a qualified name instead. Confirm.
client.delete_entity("20f09e4e-ffe4-b226-8b9c-2c591ea752f7")

In [None]:
# Fetch a relationship by GUID and display it as the cell output.
# NOTE(review): this is the same GUID the previous cell passes to
# delete_entity; confirm it is a relationship GUID and that the lookup is
# still expected to succeed after that deletion.
client.get_relationship(guid="20f09e4e-ffe4-b226-8b9c-2c591ea752f7")