In [1]:
# Example notebook code from:
# https://docs.microsoft.com/en-us/azure/databricks/_static/notebooks/getting-started/popvspricelr.html
# https://docs.microsoft.com/en-us/azure/databricks/getting-started/spark/machine-learning

import azureml.core
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core import Workspace, Dataset

# Report the Azure ML SDK version available in this environment.
print("SDK version:", azureml.core.VERSION)

# Secret scope from which every credential below is read.
keyVaultScope = "databricks-aml-demo"

# Azure ML workspace coordinates: the names are fixed here, the subscription
# id comes from the secret scope.
workspace_name = "amls-databricks"
resource_group = "jp-databricks"
subscription_id = dbutils.secrets.get(scope=keyVaultScope, key="azure-subscription-id")

# Service-principal credentials, read from the same secret scope.
tenant_id = dbutils.secrets.get(scope=keyVaultScope, key="azure-tenant-id")
service_principal_id = dbutils.secrets.get(
    scope=keyVaultScope, key="databricks-aml-demo-sp-client-id"
)
service_principal_password = dbutils.secrets.get(
    scope=keyVaultScope, key="databricks-aml-demo-sp-client-key"
)

# Credential object handed to the workspace below.
svc_pr = ServicePrincipalAuthentication(
    tenant_id=tenant_id,
    service_principal_id=service_principal_id,
    service_principal_password=service_principal_password,
)

# Workspace handle; `ws` is used by the following cells.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
    auth=svc_pr,
)

print("Found workspace {} at location {}".format(ws.name, ws.location))

In [2]:
# Default datastore of the workspace; its container name is used when the
# prepared data is written out.
datastore = ws.get_default_datastore()

# Look up the dataset registered under the name 'data_geo' and expose it as a
# Spark DataFrame.
data_geo = Dataset.get_by_name(ws, name="data_geo")
data = data_geo.to_spark_dataframe()

# Mark the DataFrame for caching.
data.cache()

# To preview the rows, run: display(data)

In [3]:
# Data cleaning
# Drop rows with missing values, then rename every column so that spaces in
# the headings become underscores.
from pyspark.sql.functions import col

# Each source heading is back-tick quoted before it is handed to col(), so it
# is resolved as one literal column name. Unquoted, a "." inside a heading is
# parsed as nested-field access and the select cannot resolve the column.
# A back-tick embedded in a heading is escaped by doubling it.
exprs = [
    col("`{}`".format(column.replace("`", "``"))).alias(column.replace(" ", "_"))
    for column in data.columns
]

# dropna() runs first, then the renaming projection is applied.
data = data.dropna().select(*exprs)

In [4]:
# Persist the prepared DataFrame as Parquet under a /mnt/<container> path
# built from the default datastore's container name, replacing any output
# already present there.
(
    data.write
    .mode("overwrite")
    .parquet(f"/mnt/{datastore.container_name}/population-vs-price/data-geo-prepped/")
)