In [0]:
import pandas as pd 
from pyspark.sql.functions import pandas_udf, col, PandasUDFType
from pyspark.sql.types import IntegerType
from typing import Iterator, Tuple

In [0]:
# make sure that com.azure.cosmos.spark additional library is being installed on cluster as in tutorial
# note that it's not safe to store credentials (endpoint, key) in code, as below. It's recomended to create secrets in Azure Key Vault instead

cosmosEndpoint = "https://my-cosmos-db-account.documents.azure.com:443/"
cosmosMasterKey = "FyumkXMjoTgoB9mEXHGhweChrVgRe9As4gd6TtN17c5e4tlGWy18e9NjegAm3L07ceVE6BJ9KTymACDbJUvXIQ=="
cosmosDatabaseName = "superstore_db"
cosmosContainerName = "superstore-container"

cfg = {
    "spark.cosmos.accountEndpoint": cosmosEndpoint,
    "spark.cosmos.accountKey": cosmosMasterKey,
    "spark.cosmos.database": cosmosDatabaseName,
    "spark.cosmos.container": cosmosContainerName
}

In [0]:
spark.conf.set("spark.sql.catalog.cosmosCatalog", "com.azure.cosmos.spark.CosmosCatalog")
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint", cosmosEndpoint)
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey", cosmosMasterKey)

In [0]:
from pyspark.sql.functions import col
superstore_df = spark.read.format("cosmos.oltp").options(**cfg)\
                          .option("spark.cosmos.read.inferSchema.enabled", "true")\
                          .load()

In [0]:
# create vectorized UDF

def year(date: pd.Series) -> pd.Series:
    return pd.to_datetime(date).dt.year

year_pandas = pandas_udf(year, returnType=IntegerType())

In [0]:
superstore_df.withColumn('Year', year_pandas(col('Order Date')))\
             .select('Order Date', 'Category', 'Year').display()

Order Date,Category,Year
7/16/2017,Furniture,2017
9/17/2015,Office Supplies,2015
7/17/2016,Office Supplies,2016
9/19/2017,Office Supplies,2017
10/20/2014,Office Supplies,2014
4/18/2015,Furniture,2015
6/4/2016,Office Supplies,2016
11/26/2014,Furniture,2014
6/12/2016,Office Supplies,2016
6/12/2016,Furniture,2016


In [0]:
# it is preffered to use anotations to create vectorized (same as usual) UDFs
# annotation "year(date: pd.Series) -> pd.Series:" gives python hints what type of data in/out

@pandas_udf('integer', PandasUDFType.SCALAR)
def year(date: pd.Series) -> pd.Series:
    return pd.to_datetime(date).dt.year

@pandas_udf('integer', PandasUDFType.SCALAR)
def month(date: pd.Series) -> pd.Series:
    return pd.to_datetime(date).dt.month

@pandas_udf('integer', PandasUDFType.SCALAR)
def day(date: pd.Series) -> pd.Series:
    return pd.to_datetime(date).dt.day

In [0]:
superstore_df.withColumn('Year', year(col('Order Date')))\
             .withColumn('Month', month(col('Order Date')))\
             .withColumn('Day', day(col('Order Date')))\
             .select('Category', 'Order Date', 'Year', 'Month', 'Day').display()

Category,Order Date,Year,Month,Day
Furniture,7/16/2017,2017,7,16
Office Supplies,9/17/2015,2015,9,17
Office Supplies,7/17/2016,2016,7,17
Office Supplies,9/19/2017,2017,9,19
Office Supplies,10/20/2014,2014,10,20
Furniture,4/18/2015,2015,4,18
Office Supplies,6/4/2016,2016,6,4
Furniture,11/26/2014,2014,11,26
Office Supplies,6/12/2016,2016,6,12
Furniture,6/12/2016,2016,6,12


In [0]:
@pandas_udf('first string, last string')
def firstname_lastname(name: pd.Series) ->pd.Series:
    return name.str.split(expand=True)

In [0]:
superstore_df.select(firstname_lastname('Customer Name')).display()

firstname_lastname(Customer Name)
"List(Sandra, Flanagan)"
"List(Tracy, Blumstein)"
"List(Ruben, Ausman)"
"List(Erin, Smith)"
"List(Patrick, O'Donnell)"
"List(Darren, Powers)"
"List(Karen, Daniels)"
"List(Joel, Eaton)"
"List(Stewart, Carmichael)"
"List(Steven, Cartwright)"


UDFs with iterator of Series to iterator of Series

In [0]:
def expensive_operation_to_compute_discount():
    return 0.05

In [0]:
@pandas_udf("float")
def compute_discounted_sales_price(iterator: Iterator[pd.Series]) ->Iterator[pd.Series]:
    
    discount = expensive_operation_to_compute_discount()
    
    for sales_price in iterator:
        yield sales_price - discount * sales_price

In [0]:
superstore_df.select('Product Name', 'Sales', 
                    compute_discounted_sales_price('Sales').alias('Discounted Sales')).display()

Product Name,Sales,Discounted Sales
"Global Deluxe Stacking Chair, Gray",71.372,67.8034
"Acco Pressboard Covers with Storage Hooks, 14 7/8"" x 11"", Executive Red",6.858,6.5151
"Eldon Base for stackable storage shelf, platinum",77.88,73.986
"Advantus 10-Drawer Portable Organizer, Chrome Metal Frame, Smoke Drawers",95.616,90.8352
"Gould Plastics 9-Pocket Panel Bin, 18-3/8w x 5-1/4d x 20-1/2h, Black",211.96,201.362
Longer-Life Soft White Bulbs,6.16,5.852
"Snap-A-Way Black Print Carbonless Ruled Speed Letter, Triplicate",75.88,72.086
"Eldon Expressions Desk Accessory, Wood Pencil Holder, Oak",19.3,18.335
Avery Heavy-Duty EZD Binder With Locking Rings,16.74,15.903
Artistic Insta-Plaque,47.04,44.688


UDFs iterator of multiple series to iterator of series

In [0]:
@pandas_udf("string")
def combine_city_state(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    for city, state in iterator:
        yield city + ' (' + state + ')'

In [0]:
superstore_df.select('City', 'State', combine_city_state('City', 'State').alias('City (State)')).display()

City,State,City (State)
Philadelphia,Pennsylvania,Philadelphia (Pennsylvania)
Philadelphia,Pennsylvania,Philadelphia (Pennsylvania)
Los Angeles,California,Los Angeles (California)
Melbourne,Florida,Melbourne (Florida)
Westland,Michigan,Westland (Michigan)
New Albany,Indiana,New Albany (Indiana)
Springfield,Virginia,Springfield (Virginia)
Houston,Texas,Houston (Texas)
Decatur,Alabama,Decatur (Alabama)
Wilmington,Delaware,Wilmington (Delaware)


In [0]:
@pandas_udf("float")
def compute_cost(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    for sales, profit in iterator:
        yield sales - profit

In [0]:
superstore_df.select('Product Name', 'Sales', 'Profit', compute_cost('Sales', 'Profit').alias('Cost')).display()

Product Name,Sales,Profit,Cost
"Global Deluxe Stacking Chair, Gray",71.372,-1.0196,72.3916
"Acco Pressboard Covers with Storage Hooks, 14 7/8"" x 11"", Executive Red",6.858,-5.715,12.573
"Eldon Base for stackable storage shelf, platinum",77.88,3.894,73.986
"Advantus 10-Drawer Portable Organizer, Chrome Metal Frame, Smoke Drawers",95.616,9.5616,86.0544
"Gould Plastics 9-Pocket Panel Bin, 18-3/8w x 5-1/4d x 20-1/2h, Black",211.96,8.4784,203.4816
Longer-Life Soft White Bulbs,6.16,2.9568,3.2032
"Snap-A-Way Black Print Carbonless Ruled Speed Letter, Triplicate",75.88,35.6636,40.2164
"Eldon Expressions Desk Accessory, Wood Pencil Holder, Oak",19.3,-14.475,33.775
Avery Heavy-Duty EZD Binder With Locking Rings,16.74,8.0352,8.7048
Artistic Insta-Plaque,47.04,18.3456,28.6944


In [0]:
# use within the SQL queries as usual UDFs

spark.udf.register('compute_cost', compute_cost)
superstore_df.createOrReplaceTempView('superstore_data')
spark.sql("""select 'Product Name', Sales, Profit, compute_cost(Sales, Profit) as Cost
            from superstore_data""").display()

Product Name,Sales,Profit,Cost
Product Name,71.372,-1.0196,72.3916
Product Name,6.858,-5.715,12.573
Product Name,77.88,3.894,73.986
Product Name,95.616,9.5616,86.0544
Product Name,211.96,8.4784,203.4816
Product Name,6.16,2.9568,3.2032
Product Name,75.88,35.6636,40.2164
Product Name,19.3,-14.475,33.775
Product Name,16.74,8.0352,8.7048
Product Name,47.04,18.3456,28.6944


Series to Scalar UDFs

In [0]:
@pandas_udf('float')
def average(values: pd.Series) -> float:
    return values.mean()

In [0]:
superstore_df.select(average('Sales').alias('Average Sales')).display()

Average Sales
229.858


In [0]:
@pandas_udf('float')
def median(values: pd.Series) -> float:
    return values.median()

In [0]:
superstore_df.select(median('Sales').alias('Median Sales')).display()

Median Sales
54.49


In [0]:
#use in SQL

spark.udf.register('average', average)
spark.sql('select average(profit) from superstore_data').display()

average(profit)
28.656897


In [0]:
spark.sql('select category, average(profit) from superstore_data group by category').display()

category,average(profit)
Furniture,8.699327
Office Supplies,20.32705
Technology,78.752
