In [None]:
# Spark Session
from pyspark.sql import SparkSession

# Build the session against the master at spark://spark-master:7077;
# getOrCreate() hands back an already-active session when there is one.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("User Defined Functions")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary as cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/11 16:44:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Load the employee records CSV from HDFS. The first line is treated as the
# header and column types are inferred from the data rather than declared.
emp = (
    spark.read.options(header=True, inferSchema=True)
    .format("csv")
    .load("hdfs://namenode:9000/input/data/employee_records.csv")
)

                                                                                

In [None]:
from pyspark.sql import functions as F
import math
import numpy as np
import pandas as pd


def math_complex_operation(dept_id):
    """Compute ``floor(sqrt(dept_id) + 1)`` for a single scalar value.

    Args:
        dept_id: A non-negative number, or ``None``.

    Returns:
        The result as a ``float``, or ``None`` when ``dept_id`` is ``None``.

    Raises:
        ValueError: If ``dept_id`` is negative (raised by ``math.sqrt``).
    """
    # A null column value reaches a Python UDF as None; pass it through
    # instead of letting math.sqrt raise TypeError and fail the whole task.
    if dept_id is None:
        return None
    # This function is registered as a UDF with return type "float". A plain
    # Python UDF produces null when the returned Python type does not match
    # the declared type, so the int from math.floor is converted explicitly.
    return float(math.floor(math.sqrt(dept_id) + 1))


def np_complex_operation(dept_id: pd.Series) -> pd.Series:
    """Vectorized ``floor(sqrt(x) + 1)`` applied to every element of a Series.

    Args:
        dept_id: Series of numeric values.

    Returns:
        A Series holding ``floor(sqrt(x) + 1)`` for each input element.
    """
    # Square root first, then shift by one, then round down.
    shifted_roots = np.sqrt(dept_id) + 1
    return np.floor(shifted_roots)


# Three implementations of the same calculation, floor(sqrt(department_id) + 1),
# defined side by side:
#   * python_complex_operation - plain Python UDF wrapping math_complex_operation
#   * pandas_complex_operation - pandas UDF wrapping np_complex_operation
#   * spark_complex_operation  - built-in Spark column expression (no UDF)
# NOTE(review): a plain Python UDF yields null when the Python type it returns
# does not match the declared return type, so math_complex_operation has to
# return a float for the "float" declaration below to produce values.
python_complex_operation = F.udf(math_complex_operation, "float")
pandas_complex_operation = F.pandas_udf(np_complex_operation, "float")
# The +1 is added AFTER the square root so this expression matches the two
# Python functions; adding it inside sqrt() would compute floor(sqrt(x + 1)).
spark_complex_operation = F.floor(F.sqrt(F.col("department_id")) + 1)

In [4]:
# Apply each implementation to derive the "calc" column
from pyspark.sql import functions as F

# Derive the "calc" column from department_id three ways: via the Python UDF,
# via the pandas UDF, and via the built-in Spark expression.
emp_python = emp.withColumn(
    colName="calc",
    col=python_complex_operation(F.col("department_id")),
)
emp_pandas = emp.withColumn(
    colName="calc",
    col=pandas_complex_operation(F.col("department_id")),
)
emp_spark = emp.withColumn(colName="calc", col=spark_complex_operation)

In [None]:
# Writing to the "noop" format forces each DataFrame to be fully computed
# without persisting any output.
emp_python.write.save(format="noop", mode="overwrite")
emp_pandas.write.save(format="noop", mode="overwrite")
emp_spark.write.save(format="noop", mode="overwrite")

                                                                                

In [None]:
import joblib


def row_wise_predict(dept_id):
    """Predict for a single value using the broadcast model.

    Reads the module-level broadcast variable ``model_bc`` and calls its
    ``predict`` on a one-row, one-column input.

    Args:
        dept_id: The single feature value to predict for.

    Returns:
        The first (only) prediction converted to ``int``.
    """
    # predict() is given a 2-D input: one sample with one feature.
    single_sample = [[dept_id]]
    predictions = model_bc.value.predict(single_sample)
    return int(predictions[0])


def vectorized_predict(dept_id: pd.Series) -> pd.Series:
    """Predict for a whole batch of values using the broadcast model.

    Reads the module-level broadcast variable ``model_bc`` and calls its
    ``predict`` once on the entire batch.

    Args:
        dept_id: Series of feature values.

    Returns:
        A new Series wrapping the model's predictions for the batch.
    """
    # Reshape the 1-D values into a single-column 2-D array (n_rows, 1).
    feature_matrix = dept_id.values.reshape(-1, 1)
    predictions = model_bc.value.predict(feature_matrix)
    return pd.Series(predictions)


# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code; only load "lr.joblib" from a trusted source.
model = joblib.load("lr.joblib")
# Broadcast the loaded model; the predict functions read it via model_bc.value.
model_bc = spark.sparkContext.broadcast(value=model)
# Wrap both predict functions as UDFs with an "int" result type.
python_predict = F.udf(f=row_wise_predict, returnType="int")
pandas_predict = F.pandas_udf(f=vectorized_predict, returnType="int")

In [7]:
# Add a "pred" column computed from department_id, once with the row-wise
# Python UDF and once with the pandas UDF.
emp_python = emp.withColumn(
    colName="pred",
    col=python_predict(F.col("department_id")),
)
emp_pandas = emp.withColumn(
    colName="pred",
    col=pandas_predict(F.col("department_id")),
)

In [8]:
# Writing to the "noop" format forces both prediction DataFrames to be fully
# computed without persisting any output.
emp_python.write.save(format="noop", mode="overwrite")
emp_pandas.write.save(format="noop", mode="overwrite")

                                                                                

In [9]:
# Stop the Spark session now that the notebook is finished with it.

spark.stop()