In [7]:
import pandas as pd
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType
from pyspark.sql import SparkSession

# Session shared by every cell below. getOrCreate() hands back the session that
# is already running if there is one; otherwise it builds one named "Practice".
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [3]:
def squared(s):
  """Return ``s`` multiplied by itself, or ``None`` when ``s`` is ``None``.

  Plain Python function intended to be wrapped as a Spark UDF. Spark hands a
  SQL NULL to a Python UDF as ``None``; without the guard ``None * None``
  raises ``TypeError`` and fails the whole job.

  Args:
    s: The value to square (anything supporting ``*``), or ``None``.

  Returns:
    ``s * s``, or ``None`` for a ``None`` input.
  """
  if s is None:
    return None
  return s * s

# Expose `squared` to Spark SQL under the name "squaredWithPython". The return
# type is passed explicitly: register() falls back to StringType when it is
# omitted, which would make id_squared a string column in SQL queries.
spark.udf.register("squaredWithPython", squared, LongType())

# Wrap the same Python function for the DataFrame API; use squared_udf(...)
# on columns inside select()/withColumn().
squared_udf = udf(squared, LongType())

# Load the semicolon-delimited cars file (header row, inferred column types),
# publish it as the temp view "test", then read that view back as a DataFrame.
df = spark.read.csv('cars.csv', header=True, sep=";", inferSchema=True)
df.createOrReplaceTempView("test")
df = spark.table("test")

22/05/01 12:16:02 WARN SimpleFunctionRegistry: The function squaredwithpython replaced a previously registered function.


In [21]:
# Replace the contents of the "test" view with a single-column frame of ids
# 1..19 (the end bound of range() is exclusive). Previous CSV source, kept for
# reference:
# df = spark.read.csv('cars.csv', header=True, sep=";", inferSchema=True)
df = spark.range(1, 20)
df.createOrReplaceTempView("test")
df = spark.table("test")

In [22]:
# Print the rows of the "test" view. 20 rows with truncated cells are show()'s
# defaults, spelled out here.
df.show(n=20, truncate=True)

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [24]:
# DataFrame-API route: apply squared_udf to the id column and print the first
# 10 rows alongside the original id.
(
    df
    .select("id", squared_udf("id").alias("id_squared"))
    .show(10)
)

+---+----------+
| id|id_squared|
+---+----------+
|  1|         1|
|  2|         4|
|  3|         9|
|  4|        16|
|  5|        25|
|  6|        36|
|  7|        49|
|  8|        64|
|  9|        81|
| 10|       100|
+---+----------+
only showing top 10 rows



In [32]:
# Decorator form: wraps the function as a UDF returning "long" in one step, so
# it can be called on DataFrame columns by its Python name. Unlike
# spark.udf.register(), this does NOT make it callable from SQL strings.
@udf("long")
def squared_udf2(s):
  """Return s * s, passing None (SQL NULL) through instead of raising."""
  if s is None:
    return None
  return s * s

# Re-read the "test" view, then print every id next to its square as computed
# by the decorator-defined UDF.
df = spark.table("test")
(
    df
    .select("id", squared_udf2("id").alias("id_squared"))
    .show()
)

+---+----------+
| id|id_squared|
+---+----------+
|  1|         1|
|  2|         4|
|  3|         9|
|  4|        16|
|  5|        25|
|  6|        36|
|  7|        49|
|  8|        64|
|  9|        81|
| 10|       100|
| 11|       121|
| 12|       144|
| 13|       169|
| 14|       196|
| 15|       225|
| 16|       256|
| 17|       289|
| 18|       324|
| 19|       361|
+---+----------+



In [27]:
# SQL route: call the UDF registered as "squaredWithPython" from a query
# against the "test" view and print the result.
(
    spark
    .sql("select id, squaredWithPython(id) as id_squared from test")
    .show()
)

+---+----------+
| id|id_squared|
+---+----------+
|  1|         1|
|  2|         4|
|  3|         9|
|  4|        16|
|  5|        25|
|  6|        36|
|  7|        49|
|  8|        64|
|  9|        81|
| 10|       100|
| 11|       121|
| 12|       144|
| 13|       169|
| 14|       196|
| 15|       225|
| 16|       256|
| 17|       289|
| 18|       324|
| 19|       361|
+---+----------+



In [34]:
# Print the physical plan for the same SQL query to see how Spark schedules the
# Python UDF.
(
    spark
    .sql("select id, squaredWithPython(id) as id_squared from test")
    .explain()
)

== Physical Plan ==
*(2) Project [id#144L, pythonUDF0#254 AS id_squared#250]
+- BatchEvalPython [squaredWithPython(id#144L)], [pythonUDF0#254]
   +- *(1) Range (1, 20, step=1, splits=20)


