In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that forces elements with class `container` to the full
# browser width, so wide DataFrame printouts further down are not squeezed.
# NOTE(review): presumably targets the classic Notebook page wrapper; confirm
# the selector if the notebook is viewed in another front end.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os

# Root directory of the local Spark distribution (passed to findspark.init).
# A non-empty SPARK_HOME environment variable takes precedence so the notebook
# also runs on machines other than the author's; the original hard-coded path
# remains the fallback, so existing behaviour is unchanged when it is unset.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/jose/Frameworks/spark-3.0.2-bin-hadoop2.7'

In [3]:
# import os
# os.environ['PYSPARK_SUBMIT_ARGS'] = 'pyspark-shell'

import findspark
# Point findspark at the Spark distribution so the pyspark imports below resolve.
findspark.init(SPARK_HOME)

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Local-mode configuration: use every available core and cap executor/driver
# memory at 2G and collected driver results at 1G.
config = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('Spark Base')
    .setAll([
        ('spark.executor.memory', '2G'),
        ('spark.driver.memory', '2G'),
        ('spark.driver.maxResultSize', '1G'),
    ])
)

# getOrCreate() reuses an already-running session/context instead of building a
# second one, so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once". On a fresh kernel it creates the
# context from `config` exactly as before.
spark = SparkSession.builder.config(conf=config).getOrCreate()
sc = spark.sparkContext

# Last expression of the cell: renders the session summary.
spark

## UDF

In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Length of a string value as an integer column. A null input is passed through
# as null instead of raising TypeError from len(None) inside the Python worker,
# matching the None guards used by to_upper and add_one.
slen = udf(lambda s: len(s) if s is not None else None, IntegerType())

@udf
def to_upper(s: str) -> str:
    """Return the upper-cased form of ``s``; a null (``None``) input yields null.

    No ``returnType`` is passed to ``udf``, so its default return type applies.
    """
    if s is None:
        return None
    return s.upper()


@udf(returnType=IntegerType())
def add_one(x: int) -> int:
    """Return ``x + 1`` as an integer column value; a null (``None``) input yields null."""
    return None if x is None else x + 1

# Single-row sample frame used to exercise the three UDFs defined above.
df = spark.createDataFrame(
    data=[(1, "John Doe", 21)],
    schema=("id", "name", "age"),
)

# Apply each UDF to its column and print the resulting one-row table.
(
    df
    .select(
        slen("name").alias("slen(name)"),
        to_upper("name"),
        add_one("age"),
    )
    .show()
)

+----------+--------------+------------+
|slen(name)|to_upper(name)|add_one(age)|
+----------+--------------+------------+
|         8|      JOHN DOE|          22|
+----------+--------------+------------+



In [8]:
from pyspark.sql.functions import array, explode, col
from pyspark.sql.types import StructField, StructType, StringType

# Struct returned by multiple_columns: a string field followed by an integer field.
schema = (
    StructType()
    .add("string", StringType())
    .add("number", IntegerType())
)

@udf(returnType=schema)
def multiple_columns(s: str, n: int):
    """Build a two-field struct from a string and an integer.

    Returns a tuple matching ``schema``: ``s`` upper-cased and repeated ``n``
    times (field ``string``), and ``n`` squared (field ``number``).

    If either input is null (``None``) the whole struct is returned as null
    rather than raising inside the Python worker, consistent with the None
    guards in the other UDFs of this notebook.
    """
    if s is None or n is None:
        return None
    return s.upper() * n, n ** 2


# Single-row sample frame for the struct-returning UDF.
df = spark.createDataFrame(
    data=[(1, "John Doe", 21)],
    schema=("id", "name", "age"),
)

# Attach the struct produced by the UDF as "multiple", then surface its
# "string" field as a top-level column; show() is called with truncate=40.
(
    df
    .withColumn("multiple", multiple_columns("name", "age"))
    .withColumn("string", col("multiple").getField("string"))
    .show(truncate=40)
)

+---+--------+---+----------------------------------------+----------------------------------------+
| id|    name|age|                                multiple|                                  string|
+---+--------+---+----------------------------------------+----------------------------------------+
|  1|John Doe| 21|[JOHN DOEJOHN DOEJOHN DOEJOHN DOEJOHN...|JOHN DOEJOHN DOEJOHN DOEJOHN DOEJOHN ...|
+---+--------+---+----------------------------------------+----------------------------------------+

