- Author: Ben Du
- Date: 2020-09-05 14:56:47
- Title: Collection Functions in Spark
- Slug: pyspark-func-collection
- Category: Computer Science
- Tags: programming, Spark, DataFrame, collection, Spark SQL, functions, modulus, func, fun
- Modified: 2020-09-05 14:56:47


## Tips and Traps

1. When converting a pandas DataFrame to a Spark DataFrame, 
    the `list` type is converted to `ArrayType`
    while the `tuple` type is converted to `StructType`.

In [1]:
from typing import List, Tuple
import pandas as pd

In [27]:
from pathlib import Path
import findspark
#findspark.init(str(next(Path("/opt").glob("spark-3*"))))
findspark.init("/opt/spark-3.0.1-bin-hadoop3.2/")
#findspark.init("/opt/spark-2.3.0-bin-hadoop2.7")

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, ArrayType

# Create (or reuse) a Hive-enabled SparkSession for the examples below.
# NOTE(review): the app name "PySpark_Str_Func" looks carried over from a
# string-functions notebook; confirm nothing relies on it before renaming.
spark = SparkSession.builder.appName("PySpark_Str_Func") \
    .enableHiveSupport().getOrCreate()

## ArrayType

In [3]:
# Build a Spark DataFrame from a pandas DataFrame.
# col1 holds Python lists, so it becomes an array column in Spark.
df = spark.createDataFrame(
    pd.DataFrame(
        data=[([1, 2], "how", 1), ([2, 3], "are", 2), ([3, 4], "you", 3)],
        columns=["col1", "col2", "col3"]
    )
)
df.show()

+------+----+----+
|  col1|col2|col3|
+------+----+----+
|[1, 2]| how|   1|
|[2, 3]| are|   2|
|[3, 4]| you|   3|
+------+----+----+



In [4]:
# element_at uses 1-based positions: index 1 returns the first array element.
df.select(element_at(col("col1"), 1).alias("word")).show()

+----+
|word|
+----+
|   1|
|   2|
|   3|
+----+



In [5]:
# Same lookup, passing the column name as a string instead of a Column object.
df.select(element_at("col1", 1).alias("word")).show()

+----+
|word|
+----+
|   1|
|   2|
|   3|
+----+



In [10]:
@udf(returnType=ArrayType(IntegerType()))
def my_udf(x: int) -> List:
    """Pair the input value with the constant 1 in a two-element list.

    The function is registered as a Spark UDF whose declared return type is
    an array of integers, so the returned list becomes an array column.
    """
    pair = [x, 1]
    return pair

In [11]:
# Apply the UDF to col3; each row of f1 is the array [col3, 1].
df1 = df.select(my_udf("col3").alias("f1"))
df1.show()

+------+
|    f1|
+------+
|[1, 1]|
|[2, 1]|
|[3, 1]|
+------+



In [12]:
# Inspect the schema: f1 is an array of integers, as declared on the UDF.
df1.schema

StructType(List(StructField(f1,ArrayType(IntegerType,true),true)))

In [13]:
# Split the array into separate columns by (1-based) position.
df1.select(element_at("f1", 1).alias("v1"), element_at("f1", 2).alias("v2")).show()

+---+---+
| v1| v2|
+---+---+
|  1|  1|
|  2|  1|
|  3|  1|
+---+---+



## StructType

In [41]:
# Same data as before, but col1 holds Python tuples instead of lists,
# so it becomes a struct column (fields _1 and _2) in Spark.
df2 = spark.createDataFrame(
    pd.DataFrame(
        data=[((1, 2), "how", 1), ((2, 3), "are", 2), ((3, 4), "you", 3)],
        columns=["col1", "col2", "col3"]
    )
)
df2.show()

+------+----+----+
|  col1|col2|col3|
+------+----+----+
|[1, 2]| how|   1|
|[2, 3]| are|   2|
|[3, 4]| you|   3|
+------+----+----+



In [42]:
# Inspect the schema: col1 is a struct with fields _1 and _2.
df2.schema

StructType(List(StructField(col1,StructType(List(StructField(_1,LongType,true),StructField(_2,LongType,true))),true),StructField(col2,StringType,true),StructField(col3,LongType,true)))

In [43]:
# "col1.*" expands every field of the struct into its own column.
df2.select("col1.*").show()

+---+---+
| _1| _2|
+---+---+
|  1|  2|
|  2|  3|
|  3|  4|
+---+---+



In [44]:
@udf(
    returnType=StructType(
        [StructField(name, IntegerType(), nullable=True) for name in ("_1", "_2")]
    )
)
def my_udf2(x: int) -> Tuple:
    """Pair the input value with the constant 1 in a two-element tuple.

    The function is registered as a Spark UDF whose declared return type is
    a struct with two nullable integer fields, ``_1`` and ``_2``, so the
    returned tuple becomes a struct column.
    """
    return x, 1

In [45]:
# Apply the struct-returning UDF to col3. Note this reads from df (the first
# DataFrame), not df2; both were built with the same col3 values.
df3 = df.select(my_udf2("col3").alias("f1"))
df3.show()

+------+
|    f1|
+------+
|[1, 1]|
|[2, 1]|
|[3, 1]|
+------+



In [46]:
# Inspect the schema: f1 is a struct with integer fields _1 and _2.
df3.schema

StructType(List(StructField(f1,StructType(List(StructField(_1,IntegerType,true),StructField(_2,IntegerType,true))),true)))

In [47]:
# Expand all fields of the struct column into top-level columns.
df3.select("f1.*").show()

+---+---+
| _1| _2|
+---+---+
|  1|  1|
|  2|  1|
|  3|  1|
+---+---+



In [48]:
# Struct fields can also be picked individually with dotted paths and renamed.
df3.select(col("f1._1").alias("v1"), col("f1._2").alias("v2")).show()

+---+---+
| v1| v2|
+---+---+
|  1|  1|
|  2|  1|
|  3|  1|
+---+---+



## References 

https://obstkel.com/spark-sql-functions

https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/sql/functions.html

https://stackoverflow.com/questions/36840563/how-to-return-a-tuple-type-in-a-udf-in-pyspark