In [1]:
import findspark
# Runs before the first pyspark import below so that the local Spark installation is importable.
findspark.init()

In [3]:
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession named "MyApp" with the local[*] master.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    # Requests the spark-avro package for this session.
    # NOTE(review): no cell in this notebook reads or writes Avro, and the
    # package version should match the installed Spark build -- confirm.
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.0")
    .getOrCreate()
)

# Create the sample DataFrames

In [14]:
# Employee rows: two name strings, a list of languages and a dict of
# properties. None, '' and {} are included on purpose so the functions below
# can be seen handling missing and empty values.
data = (
    ('Alicia', 'Joseph', ['Java', 'Scala', 'Spark'], {'hair': 'black', 'eye': 'brown'}),
    ('Robert', 'Gee', ['Spark', 'Java'], {'hair': 'brown', 'eye': None}),
    ('Mike', 'Bianca', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('John', 'Kumar', None, None),
    ('Jeff', 'L', ['1', '2'], {}),
)
# Only column names are supplied; the column types are left to Spark.
schema = ('FirstName', 'LastName', 'Languages', 'properties')
emp1 = spark.createDataFrame(data=data, schema=schema)

In [5]:
# One row per employee with three separate integer score columns.
# Note: this rebinds the `data` and `schema` names used for emp1 above.
data = (
    ('Robert', 35, 40, 40),
    ('Ram', 31, 33, 29),
    ('John', 95, 89, 91),
)
schema = ('name', 'score1', 'score2', 'score3')
emp2 = spark.createDataFrame(data=data, schema=schema)

In [29]:
# Two integer-list columns per employee; Robert's second list carries a None
# so the array functions below can be seen handling a NULL element.
emp3 = spark.createDataFrame(
    data=(
        ('John', [10, 20, 20], [25, 11, 10]),
        ('Robert', [15, 13, 55], [5, None, 29]),
        ('James', [11, 13, 45], [5, 89, 79]),
    ),
    schema=('empName', 'score_arr1', 'score_arr2'),
)

In [15]:
# Preview emp1. With no arguments, long cell values are cut short (see the "ha..." entries below).
emp1.show()

+---------+--------+--------------------+--------------------+
|FirstName|LastName|           Languages|          properties|
+---------+--------+--------------------+--------------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|
|     John|   Kumar|                NULL|                NULL|
|     Jeff|       L|              [1, 2]|                  {}|
+---------+--------+--------------------+--------------------+



In [10]:
# Preview emp2 (one name column plus three score columns).
emp2.show()


+------+------+------+------+
|  name|score1|score2|score3|
+------+------+------+------+
|Robert|    35|    40|    40|
|   Ram|    31|    33|    29|
|  John|    95|    89|    91|
+------+------+------+------+



In [30]:
# Preview emp3 (two array columns; score_arr2 contains a NULL element for Robert).
emp3.show()

+-------+------------+-------------+
|empName|  score_arr1|   score_arr2|
+-------+------------+-------------+
|   John|[10, 20, 20]| [25, 11, 10]|
| Robert|[15, 13, 55]|[5, NULL, 29]|
|  James|[11, 13, 45]|  [5, 89, 79]|
+-------+------------+-------------+



# array functions

### 01. size of array/map

In [18]:
from pyspark.sql.functions import size
# size() counts the elements of an array column or the entries of a map column.
# In the recorded output the row with NULL inputs (John) yields NULL, and the
# empty map (Jeff) yields 0.
(
    emp1
    .withColumn('# of langauge', size('languages'))
    .withColumn('# of properties', size('properties'))
    .show()
)

+---------+--------+--------------------+--------------------+-------------+---------------+
|FirstName|LastName|           Languages|          properties|# of langauge|# of properties|
+---------+--------+--------------------+--------------------+-------------+---------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|            3|              2|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|            2|              2|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|            2|              2|
|     John|   Kumar|                NULL|                NULL|         NULL|           NULL|
|     Jeff|       L|              [1, 2]|                  {}|            2|              0|
+---------+--------+--------------------+--------------------+-------------+---------------+



### 02. element_at

In [20]:
from pyspark.sql.functions import element_at
# element_at() looks up by position for arrays and by key for maps.
# Array positions are 1-based: position 2 of [Java, Scala, Spark] is Scala.
# A key that is absent from the map (Jeff's empty map) comes back as NULL.
(
    emp1
    .withColumn('langauge[2]', element_at('languages', 2))
    .withColumn('properties[eye]', element_at('properties', 'eye'))
    .show()
)

+---------+--------+--------------------+--------------------+-----------+---------------+
|FirstName|LastName|           Languages|          properties|langauge[2]|properties[eye]|
+---------+--------+--------------------+--------------------+-----------+---------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|      Scala|          brown|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|       Java|           NULL|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|           |               |
|     John|   Kumar|                NULL|                NULL|       NULL|           NULL|
|     Jeff|       L|              [1, 2]|                  {}|          2|           NULL|
+---------+--------+--------------------+--------------------+-----------+---------------+



### 03. Create array / struct

In [22]:
from pyspark.sql.functions import struct,array
# Pack the three scalar score columns into a single struct column and,
# separately, into a single array column.
(
    emp2
    .withColumn('struct of scores', struct('score1', 'score2', 'score3'))
    .withColumn('array of scores', array('score1', 'score2', 'score3'))
    .show()
)

+------+------+------+------+----------------+---------------+
|  name|score1|score2|score3|struct of scores|array of scores|
+------+------+------+------+----------------+---------------+
|Robert|    35|    40|    40|    {35, 40, 40}|   [35, 40, 40]|
|   Ram|    31|    33|    29|    {31, 33, 29}|   [31, 33, 29]|
|  John|    95|    89|    91|    {95, 89, 91}|   [95, 89, 91]|
+------+------+------+------+----------------+---------------+



### 04. array of max/min

In [31]:
from pyspark.sql.functions import array_max,array_min
# Largest and smallest element of score_arr1, computed per row.
(
    emp3
    .withColumn('max of array', array_max('score_arr1'))
    .withColumn('min of array', array_min('score_arr1'))
    .show()
)

+-------+------------+-------------+------------+------------+
|empName|  score_arr1|   score_arr2|max of array|min of array|
+-------+------------+-------------+------------+------------+
|   John|[10, 20, 20]| [25, 11, 10]|          20|          10|
| Robert|[15, 13, 55]|[5, NULL, 29]|          55|          13|
|  James|[11, 13, 45]|  [5, 89, 79]|          45|          11|
+-------+------------+-------------+------------+------------+



### 04. array_distinct

In [33]:
from pyspark.sql.functions import array_distinct
# Drop repeated elements inside each array ([10, 20, 20] -> [10, 20]).
(
    emp3
    .withColumn('distinct of array', array_distinct('score_arr1'))
    .show()
)

+-------+------------+-------------+-----------------+
|empName|  score_arr1|   score_arr2|distinct of array|
+-------+------------+-------------+-----------------+
|   John|[10, 20, 20]| [25, 11, 10]|         [10, 20]|
| Robert|[15, 13, 55]|[5, NULL, 29]|     [15, 13, 55]|
|  James|[11, 13, 45]|  [5, 89, 79]|     [11, 13, 45]|
+-------+------------+-------------+-----------------+



### 04.array_repeat

In [38]:
from pyspark.sql.functions import array_repeat
# Repeat the whole score_arr1 value 5 times; the result is an array of arrays,
# not a flattened array. truncate=False prints the full nested value.
(
    emp3
    .withColumn('repeat of array', array_repeat('score_arr1', 5))
    .show(truncate=False)
)

+-------+------------+-------------+----------------------------------------------------------------------+
|empName|score_arr1  |score_arr2   |repeat of array                                                       |
+-------+------------+-------------+----------------------------------------------------------------------+
|John   |[10, 20, 20]|[25, 11, 10] |[[10, 20, 20], [10, 20, 20], [10, 20, 20], [10, 20, 20], [10, 20, 20]]|
|Robert |[15, 13, 55]|[5, NULL, 29]|[[15, 13, 55], [15, 13, 55], [15, 13, 55], [15, 13, 55], [15, 13, 55]]|
|James  |[11, 13, 45]|[5, 89, 79]  |[[11, 13, 45], [11, 13, 45], [11, 13, 45], [11, 13, 45], [11, 13, 45]]|
+-------+------------+-------------+----------------------------------------------------------------------+



### 05.slice

In [42]:
# Import under an alias: a bare `import slice` would rebind Python's built-in
# `slice` to the Spark function for the rest of the kernel session.
from pyspark.sql.functions import slice as array_slice

# Take 2 elements starting at position 1. Positions are 1-based, so this keeps
# the first two elements of each array ([10, 20, 20] -> [10, 20]).
(
    emp3
    .withColumn('slice of array', array_slice('score_arr1', 1, 2))
    .show(truncate=False)
)

+-------+------------+-------------+--------------+
|empName|score_arr1  |score_arr2   |slice of array|
+-------+------------+-------------+--------------+
|John   |[10, 20, 20]|[25, 11, 10] |[10, 20]      |
|Robert |[15, 13, 55]|[5, NULL, 29]|[15, 13]      |
|James  |[11, 13, 45]|[5, 89, 79]  |[11, 13]      |
+-------+------------+-------------+--------------+



### 06. array position

In [43]:
from pyspark.sql.functions import array_position
# 1-based position of the value 10 within score_arr1.
# Rows whose array does not contain 10 show 0 in the recorded output.
(
    emp3
    .withColumn('position of element in array', array_position('score_arr1', 10))
    .show(truncate=False)
)

+-------+------------+-------------+----------------------------+
|empName|score_arr1  |score_arr2   |position of element in array|
+-------+------------+-------------+----------------------------+
|John   |[10, 20, 20]|[25, 11, 10] |1                           |
|Robert |[15, 13, 55]|[5, NULL, 29]|0                           |
|James  |[11, 13, 45]|[5, 89, 79]  |0                           |
+-------+------------+-------------+----------------------------+



### 07.sort_array

In [44]:
from pyspark.sql.functions import sort_array
# Sort each array: ascending is the default, asc=False reverses the order.
(
    emp3
    .withColumn('asc sort', sort_array('score_arr1'))
    .withColumn('desc sort', sort_array('score_arr1', asc=False))
    .show(truncate=False)
)

+-------+------------+-------------+------------+------------+
|empName|score_arr1  |score_arr2   |asc sort    |desc sort   |
+-------+------------+-------------+------------+------------+
|John   |[10, 20, 20]|[25, 11, 10] |[10, 20, 20]|[20, 20, 10]|
|Robert |[15, 13, 55]|[5, NULL, 29]|[13, 15, 55]|[55, 15, 13]|
|James  |[11, 13, 45]|[5, 89, 79]  |[11, 13, 45]|[45, 13, 11]|
+-------+------------+-------------+------------+------------+



### 08.array_contains

In [46]:
from pyspark.sql.functions import array_contains
# Boolean flag per row: does score_arr1 contain the value 11?
(
    emp3
    .withColumn('array contains element', array_contains('score_arr1', 11))
    .show(truncate=False)
)

+-------+------------+-------------+----------------------+
|empName|score_arr1  |score_arr2   |array contains element|
+-------+------------+-------------+----------------------+
|John   |[10, 20, 20]|[25, 11, 10] |false                 |
|Robert |[15, 13, 55]|[5, NULL, 29]|false                 |
|James  |[11, 13, 45]|[5, 89, 79]  |true                  |
+-------+------------+-------------+----------------------+



### 09.array_union

In [47]:
from pyspark.sql.functions import array_union
# Elements found in either array, with duplicates removed
# ([10, 20, 20] and [25, 11, 10] -> [10, 20, 25, 11]).
(
    emp3
    .withColumn('arrays union', array_union('score_arr1', 'score_arr2'))
    .show(truncate=False)
)

+-------+------------+-------------+-------------------------+
|empName|score_arr1  |score_arr2   |arrays union             |
+-------+------------+-------------+-------------------------+
|John   |[10, 20, 20]|[25, 11, 10] |[10, 20, 25, 11]         |
|Robert |[15, 13, 55]|[5, NULL, 29]|[15, 13, 55, 5, NULL, 29]|
|James  |[11, 13, 45]|[5, 89, 79]  |[11, 13, 45, 5, 89, 79]  |
+-------+------------+-------------+-------------------------+



### 10.array_except

In [48]:
from pyspark.sql.functions import array_except
# Elements of score_arr1 that do not appear in score_arr2, without duplicates
# ([10, 20, 20] minus [25, 11, 10] -> [20]).
(
    emp3
    .withColumn('arrays excpet', array_except('score_arr1', 'score_arr2'))
    .show(truncate=False)
)

+-------+------------+-------------+-------------+
|empName|score_arr1  |score_arr2   |arrays excpet|
+-------+------------+-------------+-------------+
|John   |[10, 20, 20]|[25, 11, 10] |[20]         |
|Robert |[15, 13, 55]|[5, NULL, 29]|[15, 13, 55] |
|James  |[11, 13, 45]|[5, 89, 79]  |[11, 13, 45] |
+-------+------------+-------------+-------------+



### 11.array_intersect

In [49]:
from pyspark.sql.functions import array_intersect
# Elements present in both arrays; rows with nothing in common give [].
(
    emp3
    .withColumn('arrays intersect', array_intersect('score_arr1', 'score_arr2'))
    .show(truncate=False)
)

+-------+------------+-------------+----------------+
|empName|score_arr1  |score_arr2   |arrays intersect|
+-------+------------+-------------+----------------+
|John   |[10, 20, 20]|[25, 11, 10] |[10]            |
|Robert |[15, 13, 55]|[5, NULL, 29]|[]              |
|James  |[11, 13, 45]|[5, 89, 79]  |[]              |
+-------+------------+-------------+----------------+



### 12.array_join

In [51]:
from pyspark.sql.functions import array_join #-> convert to string same like '_'.join(l1)
# Concatenate the array elements into one string, separated by '+'
# ([10, 20, 20] -> "10+20+20").
(
    emp3
    .withColumn('arrays join', array_join('score_arr1', '+'))
    .show(truncate=False)
)

+-------+------------+-------------+-----------+
|empName|score_arr1  |score_arr2   |arrays join|
+-------+------------+-------------+-----------+
|John   |[10, 20, 20]|[25, 11, 10] |10+20+20   |
|Robert |[15, 13, 55]|[5, NULL, 29]|15+13+55   |
|James  |[11, 13, 45]|[5, 89, 79]  |11+13+45   |
+-------+------------+-------------+-----------+



### 13.arrays_zip

In [54]:
from pyspark.sql.functions import arrays_zip
# Pair the arrays up element by element: the i-th struct holds the i-th
# element of each input array. A NULL element is carried through as NULL.
(
    emp3
    .withColumn('arrays zip', arrays_zip('score_arr1', 'score_arr2'))
    .show(truncate=False)
)

+-------+------------+-------------+-------------------------------+
|empName|score_arr1  |score_arr2   |arrays zip                     |
+-------+------------+-------------+-------------------------------+
|John   |[10, 20, 20]|[25, 11, 10] |[{10, 25}, {20, 11}, {20, 10}] |
|Robert |[15, 13, 55]|[5, NULL, 29]|[{15, 5}, {13, NULL}, {55, 29}]|
|James  |[11, 13, 45]|[5, 89, 79]  |[{11, 5}, {13, 89}, {45, 79}]  |
+-------+------------+-------------+-------------------------------+



### 14.arrays_overlap

In [56]:
from pyspark.sql.functions import arrays_overlap
# true when the arrays share at least one element, false when they share none.
# In the recorded output Robert's row is NULL: no common element was found,
# and score_arr2 contains a NULL element.
(
    emp3
    .withColumn('arrays overlap', arrays_overlap('score_arr1', 'score_arr2'))
    .show(truncate=False)
)

+-------+------------+-------------+--------------+
|empName|score_arr1  |score_arr2   |arrays overlap|
+-------+------------+-------------+--------------+
|John   |[10, 20, 20]|[25, 11, 10] |true          |
|Robert |[15, 13, 55]|[5, NULL, 29]|NULL          |
|James  |[11, 13, 45]|[5, 89, 79]  |false         |
+-------+------------+-------------+--------------+



### 15. shuffle

In [57]:
from pyspark.sql.functions import shuffle
# Randomly reorder the elements of each array.
# The order is random, so re-running this cell will generally not reproduce
# the output recorded below.
(
    emp3
    .withColumn('arrays shuffle', shuffle('score_arr1'))
    .show(truncate=False)
)

+-------+------------+-------------+--------------+
|empName|score_arr1  |score_arr2   |arrays shuffle|
+-------+------------+-------------+--------------+
|John   |[10, 20, 20]|[25, 11, 10] |[20, 20, 10]  |
|Robert |[15, 13, 55]|[5, NULL, 29]|[15, 55, 13]  |
|James  |[11, 13, 45]|[5, 89, 79]  |[13, 11, 45]  |
+-------+------------+-------------+--------------+



# Map functions

### 01.create map

In [58]:
from pyspark.sql.functions import create_map
# Build a one-entry map per row: FirstName is the key, LastName the value.
(
    emp1
    .withColumn('create map', create_map('FirstName', 'LastName'))
    .show()
)

+---------+--------+--------------------+--------------------+------------------+
|FirstName|LastName|           Languages|          properties|        create map|
+---------+--------+--------------------+--------------------+------------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|{Alicia -> Joseph}|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|   {Robert -> Gee}|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|  {Mike -> Bianca}|
|     John|   Kumar|                NULL|                NULL|   {John -> Kumar}|
|     Jeff|       L|              [1, 2]|                  {}|       {Jeff -> L}|
+---------+--------+--------------------+--------------------+------------------+



### 02. create map from arrays

In [63]:
# NOTE(review): array_distinct is imported again here but not used in this cell.
from pyspark.sql.functions import map_from_arrays,array_distinct

# score_arr1 is used as the key array and can hold repeated values
# (John: [10, 20, 20]), which would give repeated map keys. LAST_WIN makes
# Spark keep the last entry for a repeated key.
# This is a session-wide setting, so it stays in effect for later cells too.
spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN")

# Keys come from the first array and values from the second; both are
# score_arr1 here, so every key maps to itself.
(
    emp3
    .withColumn('create map', map_from_arrays('score_arr1', 'score_arr1'))
    .show(truncate=False)
)

+-------+------------+-------------+------------------------------+
|empName|score_arr1  |score_arr2   |create map                    |
+-------+------------+-------------+------------------------------+
|John   |[10, 20, 20]|[25, 11, 10] |{10 -> 10, 20 -> 20}          |
|Robert |[15, 13, 55]|[5, NULL, 29]|{15 -> 15, 13 -> 13, 55 -> 55}|
|James  |[11, 13, 45]|[5, 89, 79]  |{11 -> 11, 13 -> 13, 45 -> 45}|
+-------+------------+-------------+------------------------------+



### 03.map keys

In [64]:
from pyspark.sql.functions import map_keys
# Array of the keys of the properties map; NULL map -> NULL, empty map -> [].
(
    emp1
    .withColumn('get keys of map', map_keys('properties'))
    .show()
)

+---------+--------+--------------------+--------------------+---------------+
|FirstName|LastName|           Languages|          properties|get keys of map|
+---------+--------+--------------------+--------------------+---------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|    [eye, hair]|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|    [eye, hair]|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|    [eye, hair]|
|     John|   Kumar|                NULL|                NULL|           NULL|
|     Jeff|       L|              [1, 2]|                  {}|             []|
+---------+--------+--------------------+--------------------+---------------+



### 04.map_values()

In [65]:
from pyspark.sql.functions import map_values
# Array of the values of the properties map; NULL and '' values are kept as-is.
(
    emp1
    .withColumn('get values of map', map_values('properties'))
    .show()
)

+---------+--------+--------------------+--------------------+-----------------+
|FirstName|LastName|           Languages|          properties|get values of map|
+---------+--------+--------------------+--------------------+-----------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|   [brown, black]|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|    [NULL, brown]|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|          [, red]|
|     John|   Kumar|                NULL|                NULL|             NULL|
|     Jeff|       L|              [1, 2]|                  {}|               []|
+---------+--------+--------------------+--------------------+-----------------+



### 05.map_concat

In [67]:
from pyspark.sql.functions import map_concat
# Merge the properties map with itself. Every key therefore occurs twice, and
# the result equals the input because the session was switched to
# spark.sql.mapKeyDedupPolicy=LAST_WIN in the map_from_arrays cell above.
# NOTE(review): presumably this raises on duplicate keys under Spark's default
# dedup policy -- confirm before running this cell on its own.
(
    emp1
    .withColumn('map_concat', map_concat('properties', 'properties'))
    .show(truncate=False)
)

+---------+--------+--------------------+-----------------------------+-----------------------------+
|FirstName|LastName|Languages           |properties                   |map_concat                   |
+---------+--------+--------------------+-----------------------------+-----------------------------+
|Alicia   |Joseph  |[Java, Scala, Spark]|{eye -> brown, hair -> black}|{eye -> brown, hair -> black}|
|Robert   |Gee     |[Spark, Java]       |{eye -> NULL, hair -> brown} |{eye -> NULL, hair -> brown} |
|Mike     |Bianca  |[CSharp, ]          |{eye -> , hair -> red}       |{eye -> , hair -> red}       |
|John     |Kumar   |NULL                |NULL                         |NULL                         |
|Jeff     |L       |[1, 2]              |{}                           |{}                           |
+---------+--------+--------------------+-----------------------------+-----------------------------+



In [68]:
# Shut down the SparkSession created at the top of the notebook; cells using `spark` cannot be re-run after this.
spark.stop()