In [1]:
# Use the findspark library to locate Spark on the local machine.
import os

import findspark

# Prefer a non-empty SPARK_HOME environment variable so the notebook is not
# tied to one machine's directory layout; otherwise fall back to the original
# local install path.
SPARK_HOME = os.environ.get("SPARK_HOME") or r'C:\spark\spark-3.5.0-bin-hadoop3'
findspark.init(SPARK_HOME)

import pyspark  # only import this after findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

# Reuse the active session if one already exists, otherwise create one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [2]:
# Sample rows: (comma-delimited name, languages at school, languages at work,
# current state, previous state).
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

# Explicit schema, built field by field; every column is declared nullable.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("languagesAtSchool", ArrayType(StringType()), True)
    .add("languagesAtWork", ArrayType(StringType()), True)
    .add("currentState", StringType(), True)
    .add("previousState", StringType(), True)
)

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)

+----------------+------------------+---------------+------------+-------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



df.select(df.name, explode(df.languagesAtSchool)):

This part of the code selects two columns from the DataFrame df: the "name" column and a new column generated by applying the explode function to the "languagesAtSchool" column.
The explode function is used to "explode" an array or a map column into multiple rows, with each row containing one element from the array or map.
In this case, explode(df.languagesAtSchool) takes the "languagesAtSchool" column, which the schema above declares as an array of strings, and produces one output row per array element, with the "name" value repeated on each of those rows. Because no alias is supplied, the exploded column gets the default name "col", as the output below shows.
.show(): Finally, the show method is called on the resulting DataFrame to display its contents in the console.

So, the overall purpose of this code is to explode the "languagesAtSchool" array column into multiple rows, with each row containing one language from the array. The "name" column remains the same for each row, indicating the association of each language with a particular name from the original DataFrame df.

In [3]:
from pyspark.sql.functions import explode

# One output row per (name, school language) pair.
exploded_languages = df.select(df.name, explode(df.languagesAtSchool))
exploded_languages.show()

+----------------+------+
|            name|   col|
+----------------+------+
|    James,,Smith|  Java|
|    James,,Smith| Scala|
|    James,,Smith|   C++|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



df.select(split(df.name, ",").alias("nameAsArray")):

This code selects the "name" column from the DataFrame df and applies the split function to it.
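The split function splits a string column into an array of strings wherever the given pattern matches. The pattern is interpreted as a regular expression; in this case it is a plain comma (,), which has no special meaning in a regular expression and therefore acts as a literal delimiter.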
The result of the split function is given the alias "nameAsArray," which means the resulting array will be stored in a new column called "nameAsArray."
.show(): Finally, the show method is called on the resulting DataFrame to display its contents in the console.

So, the overall purpose of this code is to take the "name" column, which contains comma-separated values, and split it into an array of strings. Each element of the resulting array represents a part of the original string that was separated by commas; where two commas are adjacent or a comma ends the string, the corresponding element is an empty string (for example, "James,,Smith" becomes [James, , Smith]). This can be useful for breaking down a single column with delimited values into multiple columns or for further processing the data as arrays in PySpark.

In [4]:
from pyspark.sql.functions import split

# Break the comma-delimited name into an array column called "nameAsArray".
name_parts = split(df.name, ",").alias("nameAsArray")
df.select(name_parts).show()

+--------------------+
|         nameAsArray|
+--------------------+
|    [James, , Smith]|
|   [Michael, Rose, ]|
|[Robert, , Williams]|
+--------------------+



In [5]:
from pyspark.sql.functions import array

# Combine the two state columns into a single array column named "States".
states = array(df.currentState, df.previousState).alias("States")
df.select(df.name, states).show()

+----------------+--------+
|            name|  States|
+----------------+--------+
|    James,,Smith|[OH, CA]|
|   Michael,Rose,|[NY, NJ]|
|Robert,,Williams|[UT, NV]|
+----------------+--------+



In [6]:
from pyspark.sql.functions import array_contains

# Flag, per person, whether "Java" is among the languages learned at school.
knows_java = array_contains(df.languagesAtSchool, "Java").alias("array_contains")
df.select(df.name, knows_java).show()

+----------------+--------------+
|            name|array_contains|
+----------------+--------------+
|    James,,Smith|          true|
|   Michael,Rose,|          true|
|Robert,,Williams|         false|
+----------------+--------------+

