In [1]:
# findspark is used to locate the Spark installation on the local machine.
# NOTE(review): the path below is machine-specific; adjust it to your own install.
import findspark
findspark.init(r'C:\spark\spark-3.5.0-bin-hadoop3')
import pyspark  # must only be imported after findspark.init() has run
from pyspark.sql import SparkSession

# Build (or reuse) a session with master "local[1]".
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [2]:
# Column names for the sample DataFrame.
columns = ["name", "languagesAtSchool", "currentState"]

# One tuple per row: a comma-delimited name string, a list of languages, a state code.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], "NV"),
]

df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+----------------+------------------+------------+
|name            |languagesAtSchool |currentState|
+----------------+------------------+------------+
|James,,Smith    |[Java, Scala, C++]|CA          |
|Michael,Rose,   |[Spark, Java, C++]|NJ          |
|Robert,,Williams|[CSharp, VB]      |NV          |
+----------------+------------------+------------+




df.withColumn("languagesAtSchool", concat_ws(",", col("languagesAtSchool"))):
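It creates a new DataFrame df2 in which the existing "languagesAtSchool" column is replaced: withColumn overwrites a column when the given name already exists, and the original DataFrame df is left unchanged.
The values for the replaced "languagesAtSchool" column are generated using the concat_ws function.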
concat_ws stands for "concatenate with separator." It is used to concatenate the values of one or more string columns, or the elements of an array-of-strings column (as here), into a single string, separated by a specified delimiter (in this case, a comma ",").
col("languagesAtSchool") selects the "languagesAtSchool" column from the original DataFrame df.
So, the code takes the array in the "languagesAtSchool" column of each row of the original DataFrame df, joins its elements into a single string with comma separators, and stores that string in the "languagesAtSchool" column of the DataFrame df2; as the printed schemas show, the column's type changes from array to string. This can be useful for formatting or reshaping data within your DataFrame.

In [3]:
from pyspark.sql.functions import col, concat_ws

# Join each row's languagesAtSchool array into one comma-separated string,
# then replace the column of the same name with that string in a new DataFrame.
languages_joined = concat_ws(",", col("languagesAtSchool"))
df2 = df.withColumn("languagesAtSchool", languages_joined)

df2.printSchema()
df2.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: string (nullable = false)
 |-- currentState: string (nullable = true)

+----------------+-----------------+------------+
|name            |languagesAtSchool|currentState|
+----------------+-----------------+------------+
|James,,Smith    |Java,Scala,C++   |CA          |
|Michael,Rose,   |Spark,Java,C++   |NJ          |
|Robert,,Williams|CSharp,VB        |NV          |
+----------------+-----------------+------------+



In [4]:
# Register df as a temporary view so the same conversion can be written in Spark SQL.
df.createOrReplaceTempView("ARRAY_STRING")

# concat_ws is applied in SQL to the languagesAtSchool column of the view.
array_to_string_sql = "select name, concat_ws(',',languagesAtSchool) as languagesAtSchool,currentState from ARRAY_STRING"
spark.sql(array_to_string_sql).show(truncate=False)

+----------------+-----------------+------------+
|name            |languagesAtSchool|currentState|
+----------------+-----------------+------------+
|James,,Smith    |Java,Scala,C++   |CA          |
|Michael,Rose,   |Spark,Java,C++   |NJ          |
|Robert,,Williams|CSharp,VB        |NV          |
+----------------+-----------------+------------+

