In [4]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Obtain the Spark session for this notebook under the application name
# "arrayTypeFunctions"; every later cell builds its DataFrames through it.
spark = (
    SparkSession.builder
    .appName("arrayTypeFunctions")
    .getOrCreate()
)

In [3]:
# Sample rows: (id, name, skills), where skills is a list of strings so that
# Spark infers an array<string> column.
# Each skill must be its own list element. Writing 'ca,maths' inside a single
# pair of quotes creates ONE string containing a comma, not two skills, and
# the explode example that follows would then emit 3 rows for Raman instead
# of the 5 rows shown in its output.
data = [
    (1, 'kunj', ['machine learning', 'spark', 'sql', 'de', 'azure']),
    (2, 'Raman', ['accounting', 'ca', 'maths', 'calculator', 'manager']),
]

columns = ['id', 'name', 'skills']
df = spark.createDataFrame(data=data, schema=columns)

# Show the rows, then the inferred schema.
df.show()

df.printSchema()

+---+-----+--------------------+
| id| name|              skills|
+---+-----+--------------------+
|  1| kunj|[machine learning...|
|  2|Raman|[accounting, ca, ...|
+---+-----+--------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [6]:
# explode() turns an array column into rows: each element of "skills" becomes
# its own row in the new "skill" column, with the other columns repeated.
df2 = df.withColumn(
    "skill",
    explode(col("skills")),
)
df2.show()


+---+-----+--------------------+----------------+
| id| name|              skills|           skill|
+---+-----+--------------------+----------------+
|  1| kunj|[machine learning...|machine learning|
|  1| kunj|[machine learning...|           spark|
|  1| kunj|[machine learning...|             sql|
|  1| kunj|[machine learning...|              de|
|  1| kunj|[machine learning...|           azure|
|  2|Raman|[accounting, ca, ...|      accounting|
|  2|Raman|[accounting, ca, ...|              ca|
|  2|Raman|[accounting, ca, ...|           maths|
|  2|Raman|[accounting, ca, ...|      calculator|
|  2|Raman|[accounting, ca, ...|         manager|
+---+-----+--------------------+----------------+



In [14]:
# Input for the split() example in the next cell: here "skills" is a single
# comma-delimited string per row rather than an array.
columns = ["id", "name", "skills"]
data = [
    (1, "kunj", "spark,sql"),
    (2, "Raman", "accounting,ca"),
]
df = spark.createDataFrame(data, schema=columns)

# Show the rows, then the inferred schema.
df.show()
df.printSchema()

+---+-----+-------------+
| id| name|       skills|
+---+-----+-------------+
|  1| kunj|    spark,sql|
|  2|Raman|accounting,ca|
+---+-----+-------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)



In [15]:
# split() breaks the "skills" string on "," and stores the pieces as an array
# in the new "skill" column.
df2 = df.withColumn("skill", split(df["skills"], ","))
df2.show()

+---+-----+-------------+----------------+
| id| name|       skills|           skill|
+---+-----+-------------+----------------+
|  1| kunj|    spark,sql|    [spark, sql]|
|  2|Raman|accounting,ca|[accounting, ca]|
+---+-----+-------------+----------------+



In [16]:
# Input for the array() example: each person has two separate string columns,
# one for the primary skill and one for the secondary skill.
columns = ["id", "name", "primskills", "secskills"]
data = [
    (1, "kunj", "spark", "sql"),
    (2, "Raman", "accounting", "ca"),
]
df = spark.createDataFrame(data, schema=columns)

# Show the rows, then the inferred schema.
df.show()
df.printSchema()

+---+-----+----------+---------+
| id| name|primskills|secskills|
+---+-----+----------+---------+
|  1| kunj|     spark|      sql|
|  2|Raman|accounting|       ca|
+---+-----+----------+---------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- primskills: string (nullable = true)
 |-- secskills: string (nullable = true)



In [17]:
# array() combines the two skill columns into one array column per row.
df.withColumn(
    "skillsArray",
    array(col("primskills"), col("secskills")),
).show()

+---+-----+----------+---------+----------------+
| id| name|primskills|secskills|     skillsArray|
+---+-----+----------+---------+----------------+
|  1| kunj|     spark|      sql|    [spark, sql]|
|  2|Raman|accounting|       ca|[accounting, ca]|
+---+-----+----------+---------+----------------+



In [18]:
# Input for the array_contains() example: "skills" is an array of strings.
# NOTE(review): 'ca,maths' and 'calculator,manager' are each a single string
# containing a comma, so Raman has three elements here. They may have been
# meant as separate skills; confirm before changing.
columns = ["id", "name", "skills"]
data = [
    (1, "kunj", ["machine learning", "spark", "sql", "de", "azure"]),
    (2, "Raman", ["accounting", "ca,maths", "calculator,manager"]),
]
df = spark.createDataFrame(data, schema=columns)

# Show the rows, then the inferred schema.
df.show()
df.printSchema()

+---+-----+--------------------+
| id| name|              skills|
+---+-----+--------------------+
|  1| kunj|[machine learning...|
|  2|Raman|[accounting, ca,m...|
+---+-----+--------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:
# array_contains() adds a boolean column telling, per row, whether the
# "skills" array has an element equal to "spark".
df1 = df.withColumn(
    "HasSparkSkill",
    array_contains(col("skills"), "spark"),
)
df1.show()

+---+-----+--------------------+-------------+
| id| name|              skills|HasSparkSkill|
+---+-----+--------------------+-------------+
|  1| kunj|[machine learning...|         true|
|  2|Raman|[accounting, ca,m...|        false|
+---+-----+--------------------+-------------+

