In [11]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("SparkByExamples.com") \
    .getOrCreate()

filePath="small_zipcode.csv"
df = spark.read.options(header='true', inferSchema='true').csv(filePath)

df.printSchema()
df.toPandas()

root
 |-- id: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- population: integer (nullable = true)



Unnamed: 0,id,zipcode,type,city,state,population
0,1,704,STANDARD,,PR,30100.0
1,2,704,,PASEO COSTA DEL SUR,PR,
2,3,709,,BDA SAN LUIS,PR,3700.0
3,4,76166,UNIQUE,CINGULAR WIRELESS,TX,84000.0
4,5,76177,STANDARD,,TX,


In [14]:
df.na.drop().toPandas()

Unnamed: 0,id,zipcode,type,city,state,population
0,4,76166,UNIQUE,CINGULAR WIRELESS,TX,84000


In [22]:
#df.na.fill('value').toPandas()
df.fillna('value').toPandas()

Unnamed: 0,id,zipcode,type,city,state,population
0,1,704,STANDARD,value,PR,30100.0
1,2,704,value,PASEO COSTA DEL SUR,PR,
2,3,709,value,BDA SAN LUIS,PR,3700.0
3,4,76166,UNIQUE,CINGULAR WIRELESS,TX,84000.0
4,5,76177,STANDARD,value,TX,


In [20]:
df.na.replace(to_replace='value',value='NewValue').toPandas()

Unnamed: 0,id,zipcode,type,city,state,population
0,1,704,STANDARD,,PR,30100.0
1,2,704,,PASEO COSTA DEL SUR,PR,
2,3,709,,BDA SAN LUIS,PR,3700.0
3,4,76166,UNIQUE,CINGULAR WIRELESS,TX,84000.0
4,5,76177,STANDARD,,TX,


In [27]:
data = [
 ("James,,Smith",["Java","Scala","C++"],["Spark","Java"],"OH","CA"),
 ("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"],"NY","NJ"),
 ("Robert,,Williams",["CSharp","VB"],["Spark","Python"],"UT","NV")
]

from pyspark.sql.types import StringType, ArrayType,StructType,StructField
schema = StructType([ 
    StructField("name",StringType(),True), 
    StructField("languagesAtSchool",ArrayType(StringType()),True), 
    StructField("languagesAtWork",ArrayType(StringType()),True), 
    StructField("currentState", StringType(), True), 
    StructField("previousState", StringType(), True)
  ])

df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.toPandas()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)



Unnamed: 0,name,languagesAtSchool,languagesAtWork,currentState,previousState
0,"James,,Smith","[Java, Scala, C++]","[Spark, Java]",OH,CA
1,"Michael,Rose,","[Spark, Java, C++]","[Spark, Java]",NY,NJ
2,"Robert,,Williams","[CSharp, VB]","[Spark, Python]",UT,NV


In [28]:
from pyspark.sql import functions as F


In [37]:
df.select(F.split(F.col('name'), "," ).alias('Name'), F.explode('languagesAtSchool')).toPandas()

Unnamed: 0,Name,col
0,"[James, , Smith]",Java
1,"[James, , Smith]",Scala
2,"[James, , Smith]",C++
3,"[Michael, Rose, ]",Spark
4,"[Michael, Rose, ]",Java
5,"[Michael, Rose, ]",C++
6,"[Robert, , Williams]",CSharp
7,"[Robert, , Williams]",VB


In [48]:
df.select( 'name', F.array(F.col('currentState'),F.col('previousState'))).toPandas()

Unnamed: 0,name,"array(currentState, previousState)"
0,"James,,Smith","[OH, CA]"
1,"Michael,Rose,","[NY, NJ]"
2,"Robert,,Williams","[UT, NV]"


In [49]:
df.select(df.name,F.array_contains(df.languagesAtSchool,"Java")
    .alias("array_contains")).show()

+----------------+--------------+
|            name|array_contains|
+----------------+--------------+
|    James,,Smith|          true|
|   Michael,Rose,|          true|
|Robert,,Williams|         false|
+----------------+--------------+

