In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('class').getOrCreate()

In [6]:
from pyspark.sql import Row
data = [Row(name='James', prop=Row(hair='black', eye = 'blue')),
       Row(name='Ann', prop=Row(hair='grey', eye='black'))]
df=spark.createDataFrame(data)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)

+-----+-------------+
| name|         prop|
+-----+-------------+
|James|{black, blue}|
|  Ann|{grey, black}|
+-----+-------------+



In [9]:
from pyspark.sql.functions import *
df.select(df.prop).show()
df.select(df['prop']).show()
df.select(col('prop.hair')).show()

df.select(col('prop.*')).show()

+-------------+
|         prop|
+-------------+
|{black, blue}|
|{grey, black}|
+-------------+

+-------------+
|         prop|
+-------------+
|{black, blue}|
|{grey, black}|
+-------------+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|black| blue|
| grey|black|
+-----+-----+



In [10]:
data=[("James","Bond","100",None),
      ("Ann","Varsa","200",'F'),
      ("Tom Cruise","XXX","400",''),
      ("Tom Brand",None,"400",'M')] 
columns=["fname","lname","id","gender"]
df=spark.createDataFrame(data,columns)
df.show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+



In [11]:
df.select(df.fname.alias('first_name'),  df.lname.alias('last_name')).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     James|     Bond|
|       Ann|    Varsa|
|Tom Cruise|      XXX|
| Tom Brand|     NULL|
+----------+---------+



In [12]:
df.select(expr(" fname ||','|| lname").alias('fullName')).show()

+--------------+
|      fullName|
+--------------+
|    James,Bond|
|     Ann,Varsa|
|Tom Cruise,XXX|
|          NULL|
+--------------+



In [13]:
df.sort(df.fname.asc()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  NULL|
| Tom Brand| NULL|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [14]:
df.sort(df.fname.desc()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
+----------+-----+---+------+



In [16]:
#cast (to convert dtype)
df.printSchema()
df.select(df.fname, df.id.cast('int')).printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



In [17]:
# between 
df.filter(df.id.between(100, 300)).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  NULL|
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



In [18]:
# contains
df.filter(df.fname.contains('Cruise')).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [19]:
# Startswith, endswith()
df.filter(df.fname.startswith('T')).show()
df.filter(df.fname.endswith('Cruise')).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [20]:
# isNull & isNotNull

df.filter(df.lname.isNull()).show()
df.filter(df.lname.isNotNull()).show()

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| NULL|400|     M|
+---------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [24]:
df.show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+



In [26]:
# like, rlike

df.select(df.fname, df.lname, df.id).filter(df.fname.like("%om")).show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
+-----+-----+---+



In [23]:
df.select(df.fname.substr(1,2).alias('substr')).show()

+------+
|substr|
+------+
|    Ja|
|    An|
|    To|
|    To|
+------+



In [27]:
df.select(df.fname, df.lname, when(df.gender == "M","Male")\
         .when(df.gender=='F', "Female")\
         .when(df.gender==None, "")\
         .otherwise(df.gender).alias("new_gender")\
         ).show()

+----------+-----+----------+
|     fname|lname|new_gender|
+----------+-----+----------+
|     James| Bond|      NULL|
|       Ann|Varsa|    Female|
|Tom Cruise|  XXX|          |
| Tom Brand| NULL|      Male|
+----------+-----+----------+



In [29]:
# isin

li =['100', '200']
df.select(df.fname, df.lname, df.id).filter(df.id.isin(li)).show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
|James| Bond|100|
|  Ann|Varsa|200|
+-----+-----+---+



In [33]:
from pyspark.sql.types import StructType,StructField,StringType,ArrayType,MapType
data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
      (("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
      (("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
      (("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]

schema = StructType([
          StructField('name', StructType([StructField('fname', StringType(), True),StructField('lname', StringType(), True)])),
            StructField('languages', ArrayType(StringType()),True),
            StructField('properties', MapType(StringType(),StringType()),True)
     ])
df=spark.createDataFrame(data,schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+-----------------+---------------+-----------------------------+
|name             |languages      |properties                   |
+-----------------+---------------+-----------------------------+
|{James, Bond}    |[Java, C#]     |{eye -> brown, hair -> black}|
|{Ann, Varsa}     |[.NET, Python] |{eye -> black, hair -> brown}|
|{Tom Cruise, }   |[Python, Scala]|{eye -> grey, hair -> red}   |
|{Tom Brand, NULL}|[Perl, Ruby]   |{eye -> blue, hair -> black} |
+-----------------+---------------+-----------------------------+



In [31]:
# getField from MapType 

df.select(df.properties.getField('hair')).show()

# getField from struct
df.select(df.name.getField('fname')).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+



In [34]:
# getItem() used with ArrayType

df.select(df.languages.getItem(1)).show()

# getItem() used with MapType
df.select(df.properties.getItem('hair')).show()

+------------+
|languages[1]|
+------------+
|          C#|
|      Python|
|       Scala|
|        Ruby|
+------------+

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+



In [35]:
spark.stop()