In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
data=[("James","Bond","100",None),
      ("Ann","Varsa","200",'F'),
      ("Tom Cruise","XXX","400",''),
      ("Tom Brand",None,"400",'M')] 
columns=["fname","lname","id","gender"]
df=spark.createDataFrame(data,columns)
df.show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
+----------+-----+---+------+



## alias usage

In [0]:
df1 = df.select(
        df['fname'].alias('FirstName'),
        df.lname.alias('LastName'),
        col('id').alias('Identity'),
        'gender')
df1.show()

+----------+--------+--------+------+
| FirstName|LastName|Identity|gender|
+----------+--------+--------+------+
|     James|    Bond|     100|  null|
|       Ann|   Varsa|     200|     F|
|Tom Cruise|     XXX|     400|      |
| Tom Brand|    null|     400|     M|
+----------+--------+--------+------+



## column concatination

In [0]:
df1.selectExpr("*", "concat(FirstName, ' ', LastName) as FullName").show()

+----------+--------+--------+------+--------------+
| FirstName|LastName|Identity|gender|      FullName|
+----------+--------+--------+------+--------------+
|     James|    Bond|     100|  null|    James Bond|
|       Ann|   Varsa|     200|     F|     Ann Varsa|
|Tom Cruise|     XXX|     400|      |Tom Cruise XXX|
| Tom Brand|    null|     400|     M|          null|
+----------+--------+--------+------+--------------+



In [0]:
df2 = df1.select(
            expr("FirstName || ' ' || LastName").alias('FullName'),\
            df1['Identity'].alias('ID'),
            'Gender').sort(df1.LastName.desc(),
                           df1.Identity.asc())
df2.show()
df2.printSchema()

+--------------+---+------+
|      FullName| ID|Gender|
+--------------+---+------+
|Tom Cruise XXX|400|      |
|     Ann Varsa|200|     F|
|    James Bond|100|  null|
|          null|400|     M|
+--------------+---+------+

root
 |-- FullName: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- Gender: string (nullable = true)



***Tom Brand appears an NULL i.e. fname + null = null***

## cast() & astype()

In [0]:
df2.select('FullName', df2.ID.cast('int')).printSchema()
# df2.select('FullName', df2.ID.cast(IntegerType())).printSchema()

root
 |-- FullName: string (nullable = true)
 |-- ID: integer (nullable = true)



## between()

In [0]:
df2.filter(df2.ID.between(100,300)).show()

+----------+---+------+
|  FullName| ID|Gender|
+----------+---+------+
| Ann Varsa|200|     F|
|James Bond|100|  null|
+----------+---+------+



## contains()

In [0]:
df2.filter(df2.FullName.contains('An')).show()

+---------+---+------+
| FullName| ID|Gender|
+---------+---+------+
|Ann Varsa|200|     F|
+---------+---+------+



## startswith() & endswith()

In [0]:
df2.filter(df2['FullName'].startswith('T')).show()
df2.filter(df2['FullName'].endswith('ond')).show()

+--------------+---+------+
|      FullName| ID|Gender|
+--------------+---+------+
|Tom Cruise XXX|400|      |
+--------------+---+------+

+----------+---+------+
|  FullName| ID|Gender|
+----------+---+------+
|James Bond|100|  null|
+----------+---+------+



## isNull() & isNotNull()

In [0]:
df2.filter(df2.Gender.isNull()).show()
df2.filter(df2.Gender.isNotNull()).show()

+----------+---+------+
|  FullName| ID|Gender|
+----------+---+------+
|James Bond|100|  null|
+----------+---+------+

+--------------+---+------+
|      FullName| ID|Gender|
+--------------+---+------+
|Tom Cruise XXX|400|      |
|     Ann Varsa|200|     F|
|          null|400|     M|
+--------------+---+------+



## like()

In [0]:
df2.filter(col('FullName').like('__m%')).show()   ## matching 3rd letter from starting
df2.filter(col('FullName').like('%B_nd')).show()  ## matching last letters

+--------------+---+------+
|      FullName| ID|Gender|
+--------------+---+------+
|Tom Cruise XXX|400|      |
|    James Bond|100|  null|
+--------------+---+------+

+----------+---+------+
|  FullName| ID|Gender|
+----------+---+------+
|James Bond|100|  null|
+----------+---+------+



## substr()

In [0]:
df2.select(col('FullName').substr(3,6).alias('Substrings')).show()

+----------+
|Substrings|
+----------+
|    m Crui|
|    n Vars|
|    mes Bo|
|      null|
+----------+



## when() & otherwise()
--> *similar to SQL case when*

In [0]:
df1.select('Identity',
           'FirstName',
                when(col('Gender') == 'M', "MALE").\
                when(col('Gender') == 'F',"FEMALE").\
                when(col('Gender') == None, '').\
                otherwise(df1.gender).alias('NewGender')).show()

+--------+----------+---------+
|Identity| FirstName|NewGender|
+--------+----------+---------+
|     100|     James|     null|
|     200|       Ann|   FEMALE|
|     400|Tom Cruise|         |
|     400| Tom Brand|     MALE|
+--------+----------+---------+



## isin()

In [0]:
lst = [100,200]
df1.select('Identity','FirstName','LastName','Gender').\
    filter(df1['Identity'].isin(lst)).show()

+--------+---------+--------+------+
|Identity|FirstName|LastName|Gender|
+--------+---------+--------+------+
|     100|    James|    Bond|  null|
|     200|      Ann|   Varsa|     F|
+--------+---------+--------+------+



## getField()

In [0]:
from pyspark.sql.types import StructType,StructField,StringType,ArrayType,MapType
data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
      (("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
      (("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
      (("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]

schema = StructType([
        StructField('name', StructType([
            StructField('fname', StringType(), True),
            StructField('lname', StringType(), True)])),
        StructField('languages', ArrayType(StringType()),True),
        StructField('properties', MapType(StringType(),StringType()),True)    ## MapType
     ])
df=spark.createDataFrame(data,schema)
df.show(truncate=False)
df.printSchema()

+-----------------+---------------+-----------------------------+
|name             |languages      |properties                   |
+-----------------+---------------+-----------------------------+
|{James, Bond}    |[Java, C#]     |{eye -> brown, hair -> black}|
|{Ann, Varsa}     |[.NET, Python] |{eye -> black, hair -> brown}|
|{Tom Cruise, }   |[Python, Scala]|{eye -> grey, hair -> red}   |
|{Tom Brand, null}|[Perl, Ruby]   |{eye -> blue, hair -> black} |
+-----------------+---------------+-----------------------------+

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [0]:
df.select(df.properties.getField('hair')).show()                  ## from MapType
df.select(df.name.getField('fname').alias('firstName')).show()    ## from Struct

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+

+----------+
| firstName|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+



## getItem()

In [0]:
df.select(df.properties.getItem('hair')).show()
df.select(df.languages.getItem(1)).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+

+------------+
|languages[1]|
+------------+
|          C#|
|      Python|
|       Scala|
|        Ruby|
+------------+



## create_map()

In [0]:
df1.withColumn('Mapping', create_map('Identity','Gender')).show(truncate=False)

+----------+--------+--------+------+-------------+
|FirstName |LastName|Identity|gender|Mapping      |
+----------+--------+--------+------+-------------+
|James     |Bond    |100     |null  |{100 -> null}|
|Ann       |Varsa   |200     |F     |{200 -> F}   |
|Tom Cruise|XXX     |400     |      |{400 -> }    |
|Tom Brand |null    |400     |M     |{400 -> M}   |
+----------+--------+--------+------+-------------+



In [0]:
dfMaps = df1.select('FirstName',create_map('Identity','Gender').alias('Mapps'))
dfMaps.show()
dfMaps.printSchema()                    # MapType(StringType(),StringType())                                               

+----------+-------------+
| FirstName|        Mapps|
+----------+-------------+
|     James|{100 -> null}|
|       Ann|   {200 -> F}|
|Tom Cruise|    {400 -> }|
| Tom Brand|   {400 -> M}|
+----------+-------------+

root
 |-- FirstName: string (nullable = true)
 |-- Mapps: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



## Reversing create_map()

In [0]:
dfMaps.select('FirstName',explode('Mapps').alias('keys','vals')).show(truncate=False)

+----------+----+----+
|FirstName |keys|vals|
+----------+----+----+
|James     |100 |null|
|Ann       |200 |F   |
|Tom Cruise|400 |    |
|Tom Brand |400 |M   |
+----------+----+----+

