In [1]:
#we use the findspark library to locate spark on our local machine
import findspark
findspark.init(r'C:\spark\spark-3.5.0-bin-hadoop3')
import pyspark # only run this after findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

data=[("James","Bond","100",None),
      ("Ann","Varsa","200",'F'),
      ("Tom Cruise","XXX","400",''),
      ("Tom Brand",None,"400",'M')] 
columns=["fname","lname","id","gender"]
df=spark.createDataFrame(data,columns)

df.select(...): This code selects specific columns from the DataFrame and applies transformations to them.

df.fname.alias("first_name"): This renames the "fname" column to "first_name" using the alias method. The alias method is used to provide a new name for the selected column.

df.lname.alias("last_name"): This renames the "lname" column to "last_name" using the alias method, similar to the previous line.

expr(" fname ||','|| lname").alias("fullName"): This part of the code creates a new column called "fullName" by using the expr function. Inside the expr, it concatenates the values from the "fname" and "lname" columns along with a comma separator (,).

.show(): Finally, the show method is called on the resulting DataFrame to display its contents in the console.

So, the overall purpose of this code is to transform the original DataFrame df by renaming the "fname" and "lname" columns to "first_name" and "last_name," respectively, and creating a new column "fullName" that contains the concatenation of "fname" and "lname" values separated by a comma. The result is displayed in tabular format using the show method.

In [2]:
#alias
from pyspark.sql.functions import expr
df.select(df.fname.alias("first_name"), \
          df.lname.alias("last_name"), \
          expr(" fname ||','|| lname").alias("fullName") \
   ).show()

+----------+---------+--------------+
|first_name|last_name|      fullName|
+----------+---------+--------------+
|     James|     Bond|    James,Bond|
|       Ann|    Varsa|     Ann,Varsa|
|Tom Cruise|      XXX|Tom Cruise,XXX|
| Tom Brand|     NULL|          NULL|
+----------+---------+--------------+



In [3]:
#asc, desc
df.sort(df.fname.asc()).show()
df.sort(df.fname.desc()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  NULL|
| Tom Brand| NULL|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
+----------+-----+---+------+



In [4]:
#cast
df.select(df.fname,df.id.cast("int")).printSchema()

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



In [5]:
#between
df.filter(df.id.between(100,300)).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  NULL|
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



In [6]:
#contains
df.filter(df.fname.contains("Cruise")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [7]:
#startswith, endswith()
df.filter(df.fname.startswith("T")).show()
df.filter(df.fname.endswith("Cruise")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [8]:
#isNull & isNotNull
df.filter(df.lname.isNull()).show()
df.filter(df.lname.isNotNull()).show()

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| NULL|400|     M|
+---------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [15]:
#like , rlike
df.select(df.fname,df.lname,df.id) \
  .filter(df.fname.like("%Tom%")).show()

+----------+-----+---+
|     fname|lname| id|
+----------+-----+---+
|Tom Cruise|  XXX|400|
| Tom Brand| NULL|400|
+----------+-----+---+



df.fname.substr(1, 2): This part of the code uses the substr function to extract a substring from the "fname" column. The substr function takes two arguments: the starting position (1 in this case) and the length of the substring (2 in this case). This means it will start from the first character and take the next two characters from the "fname" column.

.alias("substr"): After extracting the substring, the alias method is used to rename the resulting column to "substr." This provides a new name for the selected column.

.show(): Finally, the show method is called on the resulting DataFrame to display its contents in the console.

So, the overall purpose of this code is to create a new DataFrame that contains a column called "substr," which contains substrings of length 2 extracted from the "fname" column of the original DataFrame df. The result is displayed in tabular format using the show method.

In [16]:
#substr
df.select(df.fname.substr(1,2).alias("substr")).show()

+------+
|substr|
+------+
|    Ja|
|    An|
|    To|
|    To|
+------+



In [17]:
#when & otherwise
from pyspark.sql.functions import when
df.select(df.fname,df.lname,when(df.gender=="M","Male") \
              .when(df.gender=="F","Female") \
              .when(df.gender==None ,"") \
              .otherwise(df.gender).alias("new_gender") \
    ).show()

+----------+-----+----------+
|     fname|lname|new_gender|
+----------+-----+----------+
|     James| Bond|      NULL|
|       Ann|Varsa|    Female|
|Tom Cruise|  XXX|          |
| Tom Brand| NULL|      Male|
+----------+-----+----------+



In [18]:
#isin
li=["100","200"]
df.select(df.fname,df.lname,df.id) \
  .filter(df.id.isin(li)) \
  .show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
|James| Bond|100|
|  Ann|Varsa|200|
+-----+-----+---+



The provided PySpark code defines a custom schema for a DataFrame using the StructType and related data types (StructField, StringType, ArrayType, and MapType). Let's break down what this schema represents:

StructType([...]): This is the top-level schema definition, and it represents the structure of a DataFrame with multiple columns.

StructField('name', StructType([...]), True): This defines a column named "name" with a nested structure. It's a StructType itself with two nested fields: "fname" and "lname," both of type StringType(). The True argument indicates that this column allows null values.

StructField('languages', ArrayType(StringType()), True): This defines a column named "languages" as an array of strings. The ArrayType(StringType()) represents an array where each element is a string. The True argument indicates that this column allows null values.

StructField('properties', MapType(StringType(), StringType()), True): This defines a column named "properties" as a map of strings to strings. The MapType(StringType(), StringType()) represents a map where both keys and values are strings. The True argument indicates that this column allows null values.

In summary, this schema defines a DataFrame with three columns:

"name": A nested structure with two fields, "fname" and "lname," both of type string.
"languages": An array of strings.
"properties": A map where both keys and values are strings.
This schema can be used as a blueprint when creating a PySpark DataFrame to ensure that the DataFrame has the specified structure and data types for its columns.

In [21]:
from pyspark.sql.types import StructType,StructField,StringType,ArrayType,MapType
data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
      (("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
      (("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
      (("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]

schema = StructType([
        StructField('name', StructType([
            StructField('fname', StringType(), True),
            StructField('lname', StringType(), True)])),
        StructField('languages', ArrayType(StringType()),True),
        StructField('properties', MapType(StringType(),StringType()),True)
     ])
df=spark.createDataFrame(data,schema)
df.printSchema()
df.show()

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+-----------------+---------------+--------------------+
|             name|      languages|          properties|
+-----------------+---------------+--------------------+
|    {James, Bond}|     [Java, C#]|{eye -> brown, ha...|
|     {Ann, Varsa}| [.NET, Python]|{eye -> black, ha...|
|   {Tom Cruise, }|[Python, Scala]|{eye -> grey, hai...|
|{Tom Brand, NULL}|   [Perl, Ruby]|{eye -> blue, hai...|
+-----------------+---------------+--------------------+



In [20]:
#getItem()
df.select(df.languages.getItem(1)).show()

df.select(df.properties.getItem("hair")).show()

+------------+
|languages[1]|
+------------+
|          C#|
|      Python|
|       Scala|
|        Ruby|
+------------+

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+



In [22]:
#getField from Struct or Map
df.select(df.properties.getField("hair")).show()

df.select(df.name.getField("fname")).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+

