# Transformation
- get a new df after execution of each transformation operation
- lazy evaluation based on DAG
- 2 types: narrow (no shuffling required) vs wide (requires shuffling)
- Eg: filter, union

# Action
- return the data to the driver for display, or write it out to a storage layer
- when we want to work with actual data
- Eg: count, collect, save

In [0]:
# Three (id, name) sample records used by the show/collect examples below.
data = [
    (1, "a"),
    (2, "b"),
    (3, "c"),
]
# Only column names are supplied; the tuples themselves provide the values.
df = spark.createDataFrame(data, schema=["id", "name"])

# show VS collect

In [0]:
# show() prints the DataFrame as a formatted table and returns nothing.
df.show()

# VS

# collect() hands every row back as a Python list of Row objects.
data = df.collect()
for row in data:
    print(row)

# A Row can be read by attribute or by key once picked out of the list.
first_row, second_row = data[0], data[1]
print(first_row.id, first_row.name)
print(second_row["id"], second_row["name"])

# One plain dictionary per Row, keyed by column name.
rows_as_dicts = [record.asDict() for record in data]
print("rows_as_dicts:", rows_as_dicts)
first_dict = rows_as_dicts[0]
print(first_dict['id'], first_dict['name'])

# A single dictionary mapping each id to its name.
data_one_dict = {record["id"]: record["name"] for record in data}
print("data_one_dict: ", data_one_dict)

# filter

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField 
from pyspark.sql.types import StringType, IntegerType, ArrayType

# Reuse the active SparkSession if there is one, otherwise start one
# under this application name.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: (name struct, languages, state, gender).
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# Explicit schema for the rows above: "name" is a nested struct of three
# strings, "languages" is an array of strings, the rest are plain strings.
schema = StructType([
    StructField(
        'name',
        StructType([
            StructField('firstname', StringType(), nullable=True),
            StructField('middlename', StringType(), nullable=True),
            StructField('lastname', StringType(), nullable=True),
        ]),
    ),
    StructField('languages', ArrayType(StringType()), nullable=True),
    StructField('state', StringType(), nullable=True),
    StructField('gender', StringType(), nullable=True),
])

# Build the DataFrame, then print its schema and its untruncated contents.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

In [0]:
# Keep only the rows whose state column equals "OH".
ohio_only = df.state == "OH"
df.filter(ohio_only).show(truncate=False)

In [0]:
# Two equivalent ways to keep the rows whose state is not "OH".

# 1) The != operator on the column.
outside_ohio = df.state != "OH"
df.filter(outside_ohio).show(truncate=False)

# 2) Negating the equality condition with ~.
in_ohio = df.state == "OH"
df.filter(~in_ohio).show(truncate=False)

In [0]:
from pyspark.sql import functions as F

# Same "OH" filter, but the column is looked up by name through F.col
# rather than as an attribute of df.
state_col = F.col("state")
df.filter(state_col == "OH").show(truncate=False)

In [0]:
# filter() also accepts the condition as a SQL expression string.
male_expr = "gender == 'M'"
df.filter(male_expr).show()

# "Not equal" can be spelled either != or <> in the SQL expression.
for not_male_expr in ("gender != 'M'", "gender <> 'M'"):
    df.filter(not_male_expr).show()

In [0]:
# Combine two column conditions with & (logical AND). Each comparison is
# built on its own because & binds more tightly than == in Python.
in_ohio = df.state == "OH"
is_male = df.gender == "M"
df.filter(in_ohio & is_male).show(truncate=False)

In [0]:
# Keep the rows whose state is any one of the listed values.
li = ["OH", "CA", "DE"]
state_in_list = df.state.isin(li)
df.filter(state_in_list).show()

In [0]:
# String predicates on the state column.
state_col = df.state

# startswith: the value begins with the given prefix.
df.filter(state_col.startswith("N")).show()

# endswith: the value ends with the given suffix.
df.filter(state_col.endswith("H")).show()

# contains: the given substring appears anywhere in the value.
df.filter(state_col.contains("H")).show()

In [0]:
# Names with differently-cased "rose" endings, to compare the matchers.
data2 = [
    (2, "Michael rOse"),
    (3, "Robert Williams"),
    (4, "Rames Rose"),
    (5, "Rames rose"),
]
df2 = spark.createDataFrame(data=data2, schema=["id", "name"])

# like - SQL LIKE pattern (% stands for any run of characters).
# The match is case sensitive, so only "Rames rose" contains "rose".
df2.filter(df2.name.like("%rose%")).show()

# ilike - SQL ILIKE expression (case-insensitive LIKE).
# Returns a boolean Column; here it keeps every name ending in "rose",
# whatever the letter case.
# NOTE(review): Column.ilike is documented as new in Spark 3.3 - confirm
# the cluster version before relying on it.
df2.filter(df2.name.ilike('%Rose')).show()

# rlike - SQL RLIKE pattern (LIKE with a regular expression).
# rlike is case sensitive by default; the leading (?i) flag is what makes
# this match case insensitive.
# The pattern uses ".*" after "^" so the quantifier applies to "any
# character" rather than to the anchor itself: start of string, anything,
# then "rose" at the end.
df2.filter(df2.name.rlike("(?i)^.*rose$")).show()

# (?i) -> Indicates case insensitivity for the regex matching.
# ^ -> matching must occur at the beginning of the string.