In [0]:
import pandas as pd

In [0]:
# Smoke test: build a one-column DataFrame holding the ids 0 through 9 and print it.
spark.range(0, 10).show()

In [0]:
# Smoke test for the SQL interface: evaluate a constant expression and print it.
arithmetic_result = spark.sql("SELECT 1+4")
arithmetic_result.show()

In [0]:
# Load the Titanic CSV: the first row is treated as the header and column
# types are inferred from the data. Then print the resulting schema.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/default/kiran_data/titanic.csv")
)
df.printSchema()

In [0]:
# Print the first 5 rows of the DataFrame.
df.show(n=5)

In [0]:
# Project a subset of the columns and print 5 rows of the result.
preview_columns = ["PassengerId", "Survived", "Name", "Ticket"]
df.select(*preview_columns).show(5)

In [0]:
# Total number of rows in the DataFrame (displayed as the cell output).
total_rows = df.count()
total_rows

# Filter Rows

In [0]:
# Count the rows whose Sex column equals "female".
female_rows = df.filter(df.Sex == "female")
female_rows.count()

In [0]:
from pyspark.sql.functions import col, lit, when

# Same predicate built with col() instead of df[...]; show() with no
# argument prints up to 20 rows.
sex_is_female = col("Sex") == "female"
df.filter(sex_is_female).show()

In [0]:
# Inequality filter: rows whose Sex is anything other than "female".
df.filter(df.Sex != "female").show(n=5)

In [0]:
# OR operator (|): keep rows where Pclass is 1 or 2.
in_first_class = df["Pclass"] == 1
in_second_class = df["Pclass"] == 2
df.filter(in_first_class | in_second_class).show(5)

In [0]:
# AND operator (&): female passengers whose Age is 20 or more.
female_passenger = df.Sex == "female"
aged_twenty_plus = df.Age >= 20
df.filter(female_passenger & aged_twenty_plus).show(5)

In [0]:
# Rows where Age is missing; show() with no argument prints up to 20 rows.
age_missing = df["Age"].isNull()
df.filter(age_missing).show()

In [0]:
# Membership test: rows whose Age is one of the listed values.
ages_of_interest = [40, 41, 42]
df.filter(df["Age"].isin(ages_of_interest)).show()

In [0]:
# SQL LIKE pattern: Name values that begin with 'A'.
name_starts_with_a = df["Name"].like("A%")
df.filter(name_starts_with_a).show(5)


In [0]:
# Regular expression match: Name values whose last character is 'y'.
name_ends_with_y = df["Name"].rlike("y$")
df.filter(name_ends_with_y).show(5)

In [0]:
# Regular expression match: Name values that start with 'S'.
name_starts_with_s = df["Name"].rlike("^S")
df.filter(name_starts_with_s).show(5)

In [0]:
# startswith(): Name values that begin with the literal prefix "C".
df.filter(df["Name"].startswith("C")).show(n=5)

In [0]:
# contains(): Name values that include the substring "ar" anywhere.
df.filter(df["Name"].contains("ar")).show(n=5)

In [0]:
# Range filter: rows whose Age lies between 20 and 40.
age_in_range = df["Age"].between(20, 40)
df.filter(age_in_range).show(5)


# Filter with conditional column

In [0]:
# withColumn: add a derived AdultFlag column based on Age.
#
# A comparison against a null Age evaluates to null, so a bare
# when(Age >= 18, "Adult").otherwise("Minor") would label every passenger
# with a missing Age as "Minor". Classify only known ages: with no
# otherwise() clause, rows that match neither condition (null Age) get a
# null AdultFlag instead of being misclassified.
df = df.withColumn(
    "AdultFlag",
    when(col("Age") >= 18, "Adult").when(col("Age") < 18, "Minor"),
)

# Filter adults
df.filter(df.AdultFlag == "Adult").show(5)


# Filter using literal values

In [0]:
# Rows whose Fare is strictly greater than 100; the threshold is wrapped
# in lit() so it is an explicit Column expression.
fare_threshold = lit(100)
df.filter(col("Fare") > fare_threshold).show(5)

✅ Key Points

Use lit() whenever you want to treat a constant as a column in DataFrame operations.

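Required wherever the API expects a Column rather than a plain value — for example the second argument of .withColumn(). Comparisons and .when() values also accept plain Python literals and wrap them automatically.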

Alternative to lit(): some simple operations allow Python literals directly, but using lit() is safer and explicit, especially for complex transformations.

In [0]:
from pyspark.sql.functions import lower
# Case-insensitive match: lower-case the Sex column before comparing it
# with "female".
sex_lowercased = lower(col("Sex"))
df.filter(sex_lowercased == "female").show(5)


In [0]:
# Keep only the rows whose PassengerId is one of the listed ids.
wanted_passenger_ids = [1, 5, 10]
df.filter(df["PassengerId"].isin(wanted_passenger_ids)).show(5)


# Filter top N using sort + limit

In [0]:
# Five highest fares: sort by Fare in descending order, keep the first
# 5 rows as a new DataFrame, then print it.
highest_fares = df.orderBy(col("Fare").desc()).limit(5)
highest_fares.show()

limit(5) Purpose: Creates a new DataFrame with only the first 5 rows.

Effect on DataFrame: Returns a subset DataFrame. Can be stored, transformed, or written to disk.

Use case: You want to actually restrict data for further processing.

In [0]:
# Register the DataFrame as a temporary SQL view. A temp view exists only
# for the current Spark session and disappears when the notebook/session
# is closed.
df.createOrReplaceTempView("titanic_table")

# Query the view with SQL: female passengers older than 30.
women_over_thirty = spark.sql("SELECT * FROM titanic_table WHERE Sex='female' AND Age>30")
women_over_thirty.show(5)

In [0]:
# Print 10 rows of the registered titanic_table view via SQL.
titanic_rows = spark.sql("select * from titanic_table")
titanic_rows.show(10)

💡 Pro Tip:

.show() is standard PySpark → always works anywhere.

display() is Databricks-only → useful for demos, dashboards, and exploratory analysis.

In [0]:
# Load the 2011 flight summary CSV with the schema inferred from the data
# and the first row used as the header.
flightData2011 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("/Volumes/workspace/default/kiran_data/2011-summary.csv")
)

In [0]:
# Collect the first 3 rows of the DataFrame (displayed as the cell output).
first_three_rows = flightData2011.take(3)
first_three_rows

In [0]:
# explain() can be called on any DataFrame object to see the DataFrame's
# lineage (or how Spark will execute this query).
sorted_by_count = flightData2011.sort("count")
sorted_by_count.explain()

In [0]:
# By default a shuffle outputs 200 shuffle partitions. Set the value to 5
# to reduce the number of output partitions from the shuffle.
spark.conf.set(
    "spark.sql.shuffle.partitions",
    "5",
)

In [0]:
# Sort by the count column and collect the first 2 rows (displayed as the
# cell output).
first_two_sorted = flightData2011.sort("count").take(2)
first_two_sorted

In [0]:
# Make the flight DataFrame queryable from Spark SQL as a temporary view.
flight_view_name = "flight_data_2011"
flightData2011.createOrReplaceTempView(flight_view_name)

In [0]:
# SQL formulation: number of rows per destination country, read from the
# flight_data_2011 temp view.
destination_count_query = """SELECT DEST_COUNTRY_NAME, count(1)
          from flight_data_2011
          group by DEST_COUNTRY_NAME
          """
sqlWay = spark.sql(destination_count_query)

In [0]:
# Print the physical plan of the SQL formulation.
sqlWay.explain(mode="simple")

In [0]:
# DataFrame-API formulation of the per-destination count, followed by its
# execution plan.
dataFrameWay = (
    flightData2011
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)
dataFrameWay.explain()

In [0]:
# Print the first 10 rows before querying the maximum flight count to and
# from any given location.
flightData2011.show(n=10)

In [0]:
# Largest value in the count column of the flight_data_2011 view; take(1)
# collects the single result row (displayed as the cell output).
max_count_query = "SELECT max(count) from flight_data_2011"
spark.sql(max_count_query).take(1)