In [None]:
# import modules
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
# Map: apply a function to every element of the dataset.
# getOrCreate() reuses a SparkContext that is already running in this kernel
# instead of raising "Cannot run multiple SparkContexts at once"; the conf is
# only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("MapExample")
)
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)
# Double each element; map() is lazy, collect() triggers the computation.
mapped_rdd = rdd.map(lambda x: x * 2)
mapped_rdd.collect()

In [None]:
# Filter: keep only the elements that satisfy a condition.
# getOrCreate() reuses a SparkContext that is already running in this kernel
# instead of raising "Cannot run multiple SparkContexts at once"; the conf is
# only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("FilterExample")
)
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)
# Keep the even numbers. The result gets its own name so it does not
# overwrite (and mislabel) the RDD produced by the map example.
filtered_rdd = rdd.filter(lambda x: x % 2 == 0)
filtered_rdd.collect()

In [None]:
# Union: combine two datasets with the same schema.
# getOrCreate() reuses a SparkContext that is already running in this kernel
# instead of raising "Cannot run multiple SparkContexts at once"; the conf is
# only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("UnionExample")
)
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([4, 5, 6])
# union() concatenates the two RDDs; it does not remove duplicates.
union_rdd = rdd1.union(rdd2)
union_rdd.collect()

In [None]:
# GroupBy: group records that share a key.
# getOrCreate() reuses a SparkContext that is already running in this kernel
# instead of raising "Cannot run multiple SparkContexts at once"; the conf is
# only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("GroupByExample")
)
data = [("apple", 2), ("banana", 3), ("apple", 5), ("banana", 1)]
rdd = sc.parallelize(data)
# Group the full (fruit, count) records by the fruit name (first tuple field).
grouped_data = rdd.groupBy(lambda x: x[0])
# groupBy() yields (key, ResultIterable) pairs; turn each iterable into a list
# so the collected output shows the grouped records rather than object reprs.
grouped_data.mapValues(list).collect()

In [None]:
# Join: combine two datasets on a common key.
# getOrCreate() reuses a SparkContext that is already running in this kernel
# instead of raising "Cannot run multiple SparkContexts at once"; the conf is
# only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("JoinExample")
)
# parallelize() takes ONE collection (its second positional argument is the
# number of slices), so the (key, value) pairs must be wrapped in a list.
rdd1 = sc.parallelize([("apple", 2), ("banana", 3)])
rdd2 = sc.parallelize([("apple", 5), ("banana", 1)])
# Inner join on the key: each result is (key, (value_from_rdd1, value_from_rdd2)).
joined_rdd = rdd1.join(rdd2)
joined_rdd.collect()

In [None]:
# Sort: reorder the data by a given criterion.
# getOrCreate() reuses a SparkContext that is already running in this kernel
# instead of raising "Cannot run multiple SparkContexts at once"; the conf is
# only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("SortExample")
)
data = [4, 2, 1, 3, 5]
rdd = sc.parallelize(data)
# Sort by the element itself, smallest first.
sorted_rdd = rdd.sortBy(lambda x: x, ascending=True)
sorted_rdd.collect()

In [None]:
# PySpark DF: rule-based common transformations
# 1. Predicate pushdown: Pushing filtering conditions closer to the data source before processing to minimize data movement.
# 2. Constant folding: Evaluating constant expressions during query compilation to reduce computation during runtime.
# 3. Column pruning: Eliminating unnecessary columns from the query plan to enhance processing efficiency.
# 4. Join reordering: Rearranging join operations to minimize the intermediate data size and enhance the join performance.

# Create a spark session (reuses the active one if it already exists)
spark = SparkSession.builder.appName("RuleBasedTransformations").getOrCreate()

# Sample input data for df1 and df2
data1 = [
    ("Alice", 25, "F"),
    ("Bob", 30, "M"),
    ("Charlie", 22, "M"),
    ("Diana", 28, "F")]
data2 = [
    ("Alice", "New York"),
    ("Bob", "San Francisco"),
    ("Charlie", "Los Angeles"),
    ("Eve", "Chicago")]

# Create dfs
columns1 = ["name", "age", "gender"]
df1 = spark.createDataFrame(data1, columns1)
columns2 = ["name", "city"]
df2 = spark.createDataFrame(data2, columns2)

# Filtering: the kind of predicate the optimizer tries to apply as early as possible
filtered_df = df1.filter(col("age") > 25)

# Constant folding: the constant sub-expression (1 + 1) is written inside the
# SQL expression so that the optimizer itself sees it and can reduce it to 2
# at planning time. Writing `col("age") + 2` in Python would hand Spark an
# already-reduced literal, leaving nothing to fold.
folded_df = filtered_df.selectExpr("name", "age + (1 + 1) AS age_plus_2")

# Column pruning: only "name" is kept, so the computed column is not needed downstream
pruned_df = folded_df.select(col("name"))

# Join: with only two inputs there is a single possible join order; reordering
# becomes relevant once three or more tables are joined in one query.
reordered_join = df1.join(df2, on="name")

# Show the final results
print("Filtered df:")
filtered_df.show()

print("Folded df:")
folded_df.show()

print("Pruned df:")
pruned_df.show()

print("Reordered join df:")
reordered_join.show()

# Stop the spark session (this also stops the underlying SparkContext)
spark.stop()

In [None]:
# Cost-based optimization techniques in Spark
# 1. Adaptive query execution: Dynamically adjusts the query plan during execution based on runtime statistics to optimize performance.
# 2. Cost-based join reordering: Optimizes join order based on estimated costs of different join paths.
# 3. Broadcast hash join: Optimizes small-table joins by broadcasting one table to all nodes, reducing data shuffling.
# 4. Shuffle partitioning and memory management: Efficiently manages data shuffling during operations like groupBy and aggregation and optimizes memory usage.

# Create a spark session (reuses the active one if it already exists)
spark = SparkSession.builder.appName("CostBasedOptimization").getOrCreate()

# Sample input data for df1 and df2
data1 = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
    ("Diana", 28)]
data2 = [
    ("Alice", "New York"),
    ("Bob", "San Francisco"),
    ("Charlie", "Los Angeles"),
    ("Eve", "Chicago")]

# Create dfs from the raw rows defined above. Passing the old df1/df2 objects
# here would reuse DataFrames from an earlier cell (bound to a stopped session
# and carrying a different schema) instead of this cell's sample data.
columns1 = ["name", "age"]
df1 = spark.createDataFrame(data1, columns1)
columns2 = ["name", "city"]
df2 = spark.createDataFrame(data2, columns2)

# Enable adaptive query execution (AQE). It is a runtime SQL setting, so it
# takes effect for queries planned after this call.
spark.conf.set("spark.sql.adaptive.enabled", "true")

# Join executed with adaptive query execution (runtime adaptive optimization)
optimized_join = df1.join(df2, on="name")

# Show the optimized join result
print("Optimized join df:")
optimized_join.show()

# Stop the spark session (this also stops the underlying SparkContext)
spark.stop()