# RDD vs. DataFrame

In [1]:
# Prerequisites
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *

In [2]:
# Start (or reuse) a SparkSession that uses the "local" master, and keep a
# handle on its SparkContext, which the RDD cell below uses via `sc`.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext
print("Spark Version: ", spark.version)

Spark Version:  3.5.0


### RDD

In [4]:
# Distribute a small in-memory list of (name, age) tuples as an RDD.
# NOTE(review): the ages for "John" and "Ted" here (30, 35) differ from the
# rows used in the DataFrame cell (19, 55) -- confirm whether that is intended.
data_RDD = sc.parallelize(
    [
        ("Brooke", 20),
        ("Anna", 31),
        ("John", 30),
        ("Ted", 35),
        ("Brooke", 25),
        ("Anna", 33),
    ]
)

In [6]:
# Average age per name using map and reduceByKey transformations with lambdas:
#   1. map         -> (name, (age, 1)) so every record carries a count of one
#   2. reduceByKey -> (name, (sum_of_ages, record_count)) for each name
#   3. map         -> (name, sum_of_ages / record_count)
# No action (e.g. collect) is called on ages_RDD in this cell.
ages_RDD = (
    data_RDD
    .map(lambda rec: (rec[0], (rec[1], 1)))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .map(lambda pair: (pair[0], pair[1][0] / pair[1][1]))
)

### DataFrame API

In [7]:
# Build a DataFrame with columns "name" and "age" from in-memory rows.
data_df = spark.createDataFrame(
    [
        ("Brooke", 20),
        ("Anna", 31),
        ("John", 19),
        ("Ted", 55),
        ("Brooke", 25),
        ("Anna", 33),
    ],
    ["name", "age"],
)

# Group rows that share a name, aggregate each group, and compute the
# average age; then print the resulting table.
ave_df = (
    data_df
    .groupBy("name")
    .agg(avg("age"))
)
ave_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
|  Anna|    32.0|
|  John|    19.0|
|   Ted|    55.0|
+------+--------+

