## Creating RDDs from scratch

In [1]:
from pyspark import SparkConf, SparkContext

# Reuse the SparkContext already running in this kernel, or start a new one.
# Constructing SparkContext(...) directly fails with "Cannot run multiple
# SparkContexts at once" when this cell is re-run, so getOrCreate() keeps the
# cell idempotent. The app name only takes effect when a new context is created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("ExampleRDD"))

# Create an RDD from a list of elements
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Transformations (lazy): square every element, then keep squares above 10
squared_rdd = rdd.map(lambda x: x**2)
filtered_rdd = squared_rdd.filter(lambda x: x > 10)

# Action: add up the remaining elements
sum_value = filtered_rdd.sum()

# Print the RDD elements and the sum
print("RDD Elements:")
for element in filtered_rdd.collect():
    print(element)

print("Sum:", sum_value)


RDD Elements:
16
25
Sum: 41


## Creating a DataFrame from scratch

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the SparkSession for this example
spark = (
    SparkSession.builder
    .appName("ExampleDataFrame")
    .getOrCreate()
)

# Schema with three non-nullable columns, declared one field at a time
schema = (
    StructType()
    .add("name", StringType(), nullable=False)
    .add("age", IntegerType(), nullable=False)
    .add("city", StringType(), nullable=False)
)

# One dict per row; the keys match the schema's field names
data = [
    dict(name="Alice", age=25, city="New York"),
    dict(name="Bob", age=30, city="San Francisco"),
    dict(name="Charlie", age=35, city="Los Angeles"),
]

# Build the DataFrame from the rows using the explicit schema
df = spark.createDataFrame(data, schema=schema)

# Display the DataFrame as a text table
df.show()


+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|  Alice| 25|     New York|
|    Bob| 30|San Francisco|
|Charlie| 35|  Los Angeles|
+-------+---+-------------+



## Creating a "Dataset" from scratch (in PySpark, an aliased DataFrame)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrameToDataset").getOrCreate()

# Define a class representing the structure of your data
class Person:
    """Plain Python record holding a ``name`` and an ``age`` attribute.

    NOTE(review): nothing in this cell instantiates or references this class;
    the DataFrame below gets its column names from ``columns``, not from here.
    """

    def __init__(self, name, age):
        self.name = name
        self.age = age

# Create a DataFrame
data = [("Alice", 25),
        ("Bob", 30),
        ("Charlie", 35)]

columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

# Select both columns and attach the alias "Person" to the result.
# NOTE(review): alias() only gives the DataFrame a name (useful for qualifying
# columns, e.g. in self-joins); it does not turn it into a typed Dataset of
# Person objects. The typed Dataset API is Scala/Java only, so `dataset` here
# is still a DataFrame.
dataset = df.select(col("name"), col("age")).alias("Person")

# Show the aliased DataFrame
dataset.show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



# Spark with SQL

In [4]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this example
spark = (
    SparkSession.builder
    .appName("ExampleDataFrameSQL")
    .getOrCreate()
)

# Rows as (name, age, city) tuples, with the matching column names
data = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "San Francisco"),
    ("Charlie", 35, "Los Angeles"),
]
columns = ["name", "age", "city"]
df = spark.createDataFrame(data, schema=columns)

# Expose the DataFrame to SQL as the temporary view "people"
df.createOrReplaceTempView("people")

# Keep the name and age of everyone aged 30 or older
sql_query = """
    SELECT name, age
    FROM people
    WHERE age >= 30
"""
result = spark.sql(sql_query)

# Display the rows returned by the query
result.show()


+-------+---+
|   name|age|
+-------+---+
|    Bob| 30|
|Charlie| 35|
+-------+---+



Alternatively, the same query can be expressed with the DataFrame API instead of SQL:

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the SparkSession for this example
spark = (
    SparkSession.builder
    .appName("ExampleDataFrameNoSQL")
    .getOrCreate()
)

# Rows as (name, age, city) tuples, with the matching column names
data = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "San Francisco"),
    ("Charlie", 35, "Los Angeles"),
]
columns = ["name", "age", "city"]
df = spark.createDataFrame(data, schema=columns)

# Project name and age, then keep only the rows with age of 30 or more
result = (
    df
    .select("name", "age")
    .filter(col("age") >= 30)
)

# Display the filtered rows
result.show()


+-------+---+
|   name|age|
+-------+---+
|    Bob| 30|
|Charlie| 35|
+-------+---+

