# Spark Introduction

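This notebook introduces Apache Spark fundamentals: creating a SparkSession, building and transforming DataFrames, aggregations and window functions, the lower-level RDD API, plotting aggregated results with pandas/matplotlib, and writing data to and reading it back from HDFS.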


In [None]:
# Import libraries
import findspark
findspark.init()  # keep this ahead of the pyspark imports below

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# NOTE: the wildcard import below rebinds Python builtins such as `sum`,
# `min`, `max` and `round` to Spark column functions; prefer `F.<name>`.
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import matplotlib.pyplot as plt

print("Libraries imported successfully!")

In [None]:
# Create Spark Session
# The builder chain is wrapped in parentheses so no line continuations are
# needed. getOrCreate() returns the already-active session when there is one.
spark = (
    SparkSession.builder
    .appName("Spark-Introduction")
    # Adaptive query execution and its partition coalescing, both enabled.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .enableHiveSupport()
    .getOrCreate()
)

# Report which Spark version is running and where its web UI is served.
print(f"Spark Version: {spark.version}")
print(f"Spark Context: {spark.sparkContext}")
print(f"Web UI: {spark.sparkContext.uiWebUrl}")

In [None]:
# Create sample data
# Each record is (id, date, category, product, price, quantity).
sales_data = [
    (1, "2023-01-01", "Electronics", "Laptop", 1200.00, 2),
    (2, "2023-01-02", "Electronics", "Phone", 800.00, 1),
    (3, "2023-01-03", "Clothing", "Shirt", 50.00, 3),
    (4, "2023-01-04", "Electronics", "Tablet", 400.00, 1),
    (5, "2023-01-05", "Books", "Python Guide", 30.00, 2),
    (6, "2023-01-06", "Clothing", "Jeans", 80.00, 1),
    (7, "2023-01-07", "Electronics", "Mouse", 25.00, 4),
    (8, "2023-01-08", "Books", "Data Science", 45.00, 1),
    (9, "2023-01-09", "Electronics", "Keyboard", 60.00, 2),
    (10, "2023-01-10", "Clothing", "Shoes", 120.00, 1),
]

# Column name / Spark type pairs, in the same order as the tuple fields above.
# Every column is declared nullable; note that "date" is kept as a plain string.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("date", StringType()),
        ("category", StringType()),
        ("product", StringType()),
        ("price", DoubleType()),
        ("quantity", IntegerType()),
    )
])

df = spark.createDataFrame(data=sales_data, schema=schema)
print("Sample sales data created:")
df.show()
df.printSchema()

In [None]:
# Basic DataFrame operations
# count() runs a Spark job; the column list is read from the DataFrame itself.
row_total = df.count()
column_names = df.columns

print("DataFrame Info:")
print(f"Number of rows: {row_total}")
print(f"Number of columns: {len(column_names)}")
print(f"Columns: {column_names}")

# describe() produces summary statistics as another DataFrame, shown below.
print("\nDataFrame Statistics:")
summary_stats = df.describe()
summary_stats.show()

In [None]:
# Data transformations
# Line total for each sale: unit price multiplied by the quantity sold.
total_amount_expr = col("price") * col("quantity")
df_with_total = df.withColumn("total_amount", total_amount_expr)

print("DataFrame with total amount:")
df_with_total.show()

# Keep only the rows whose category is exactly "Electronics".
is_electronics = col("category") == "Electronics"
electronics_df = df_with_total.filter(is_electronics)
print("\nElectronics products only:")
electronics_df.show()

In [None]:
# Aggregations
# Aggregate functions are taken from the `F` namespace so that `sum` below is
# unambiguously Spark's column aggregate. The bare name `sum` only resolved to
# the Spark function because the wildcard import of pyspark.sql.functions
# rebinds Python's builtin of the same name.
print("Sales by Category:")
category_sales = (
    df_with_total
    .groupBy("category")
    .agg(
        F.sum("total_amount").alias("total_sales"),
        F.avg("price").alias("avg_price"),
        F.count("*").alias("num_items"),
    )
    # Highest-grossing category first.
    .orderBy(F.desc("total_sales"))
)

category_sales.show()

# Convert to Pandas for visualization. toPandas() collects the result to the
# driver, which is fine here because there is one row per category.
category_pandas = category_sales.toPandas()
print("\nCategory sales as Pandas DataFrame:")
print(category_pandas)

In [None]:
# Visualization
# One figure, two side-by-side panels, drawn through explicit Axes objects.
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 6))

# Left panel: bar chart of sales by category
ax_bar.bar(category_pandas['category'], category_pandas['total_sales'])
ax_bar.set_title('Total Sales by Category')
ax_bar.set_xlabel('Category')
ax_bar.set_ylabel('Total Sales ($)')
ax_bar.tick_params(axis='x', labelrotation=45)

# Right panel: pie chart of item count by category
ax_pie.pie(
    category_pandas['num_items'],
    labels=category_pandas['category'],
    autopct='%1.1f%%',
)
ax_pie.set_title('Number of Items by Category')

fig.tight_layout()
plt.show()

In [None]:
# Working with RDDs (lower-level API)
def _square(value):
    """Return ``value`` raised to the second power."""
    return value ** 2


def _is_even(value):
    """Return True when ``value`` is divisible by two."""
    return value % 2 == 0


def _add(left, right):
    """Return the sum of two values; used as the reduce combiner."""
    return left + right


# Distribute the integers 1..1000 as an RDD.
rdd = spark.sparkContext.parallelize(range(1, 1001))

# Map, filter, reduce: square everything, keep the even squares, add them up.
squared_rdd = rdd.map(_square)
even_squares = squared_rdd.filter(_is_even)
sum_even_squares = even_squares.reduce(_add)

print(f"Sum of even squares from 1 to 1000: {sum_even_squares}")
print(f"First 10 even squares: {even_squares.take(10)}")

In [None]:
# Window functions
# `Window` comes from the notebook's top import cell. Column functions are
# taken from the `F` namespace so that `sum` is unambiguously Spark's aggregate
# and not Python's builtin, which the wildcard functions import rebinds.

# Within each category, order rows from the largest total_amount downwards.
window_spec = Window.partitionBy("category").orderBy(F.desc("total_amount"))

# Frame for the running total: from the first row of the partition up to and
# including the current row.
running_frame = window_spec.rowsBetween(
    Window.unboundedPreceding, Window.currentRow
)

# Add row numbers and running totals
df_windowed = (
    df_with_total
    .withColumn("rank_in_category", F.row_number().over(window_spec))
    .withColumn("running_total", F.sum("total_amount").over(running_frame))
)

print("DataFrame with window functions:")
df_windowed.select(
    "category", "product", "total_amount", "rank_in_category", "running_total"
).show()

In [None]:
# Save data to HDFS
hdfs_path = "hdfs://namenode:9000/user/data/sales_data"

# Write as CSV with a header row, replacing any previous output at this path.
(
    df_with_total.write
    .mode("overwrite")
    .option("header", "true")
    .csv(hdfs_path)
)

print(f"Sales data saved to HDFS: {hdfs_path}")

# Verify the save
# Read back with the schema that was written rather than inferSchema:
# inference needs an extra pass over the files and may pick different column
# types (for example for the "date" strings) than the DataFrame that was saved.
df_loaded = (
    spark.read
    .option("header", "true")
    .schema(df_with_total.schema)
    .csv(hdfs_path)
)

# An actual check: the round trip must preserve the number of rows.
saved_row_count = df_with_total.count()
loaded_row_count = df_loaded.count()
if loaded_row_count != saved_row_count:
    raise RuntimeError(
        f"HDFS round trip mismatch: wrote {saved_row_count} rows, "
        f"read back {loaded_row_count} from {hdfs_path}"
    )

print("\nData loaded back from HDFS:")
df_loaded.show(5)

In [None]:
# Performance monitoring
# Print identifying details of the running application, all read from the
# session's SparkContext.
spark_ctx = spark.sparkContext

print("Spark Application Information:")
print(f"Application ID: {spark_ctx.applicationId}")
print(f"Application Name: {spark_ctx.appName}")
print(f"Default Parallelism: {spark_ctx.defaultParallelism}")
print(f"Web UI URL: {spark_ctx.uiWebUrl}")

# Clean up: stop the session. Cells that use `spark` after this point will need
# the session-creation cell to be run again.
spark.stop()
print("\nSpark session stopped successfully!")