# PySpark Quickstart Guide

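This guide introduces Apache Spark with Python (PySpark). It covers the basic concepts and operations: starting a session, working with RDDs and DataFrames, running SQL queries, and a few simple performance checks.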

## 1. Initialize Spark Session

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Build the session (or reuse one that already exists). The two config
# entries switch on adaptive query execution and its partition coalescing.
spark = (
    SparkSession.builder
    .appName("PySpark Quickstart")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# Raise the log threshold to WARN so cell output stays readable
spark.sparkContext.setLogLevel("WARN")

# Report which Spark version is running and where its web UI lives
print(f"Spark version: {spark.version}")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")

## 2. Working with RDDs

In [None]:
# Distribute a small Python list as an RDD
numbers = spark.sparkContext.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Transformations only describe new RDDs; nothing runs until an action
# such as collect() is called on them below.
evens = numbers.filter(lambda value: value % 2 == 0)
squares = numbers.map(lambda value: value * value)

# Actions: each call below runs a job and returns its result to the driver
print("Numbers:", numbers.collect())
print("Count:", numbers.count())
print("Sum:", numbers.reduce(lambda left, right: left + right))
print("Even numbers:", evens.collect())
print("Squared:", squares.collect())

## 3. Working with DataFrames

In [None]:
# Sample employee records: (name, age, job, salary)
data = [
    ("Alice", 25, "Engineer", 75000),
    ("Bob", 30, "Data Scientist", 85000),
    ("Charlie", 35, "Manager", 95000),
    ("Diana", 28, "Engineer", 78000),
    ("Eve", 32, "Data Scientist", 88000),
]

# Explicit schema: one nullable field per tuple position, in order
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("age", IntegerType()),
        ("job", StringType()),
        ("salary", IntegerType()),
    )
])

# Combine the records and the schema into a DataFrame
df = spark.createDataFrame(data, schema)

# Inspect the result: column types first, then rows, then summary stats
print("Schema:")
df.printSchema()

print("\nData:")
df.show()

print("\nStatistics:")
summary_stats = df.describe()
summary_stats.show()

## 4. DataFrame Operations

In [None]:
# Projection: keep only two of the columns
print("Names and salaries:")
df.select(df["name"], df["salary"]).show()

# Row filter: where() is an alias of filter()
print("\nEngineers:")
engineers = df.where(df["job"] == "Engineer")
engineers.show()

# Aggregation: one output row per job with the mean salary
print("\nAverage salary by job:")
salary_by_job = df.groupBy("job").agg(avg("salary").alias("avg_salary"))
salary_by_job.show()

# Derived column: withColumn returns a new DataFrame, df itself is unchanged
print("\nWith bonus column:")
with_bonus = df.withColumn("bonus", df["salary"] * 0.1)
with_bonus.show()

## 5. SQL Queries

In [None]:
# Expose the DataFrame to Spark SQL under the name "employees"
df.createOrReplaceTempView("employees")

# Per-job headcount, mean salary and oldest age, highest-paid job first
job_summary_sql = """
    SELECT job, 
           COUNT(*) as count,
           AVG(salary) as avg_salary,
           MAX(age) as max_age
    FROM employees 
    GROUP BY job
    ORDER BY avg_salary DESC
"""

# spark.sql returns a DataFrame, so the usual DataFrame methods apply
result = spark.sql(job_summary_sql)
result.show()

## 6. Performance Tips

In [None]:
# Report how many partitions currently back the DataFrame
print(f"Number of partitions: {df.rdd.getNumPartitions()}")

# Mark df for caching. cache() is lazy: the data is stored the first time
# an action computes it, and later actions on df can then reuse it.
df.cache()

# repartition(4) returns a new DataFrame spread over 4 partitions (this
# involves a shuffle); df itself keeps its original partitioning.
df_repartitioned = df.repartition(4)
print(f"After repartitioning: {df_repartitioned.rdd.getNumPartitions()}")

# Print the physical plan Spark would run for a simple filter
high_earners = df.where(col("salary") > 80000)
high_earners.explain()

## 7. Cleanup

In [None]:
# Stop the Spark session. This must be the last Spark call in the notebook:
# `spark` cannot be used for further work after stop() returns.
spark.stop()
print("Spark session stopped.")