# PySpark Cheat Sheet
A comprehensive reference of commonly used PySpark commands for data engineers and analysts.

## Setup & Initialization

In [None]:
from pyspark.sql import SparkSession

# Entry point for the DataFrame API: getOrCreate() returns the active
# session if there is one, otherwise it builds a new one named "MyApp".
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

## Read & Write Data

In [None]:
# Readers: each call returns a new DataFrame and `df` is rebound every time,
# so the writes below operate on the last one (the Parquet read).
df = spark.read.csv("path/file.csv", header=True, inferSchema=True)
df = spark.read.json("path/file.json")
df = spark.read.parquet("path/file.parquet")

# Writers: pass mode="overwrite" on every call so the cell can be re-run.
# Without it the writer falls back to its default error-if-exists mode and
# fails as soon as the output path is already present.
df.write.csv("output.csv", header=True, mode="overwrite")
df.write.json("output.json", mode="overwrite")
df.write.parquet("output.parquet", mode="overwrite")

## Data Exploration

In [None]:
# Print the first rows as a text table (20 rows by default).
df.show()
# Print column names, types and nullability as a tree.
df.printSchema()
# List of column names.
df.columns
# Basic summary statistics (count, mean, stddev, min, max).
df.describe().show()
# Number of rows; this is an action and triggers a Spark job.
df.count()
# List of (column name, type string) tuples.
df.dtypes

## DataFrame Transformations

In [None]:
from pyspark.sql.functions import col, lit, when, expr

# Every call below returns a NEW DataFrame and leaves `df` untouched;
# assign the result (e.g. `df = df.select(...)`) to keep it.
# Keep only the listed columns.
df.select("col1", "col2")
# Add (or replace) a column computed from an existing one.
df.withColumn("new_col", col("existing_col") * 10)
# Rename column "old" to "new".
df.withColumnRenamed("old", "new")
# Remove a column.
df.drop("col_to_drop")
# Conditional column: 1 where value > 100, otherwise 0.
df.withColumn("flag", when(col("value") > 100, 1).otherwise(0))

## Filtering & Sorting

In [None]:
# Filter with a Column expression (`col` comes from pyspark.sql.functions).
df.filter(col("age") > 30)
# where() is an alias of filter(); here the condition is a SQL string.
df.where("age > 30 AND city = 'NY'")
# Sort ascending by salary (the default direction).
df.orderBy("salary")
# Sort descending by salary.
df.orderBy(col("salary").desc())

## Aggregations & Grouping

In [None]:
# Row count per department.
df.groupBy("department").count()
# Dict form: {column: aggregate function name}. The result columns get
# generated names such as "avg(salary)".
df.groupBy("department").agg({"salary": "avg", "age": "max"})

# Import the functions module under an alias rather than
# `from pyspark.sql.functions import avg, max`, which would rebind the
# Python builtin `max` for the rest of the session.
from pyspark.sql import functions as F

# Column form: explicit aggregate expressions with readable aliases.
df.groupBy("department").agg(
    F.avg("salary").alias("avg_salary"),
    F.max("age").alias("max_age"),
)

## Joins

In [None]:
# `df1` and `df2` are placeholder DataFrames (not defined in this sheet).
# Joining on a Column expression keeps the `id` column from both sides in
# the result; the third argument selects the join type.
df1.join(df2, df1.id == df2.id, "inner")

## Window Functions

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, rank

# Window: rows are grouped by department and ordered by salary (ascending)
# within each group.
windowSpec = Window.partitionBy("department").orderBy("salary")

# rank(): tied rows share a rank and the following rank is skipped.
df.withColumn("rank", rank().over(windowSpec))
# row_number(): consecutive numbers 1, 2, 3, ... within each partition.
df.withColumn("row_num", row_number().over(windowSpec))

## Type Casting

In [None]:
# Add column "age_int" holding "age" converted to the integer type.
df.withColumn("age_int", col("age").cast("int"))

## Handling Missing Values

In [None]:
# Drop every row that contains at least one null value.
df.dropna()
# Replace nulls per column: {column name: replacement value}.
df.fillna({"age": 0, "name": "Unknown"})

## User Defined Functions (UDFs)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def upper_case(name):
    """Return ``name`` upper-cased, or ``None`` when ``name`` is ``None``.

    Spark passes SQL NULL values to Python UDFs as ``None``; the guard keeps
    a null cell from raising ``AttributeError`` and failing the whole job.
    """
    if name is None:
        return None
    return name.upper()

# Wrap the Python function as a Spark UDF whose result type is string.
upper_udf = udf(upper_case, StringType())
# Apply the UDF to the "name" column; the result is a new DataFrame.
# NOTE(review): for plain upper-casing, the built-in
# pyspark.sql.functions.upper does the same job without a Python UDF.
df.withColumn("upper_name", upper_udf(col("name")))

## SQL with PySpark

In [None]:
# Register `df` as a temporary view named "people" (replaces any existing
# view with that name).
df.createOrReplaceTempView("people")
# Query the view with SQL; the call returns a DataFrame.
spark.sql("SELECT name, age FROM people WHERE age > 30")

## Schema Definition

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema: two nullable columns, a string "name" and an integer "age".
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
    ]
)

# Read the CSV with the declared schema instead of inferring one.
df = (
    spark.read
    .schema(schema)
    .csv("people.csv")
)

## Partitioning & Performance

In [None]:
# repartition() and coalesce() return new DataFrames; assign the result to
# use it.
# Redistribute the data into 4 partitions (performs a full shuffle).
df.repartition(4)
# Reduce to a single partition without a full shuffle.
df.coalesce(1)
# Mark the DataFrame for caching with the default storage level; the data
# is materialized when the first action runs.
df.cache()
# Without arguments this is equivalent to cache(); pass a StorageLevel to
# choose a different level.
df.persist()