# Example 2: Sales Analysis

Analyze a small in-memory sales dataset with PySpark to find revenue insights by category, month, and product.

This demonstrates:
- Creating DataFrames from data
- Column calculations
- Grouping and aggregations
- Filtering and sorting

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Obtain the Spark session for this notebook (an existing session is reused
# if there is one), registered under the application name "SalesAnalysis".
spark = (
    SparkSession.builder
    .appName("SalesAnalysis")
    .getOrCreate()
)

In [None]:
# In-memory sample: one tuple per sale, laid out as
# (month, category, product, quantity, price).
sales_data = [
    ("2023-01", "Electronics", "Laptop", 5, 999.99),
    ("2023-01", "Electronics", "Mouse", 20, 25.50),
    ("2023-01", "Books", "Python Guide", 15, 45.00),
    ("2023-02", "Electronics", "Laptop", 8, 999.99),
    ("2023-02", "Books", "Data Science", 12, 55.00),
    ("2023-02", "Furniture", "Desk", 3, 299.99),
    ("2023-03", "Electronics", "Monitor", 10, 299.99),
    ("2023-03", "Furniture", "Chair", 15, 149.99),
    ("2023-03", "Books", "ML Basics", 8, 49.99),
]

# Explicit schema: three string columns, an integer quantity and a double
# price. Every column is declared nullable.
schema = (
    StructType()
    .add("month", StringType(), True)
    .add("category", StringType(), True)
    .add("product", StringType(), True)
    .add("quantity", IntegerType(), True)
    .add("price", DoubleType(), True)
)

# Build the DataFrame from the local rows and display it.
df = spark.createDataFrame(data=sales_data, schema=schema)
print("Sales Data:")
df.show()

In [None]:
# Append a "revenue" column holding quantity * price for each sale row.
df_with_revenue = df.withColumn("revenue", df["quantity"] * df["price"])

print("\nSales with Revenue:")
df_with_revenue.show()

In [None]:
# Per-category rollup: summed revenue, summed quantity and row count,
# listed from the highest-earning category down.
revenue_by_category = (
    df_with_revenue
    .groupBy("category")
    .agg(
        F.sum("revenue").alias("total_revenue"),
        F.sum("quantity").alias("total_quantity"),
        F.count("*").alias("num_transactions"),
    )
    .orderBy(F.desc("total_revenue"))
)

print("\nRevenue by Category:")
revenue_by_category.show()

In [None]:
# Per-month revenue totals, listed in ascending month order.
revenue_by_month = (
    df_with_revenue
    .groupBy("month")
    .agg(F.sum("revenue").alias("total_revenue"))
    .sort(F.asc("month"))
)

print("\nRevenue by Month:")
revenue_by_month.show()

In [None]:
# Top 3 products by revenue.
# Revenue is summed per product and ranked from highest to lowest. The
# product name is a secondary sort key so that, if two products tie on
# total revenue, the rows kept by limit(3) are the same on every run
# instead of depending on partition/shuffle order.
top_products = (
    df_with_revenue
    .groupBy("product")
    .agg(F.sum("revenue").alias("total_revenue"))
    .orderBy(F.col("total_revenue").desc(), F.col("product").asc())
    .limit(3)
)

print("\nTop 3 Products by Revenue:")
top_products.show()

In [None]:
# Keep only the sale rows whose revenue is strictly greater than 1000.
high_value = df_with_revenue.where(F.col("revenue") > 1000)

print("\nHigh-Value Transactions (>$1000):")
high_value.show()

In [None]:
# Whole-dataset summary of the revenue column: sum, mean, largest and
# smallest value, computed as a single global aggregation (no grouping keys).
overall_stats = (
    df_with_revenue
    .groupBy()
    .agg(
        F.sum("revenue").alias("total_revenue"),
        F.avg("revenue").alias("avg_revenue"),
        F.max("revenue").alias("max_revenue"),
        F.min("revenue").alias("min_revenue"),
    )
)

print("\nOverall Statistics:")
overall_stats.show()

In [None]:
# Stop the Spark session now that the analysis is finished.
spark.stop()