# Hive SQL Operations

This notebook demonstrates how to use Hive SQL with Spark for data warehousing operations.


In [None]:
# Import libraries
# findspark.init() runs before any pyspark import below; presumably it makes the
# local Spark installation importable, so keep this ordering.
import findspark
findspark.init()

from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports hide where names come from.
# pyspark.sql.functions is known to export names such as sum/min/max that
# shadow Python builtins - consider `from pyspark.sql import functions as F`
# and explicit type imports once all cells that rely on these names are checked.
from pyspark.sql.functions import *
from pyspark.sql.types import *

print("Libraries imported successfully!")

In [None]:
# Create Spark Session with Hive support
# Session settings live in one mapping and are applied in insertion order.
hive_session_conf = {
    "spark.sql.catalogImplementation": "hive",
    "spark.sql.warehouse.dir": "hdfs://namenode:9000/user/hive/warehouse",
    "hive.metastore.uris": "thrift://hive-metastore:9083",
}

session_builder = SparkSession.builder.appName("Hive-SQL-Operations")
for conf_key, conf_value in hive_session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.enableHiveSupport().getOrCreate()

# Report the values the live session actually reports back
print(f"Spark Version: {spark.version}")
catalog_impl = spark.conf.get('spark.sql.catalogImplementation')
print(f"Catalog Implementation: {catalog_impl}")
warehouse_dir = spark.conf.get('spark.sql.warehouse.dir')
print(f"Warehouse Directory: {warehouse_dir}")

In [None]:
# Check existing databases
print("Existing databases:")
existing_databases = spark.sql("SHOW DATABASES")
existing_databases.show()

# Create the demo database if it is missing, then make it the current one
setup_statements = (
    "CREATE DATABASE IF NOT EXISTS bigdata_demo",
    "USE bigdata_demo",
)
for setup_statement in setup_statements:
    spark.sql(setup_statement)
print("\nUsing database: bigdata_demo")

In [None]:
# Create sample customer data
# Each row is (customer_id, name, email, state, membership).
customer_data = [
    (1, "John Doe", "john@email.com", "New York", "Premium"),
    (2, "Jane Smith", "jane@email.com", "California", "Standard"),
    (3, "Bob Johnson", "bob@email.com", "Texas", "Premium"),
    (4, "Alice Brown", "alice@email.com", "Florida", "Basic"),
    (5, "Charlie Wilson", "charlie@email.com", "New York", "Standard"),
]

# Column names and types, listed once; every column is declared nullable.
customer_columns = [
    ("customer_id", IntegerType()),
    ("name", StringType()),
    ("email", StringType()),
    ("state", StringType()),
    ("membership", StringType()),
]
customer_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in customer_columns]
)

customer_df = spark.createDataFrame(customer_data, schema=customer_schema)
print("Customer data created:")
customer_df.show()

In [None]:
# Create sample order data
# Each row is (order_id, customer_id, order_date, amount, category).
# order_date is kept as an ISO-formatted string, matching the schema below.
order_data = [
    (101, 1, "2023-01-15", 250.00, "Electronics"),
    (102, 2, "2023-01-16", 120.00, "Books"),
    (103, 1, "2023-01-17", 75.00, "Clothing"),
    (104, 3, "2023-01-18", 400.00, "Electronics"),
    (105, 4, "2023-01-19", 30.00, "Books"),
    (106, 2, "2023-01-20", 200.00, "Electronics"),
    (107, 5, "2023-01-21", 90.00, "Clothing"),
    (108, 3, "2023-01-22", 180.00, "Books"),
    (109, 1, "2023-01-23", 320.00, "Electronics"),
    (110, 4, "2023-01-24", 45.00, "Clothing"),
]

# Column names and types, listed once; every column is declared nullable.
order_columns = [
    ("order_id", IntegerType()),
    ("customer_id", IntegerType()),
    ("order_date", StringType()),
    ("amount", DoubleType()),
    ("category", StringType()),
]
order_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in order_columns]
)

order_df = spark.createDataFrame(order_data, schema=order_schema)
print("Order data created:")
order_df.show()

In [None]:
# Create Hive tables
# Each entry: (source DataFrame, table name, storage path).
# Table names are unqualified, so they resolve against the current database.
# NOTE(review): passing an explicit "path" typically registers the table as
# external, so dropping it would leave the files behind - confirm this is intended.
table_targets = [
    (
        customer_df,
        "customers",
        "hdfs://namenode:9000/user/hive/warehouse/bigdata_demo.db/customers",
    ),
    (
        order_df,
        "orders",
        "hdfs://namenode:9000/user/hive/warehouse/bigdata_demo.db/orders",
    ),
]

for source_df, table_name, table_path in table_targets:
    table_writer = source_df.write.mode("overwrite").option("path", table_path)
    table_writer.saveAsTable(table_name)

print("Hive tables created successfully!")

# Show tables in the database
print("\nTables in bigdata_demo database:")
tables_listing = spark.sql("SHOW TABLES")
tables_listing.show()

In [None]:
# Basic SQL queries
# Full scans of both demo tables.
print("All customers:")
all_customers = spark.sql("SELECT * FROM customers")
all_customers.show()

print("\nAll orders:")
all_orders = spark.sql("SELECT * FROM orders")
all_orders.show()

In [None]:
# Advanced SQL queries
# Per-customer order count, total and average amount, highest spender first.
# The LEFT JOIN keeps customers that have no matching orders.
customer_summary_sql = """
    SELECT 
        c.name,
        c.state,
        c.membership,
        COUNT(o.order_id) as total_orders,
        SUM(o.amount) as total_spent,
        AVG(o.amount) as avg_order_value
    FROM customers c
    LEFT JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name, c.state, c.membership
    ORDER BY total_spent DESC
"""

print("Customer order summary:")
customer_summary = spark.sql(customer_summary_sql)

customer_summary.show()

In [None]:
# Sales by category and membership
# Inner join: only customers with at least one order contribute rows.
category_analysis_sql = """
    SELECT 
        c.membership,
        o.category,
        COUNT(*) as order_count,
        SUM(o.amount) as total_sales,
        AVG(o.amount) as avg_amount
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.membership, o.category
    ORDER BY c.membership, total_sales DESC
"""

print("Sales analysis by category and membership:")
category_analysis = spark.sql(category_analysis_sql)

category_analysis.show()

In [None]:
# Window functions in SQL
# Ranks each customer's summed order amount twice: across all customers
# (spending_rank) and within the customer's state (state_rank).
customer_ranking_sql = """
    SELECT 
        c.name,
        c.state,
        SUM(o.amount) as total_spent,
        RANK() OVER (ORDER BY SUM(o.amount) DESC) as spending_rank,
        RANK() OVER (PARTITION BY c.state ORDER BY SUM(o.amount) DESC) as state_rank
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name, c.state
    ORDER BY total_spent DESC
"""

print("Customer ranking by total spending:")
customer_ranking = spark.sql(customer_ranking_sql)

customer_ranking.show()

In [None]:
# Create a partitioned table for better performance
print("Creating partitioned table by category:")

# Write the orders as a table partitioned on the category column
orders_partitioned_path = "hdfs://namenode:9000/user/hive/warehouse/bigdata_demo.db/orders_partitioned"
partitioned_writer = (
    order_df.write
    .mode("overwrite")
    .partitionBy("category")
    .option("path", orders_partitioned_path)
)
partitioned_writer.saveAsTable("orders_partitioned")

print("Partitioned table created!")

# Query the partitioned table, filtering on the partition column
electronics_orders_sql = """
    SELECT order_id, customer_id, order_date, amount 
    FROM orders_partitioned 
    WHERE category = 'Electronics'
"""

print("\nQuerying Electronics orders from partitioned table:")
electronics_orders = spark.sql(electronics_orders_sql)
electronics_orders.show()

In [None]:
# Create a view for complex queries
# Temporary view with per-customer order metrics; the LEFT JOIN keeps
# customers that have no matching orders.
customer_metrics_ddl = """
    CREATE OR REPLACE TEMPORARY VIEW customer_metrics AS
    SELECT 
        c.customer_id,
        c.name,
        c.state,
        c.membership,
        COUNT(o.order_id) as order_count,
        SUM(o.amount) as total_spent,
        AVG(o.amount) as avg_order_value,
        MIN(o.order_date) as first_order,
        MAX(o.order_date) as last_order
    FROM customers c
    LEFT JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name, c.state, c.membership
"""
spark.sql(customer_metrics_ddl)

print("Customer metrics view created!")

# Use the view: Premium members whose total spend exceeds 300
premium_high_spenders_sql = """
    SELECT name, state, total_spent, order_count
    FROM customer_metrics
    WHERE membership = 'Premium' AND total_spent > 300
    ORDER BY total_spent DESC
"""

print("\nPremium customers with high spending:")
premium_high_spenders = spark.sql(premium_high_spenders_sql)
premium_high_spenders.show()

In [None]:
# Export results to HDFS
# One variable holds the export location so the write and the read-back
# below always target the same directory.
customer_summary_path = "hdfs://namenode:9000/user/data/customer_summary"

print("Exporting customer summary to HDFS...")
(
    customer_summary.write
    .mode("overwrite")
    .option("header", "true")
    .csv(customer_summary_path)
)

print("Export completed!")

# Verify the export by reading the CSV files back
exported_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(customer_summary_path)
)

print("\nVerifying exported data:")
exported_df.show(3)

# Showing a few rows does not prove the export is complete, so compare the
# number of rows read back against the number of rows in the source DataFrame.
expected_row_count = customer_summary.count()
exported_row_count = exported_df.count()
assert exported_row_count == expected_row_count, (
    f"Export row count mismatch: expected {expected_row_count}, "
    f"read back {exported_row_count}"
)
print(f"Row count verified: {exported_row_count} rows")

In [None]:
# Show table information
# Metadata for the customers table (up to 50 rows, values not truncated)
print("Customers table information:")
customers_description = spark.sql("DESCRIBE FORMATTED customers")
customers_description.show(50, truncate=False)

# Partitions registered for the partitioned orders table
print("\nPartitioned orders table information:")
orders_partitions = spark.sql("SHOW PARTITIONS orders_partitioned")
orders_partitions.show()

In [None]:
# Clean up
# Print a short session summary, then stop the session.
print("Session information:")
try:
    # The catalog API returns the current database name directly, without
    # running a SQL query and indexing into the collected rows.
    print(f"Current database: {spark.catalog.currentDatabase()}")
    print(f"Application ID: {spark.sparkContext.applicationId}")
finally:
    # Stop the session even if one of the lookups above raises, so it is
    # not left running after a failed cell.
    spark.stop()

print("\nSpark session stopped successfully!")