# Full Stack Big Data Integration

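This notebook demonstrates an end-to-end integration of Hadoop (HDFS), Spark, and Hive: it loads sample data, stores it in HDFS, registers Hive tables, runs SQL analytics, visualizes the results, and exports them back to HDFS.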


In [None]:
# Import all required libraries
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

print("All libraries imported successfully!")

In [None]:
# Create comprehensive Spark session
# Settings are applied in the order listed: Hive catalog + warehouse/metastore
# locations first, then the adaptive query execution switches.
spark_settings = {
    "spark.sql.catalogImplementation": "hive",
    "spark.sql.warehouse.dir": "hdfs://namenode:9000/user/hive/warehouse",
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
}

session_builder = SparkSession.builder.appName("Full-Stack-BigData-Integration")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# getOrCreate() reuses an already-running session if the kernel has one.
spark = session_builder.enableHiveSupport().getOrCreate()

# Report where the session is running so the reader can find the Spark UI.
print(
    f"Spark Version: {spark.version}",
    f"Master: {spark.sparkContext.master}",
    f"Application ID: {spark.sparkContext.applicationId}",
    f"Web UI: {spark.sparkContext.uiWebUrl}",
    sep="\n",
)

In [None]:
# Load sample data from the mounted data directory
print("Loading sample datasets...")

# Users: CSV whose first row is the header; column types are inferred by Spark.
users_csv_path = "/home/jovyan/data/samples/users.csv"
users_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(users_csv_path)
)

print("Users data:")
users_df.show(5)

# Transactions: JSON read in multiline mode (records may span several lines).
transactions_json_path = "/home/jovyan/data/samples/transactions.json"
transactions_df = (
    spark.read
    .option("multiline", "true")
    .json(transactions_json_path)
)

print("\nTransactions data:")
transactions_df.show(5)

In [None]:
# Process log data
print("Processing log data...")

# Read the raw log file as an RDD of text lines (one element per line)
raw_log_path = "/home/jovyan/data/samples/logs.txt"
log_rdd = spark.sparkContext.textFile(raw_log_path)

# Parse log entries using regex
# Captured groups, in order: "YYYY-MM-DD HH:MM:SS" timestamp, level word,
# bracketed thread name, and the remainder of the line as the message.
log_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+)\s+\[([^\]]+)\] (.*)'

def parse_log_line(line):
    """Split one raw log line into its four fields.

    Args:
        line: A single line of log text.

    Returns:
        A ``(timestamp, level, thread, message)`` tuple of strings when the
        beginning of ``line`` matches ``log_pattern``; ``None`` otherwise.
    """
    parsed = re.match(log_pattern, line)
    # All four groups are mandatory in the pattern, so groups() is exactly
    # the (group(1), ..., group(4)) tuple.
    return parsed.groups() if parsed else None

# Parse every line and keep only the ones the pattern accepted
# (parse_log_line yields None for lines that do not match).
parsed_logs = (
    log_rdd
    .map(parse_log_line)
    .filter(lambda record: record is not None)
)

# Convert to DataFrame: each captured field becomes a nullable string column
log_columns = ("timestamp", "level", "thread", "message")
log_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in log_columns]
)

logs_df = spark.createDataFrame(parsed_logs, log_schema)
print("Parsed log data:")
logs_df.show(5, truncate=False)

In [None]:
# Save all data to HDFS
print("Saving data to HDFS...")

# Users -> CSV with a header row; any previous output at the path is replaced
users_writer = users_df.write.mode("overwrite").option("header", "true")
users_writer.csv("hdfs://namenode:9000/user/data/users")

# Transactions -> Parquet, one sub-directory per category value
transactions_writer = transactions_df.write.mode("overwrite").partitionBy("category")
transactions_writer.parquet("hdfs://namenode:9000/user/data/transactions")

# Logs -> Parquet, one sub-directory per log level
logs_writer = logs_df.write.mode("overwrite").partitionBy("level")
logs_writer.parquet("hdfs://namenode:9000/user/data/logs")

print("All data saved to HDFS successfully!")

In [None]:
# Create Hive database and tables
print("Creating Hive database and tables...")

# Make `analytics` the current database so the unqualified table names
# used below are created inside it.
for statement in ("CREATE DATABASE IF NOT EXISTS analytics", "USE analytics"):
    spark.sql(statement)

# Table name -> DataFrame that backs it; each table is replaced if it exists.
hive_tables = {
    "users": users_df,
    "transactions": transactions_df,
    "system_logs": logs_df,
}
for hive_table_name, source_frame in hive_tables.items():
    source_frame.write.mode("overwrite").saveAsTable(hive_table_name)

print("Hive tables created!")
spark.sql("SHOW TABLES").show()

In [None]:
# Comprehensive analytics using SQL
print("Running comprehensive analytics...")

# User transaction analysis
#
# Users are matched to transactions through a synthetic key: users are ranked
# by name and the rank is formatted as 'U001', 'U002', ...
# Spark does not allow a window function (ROW_NUMBER() OVER ...) inside a
# JOIN ... ON condition and fails the query at analysis time, so the key is
# computed in a CTE first and the join compares plain columns.
# NOTE(review): this mapping assumes transactions.user_id values follow the
# alphabetical order of users.name - confirm against the sample data, or add a
# real user_id column to users.csv.
# NOTE(review): grouping is by (name, city, salary), so two users sharing all
# three values are merged into one row.
user_analysis = spark.sql("""
    WITH users_with_id AS (
        SELECT
            name,
            city,
            salary,
            CONCAT('U', LPAD(CAST(ROW_NUMBER() OVER (ORDER BY name) AS STRING), 3, '0')) AS user_id
        FROM users
    )
    SELECT
        u.name,
        u.city,
        u.salary,
        COUNT(t.transaction_id) as transaction_count,
        SUM(t.amount) as total_spent,
        AVG(t.amount) as avg_transaction,
        MAX(t.amount) as max_transaction,
        COLLECT_LIST(t.category) as categories
    FROM users_with_id u
    LEFT JOIN transactions t ON u.user_id = t.user_id
    GROUP BY u.name, u.city, u.salary
    ORDER BY total_spent DESC NULLS LAST
""")

print("User transaction analysis:")
user_analysis.show(10, truncate=False)

In [None]:
# Category spending analysis
# One row per category with count, sum, mean, min, max and standard deviation
# of the transaction amount, largest total first.
category_query = """
    SELECT 
        category,
        COUNT(*) as transaction_count,
        SUM(amount) as total_amount,
        AVG(amount) as avg_amount,
        MIN(amount) as min_amount,
        MAX(amount) as max_amount,
        STDDEV(amount) as amount_stddev
    FROM transactions
    GROUP BY category
    ORDER BY total_amount DESC
"""
category_analysis = spark.sql(category_query)

print("Category spending analysis:")
category_analysis.show()

# Convert to Pandas for visualization (collects the aggregated rows to the driver)
category_pandas = category_analysis.toPandas()

In [None]:
# System logs analysis
# Entry count and number of distinct calendar days per (level, thread) pair,
# busiest pairs first.
logs_query = """
    SELECT 
        level,
        thread,
        COUNT(*) as log_count,
        COUNT(DISTINCT DATE(timestamp)) as active_days
    FROM system_logs
    GROUP BY level, thread
    ORDER BY log_count DESC
"""
logs_analysis = spark.sql(logs_query)

print("System logs analysis:")
logs_analysis.show()

# Error analysis: every ERROR-level entry in timestamp order
error_query = """
    SELECT 
        timestamp,
        thread,
        message
    FROM system_logs
    WHERE level = 'ERROR'
    ORDER BY timestamp
"""
error_analysis = spark.sql(error_query)

print("\nError logs:")
error_analysis.show(truncate=False)

In [None]:
# Advanced analytics with window functions
print("Advanced analytics with window functions...")

# Per (date, category): total, count and mean of amount, plus a running total
# of the daily totals within each category, accumulated in date order.
daily_trends_query = """
    SELECT 
        DATE(timestamp) as transaction_date,
        category,
        SUM(amount) as daily_total,
        COUNT(*) as daily_count,
        AVG(amount) as daily_avg,
        SUM(SUM(amount)) OVER (
            PARTITION BY category 
            ORDER BY DATE(timestamp) 
            ROWS UNBOUNDED PRECEDING
        ) as running_total
    FROM transactions
    GROUP BY DATE(timestamp), category
    ORDER BY transaction_date, category
"""
daily_trends = spark.sql(daily_trends_query)

print("Daily transaction trends:")
daily_trends.show(20)

In [None]:
# Data quality checks
print("Data quality assessment...")

# Check for data completeness
#
# One row per table with two checks each. UNION ALL takes its column names
# from the first SELECT only, so per-table aliases such as null_user_ids or
# invalid_amounts would be displayed under the users-specific names
# (null_names / null_salaries). Each check therefore carries its own label
# column next to a generically named failure count, keeping every row
# self-describing.
quality_check = spark.sql("""
    SELECT
        'users' AS table_name,
        COUNT(*) AS total_records,
        'null name' AS check_1,
        SUM(CASE WHEN name IS NULL THEN 1 ELSE 0 END) AS check_1_failures,
        'null salary' AS check_2,
        SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) AS check_2_failures
    FROM users

    UNION ALL

    SELECT
        'transactions' AS table_name,
        COUNT(*) AS total_records,
        'null user_id' AS check_1,
        SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) AS check_1_failures,
        'null or non-positive amount' AS check_2,
        SUM(CASE WHEN amount IS NULL OR amount <= 0 THEN 1 ELSE 0 END) AS check_2_failures
    FROM transactions

    UNION ALL

    SELECT
        'system_logs' AS table_name,
        COUNT(*) AS total_records,
        'null timestamp' AS check_1,
        SUM(CASE WHEN timestamp IS NULL THEN 1 ELSE 0 END) AS check_1_failures,
        'null level' AS check_2,
        SUM(CASE WHEN level IS NULL THEN 1 ELSE 0 END) AS check_2_failures
    FROM system_logs
""")

print("Data quality summary:")
quality_check.show(truncate=False)

In [None]:
# Create comprehensive visualizations
# 2 x 3 grid: category charts on the top row, logs / salary / amount below.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax_total, ax_share, ax_avg), (ax_levels, ax_salary, ax_amount) = axes

# Subplot 1: Category spending
ax_total.bar(category_pandas['category'], category_pandas['total_amount'])
ax_total.set_title('Total Spending by Category')
ax_total.set_xlabel('Category')
ax_total.set_ylabel('Total Amount ($)')
ax_total.tick_params(axis='x', labelrotation=45)

# Subplot 2: Transaction count by category
ax_share.pie(
    category_pandas['transaction_count'],
    labels=category_pandas['category'],
    autopct='%1.1f%%',
)
ax_share.set_title('Transaction Distribution by Category')

# Subplot 3: Average transaction amount
ax_avg.bar(category_pandas['category'], category_pandas['avg_amount'])
ax_avg.set_title('Average Transaction Amount by Category')
ax_avg.set_xlabel('Category')
ax_avg.set_ylabel('Average Amount ($)')
ax_avg.tick_params(axis='x', labelrotation=45)

# Get log level data for visualization
log_level_query = "SELECT level, COUNT(*) as count FROM system_logs GROUP BY level"
log_level_data = spark.sql(log_level_query).toPandas()

# Subplot 4: Log levels distribution
ax_levels.bar(log_level_data['level'], log_level_data['count'])
ax_levels.set_title('Log Entries by Level')
ax_levels.set_xlabel('Log Level')
ax_levels.set_ylabel('Count')

# Subplot 5: User salary distribution
salary_data = spark.sql("SELECT salary FROM users").toPandas()
ax_salary.hist(salary_data['salary'], bins=8, alpha=0.7)
ax_salary.set_title('User Salary Distribution')
ax_salary.set_xlabel('Salary ($)')
ax_salary.set_ylabel('Frequency')

# Subplot 6: Transaction amount distribution
amount_data = spark.sql("SELECT amount FROM transactions").toPandas()
ax_amount.hist(amount_data['amount'], bins=10, alpha=0.7, color='green')
ax_amount.set_title('Transaction Amount Distribution')
ax_amount.set_xlabel('Amount ($)')
ax_amount.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Export final results to HDFS
print("Exporting final analysis results...")

# Export user analysis
# The CSV data source cannot write array columns, and `categories` is produced
# by COLLECT_LIST (array<string>), so writing user_analysis as-is fails.
# Flatten it to a single '|'-separated string first; users without
# transactions get an empty string. concat_ws / col come from the notebook's
# `from pyspark.sql.functions import *`.
user_analysis_flat = user_analysis.withColumn(
    "categories", concat_ws("|", col("categories"))
)
user_analysis_flat.write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv("hdfs://namenode:9000/user/analytics/user_analysis")

# Export category analysis
category_analysis.write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv("hdfs://namenode:9000/user/analytics/category_analysis")

# Export daily trends
daily_trends.write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv("hdfs://namenode:9000/user/analytics/daily_trends")

print("All analysis results exported to HDFS!")

In [None]:
# Performance summary
# Printed section by section; each section is one print call with one
# argument per output line.
banner = "=" * 60

print(banner, "BIG DATA ENVIRONMENT INTEGRATION SUMMARY", banner, sep="\n")
print(
    f"✅ Spark Version: {spark.version}",
    f"✅ Application ID: {spark.sparkContext.applicationId}",
    f"✅ Master: {spark.sparkContext.master}",
    f"✅ Default Parallelism: {spark.sparkContext.defaultParallelism}",
    f"✅ Web UI: {spark.sparkContext.uiWebUrl}",
    sep="\n",
)
print()

# Each count() below runs a Spark job over the corresponding DataFrame.
print(
    "Data Processing Summary:",
    f"✅ Users processed: {users_df.count()}",
    f"✅ Transactions processed: {transactions_df.count()}",
    f"✅ Log entries processed: {logs_df.count()}",
    sep="\n",
)
print()

print(
    "Storage Summary:",
    "✅ Data saved to HDFS in multiple formats (CSV, Parquet)",
    "✅ Hive tables created for structured queries",
    "✅ Data partitioned for optimized queries",
    sep="\n",
)
print()

print(
    "Analytics Summary:",
    "✅ User transaction analysis completed",
    "✅ Category spending analysis completed",
    "✅ System logs analysis completed",
    "✅ Data quality assessment completed",
    "✅ Visualizations generated",
    sep="\n",
)
print()

# These status lines are static text; nothing is probed here.
print(
    "Integration Status:",
    "✅ Hadoop HDFS: Working",
    "✅ Spark Processing: Working",
    "✅ Hive Metastore: Working",
    "✅ Jupyter Environment: Working",
    sep="\n",
)
print(banner, "🎉 FULL STACK BIG DATA INTEGRATION SUCCESSFUL! 🎉", banner, sep="\n")

In [None]:
# Clean up
# Stop the SparkSession created at the top of the notebook; cells that use
# `spark` need the session cell re-run after this.
closing_message = "Spark session stopped. Integration demonstration complete!"
spark.stop()
print(closing_message)