Analyze/profile employees and plot graphs

In [0]:
# Read the employees table (catalog.schema.table) into a Spark DataFrame
employees_table = 'hub.genie_demo.employees'
employees_df = spark.table(employees_table)

# Print the first 5 rows as plain text for a quick look at the data
employees_df.show(n=5)

# Hand the first 20 rows to the notebook's display() helper
preview_rows = employees_df.limit(20)
display(preview_rows)

In [0]:
# Data profiling for employees table
from pyspark.sql import functions as F

# Column names and data types
print('Schema:')
employees_df.printSchema()

# Count missing values per column in one aggregation: F.when(...) without an
# otherwise() yields NULL for rows that do not match, and F.count() skips
# NULLs, so each aggregate counts only the rows where that column is NULL.
print('\nMissing values per column:')
employees_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in employees_df.columns]).show()

# Unique value counts per column.
# Each distinct().count() triggers a Spark job, so compute it exactly once per
# column and reuse the result when printing.
print('\nUnique value counts:')
for c in employees_df.columns:
    distinct_count = employees_df.select(c).distinct().count()
    print(f"{c}: {distinct_count}")

# Basic statistics; describe() with no arguments covers numeric and string columns
print('\nSummary statistics:')
employees_df.describe().show()

# If ydata-profiling is available, generate a profile report
try:
    from ydata_profiling import ProfileReport
    # NOTE: toPandas() collects the entire table onto the driver; this is only
    # appropriate while the employees table is small enough to fit in memory.
    profile = ProfileReport(employees_df.toPandas(), title="Employees Table Profile", explorative=True)
    profile.to_widgets()
except ImportError:
    print('ydata-profiling not installed. Skipping detailed profile report.')

In [0]:
import matplotlib.pyplot as plt
import pandas as pd

# Collect the employees table into pandas so matplotlib can plot it
pdf = employees_df.toPandas()

# --- Histogram of hire dates -------------------------------------------------
hire_fig, hire_ax = plt.subplots(figsize=(8, 4))
# Parse the column in place so the histogram bins datetime values
pdf['hire_date'] = pd.to_datetime(pdf['hire_date'])
pdf['hire_date'].hist(bins=10, ax=hire_ax)
hire_ax.set_title('Hire Date Distribution')
hire_ax.set_xlabel('Hire Date')
hire_ax.set_ylabel('Number of Employees')
hire_fig.tight_layout()
plt.show()

# --- Bar chart of head count per store ---------------------------------------
store_fig, store_ax = plt.subplots(figsize=(6, 4))
# Order the bars by store id rather than by frequency
employees_per_store = pdf['store_id'].value_counts().sort_index()
employees_per_store.plot(kind='bar', ax=store_ax)
store_ax.set_title('Employee Count by Store')
store_ax.set_xlabel('Store ID')
store_ax.set_ylabel('Number of Employees')
store_fig.tight_layout()
plt.show()