In [2]:
# Cell 1: Install package from GitHub
%pip install "git+https://github.com/kpopov95-code/library-pipeline.git"

StatementMeta(, 8c008e59-c68d-4cfd-97f6-0062a3cdd199, 15, Finished, Available, Finished)

Collecting git+https://github.com/kpopov95-code/library-pipeline.git
  Cloning https://github.com/kpopov95-code/library-pipeline.git to /tmp/pip-req-build-2di219s3
  Running command git clone --filter=blob:none --quiet https://github.com/kpopov95-code/library-pipeline.git /tmp/pip-req-build-2di219s3
  Resolved https://github.com/kpopov95-code/library-pipeline.git to commit 1331b993ceedc34e0194943eac5bb38d7123e06e
  Installing build dependencies ... [?25l- \ | / - done
[?25h  Getting requirements to build wheel ... [?25l- \ done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- \ done
[?25hBuilding wheels for collected packages: library-pipeline
  Building wheel for library-pipeline (pyproject.toml) ... [?25l- \ | done
[?25h  Created wheel for library-pipeline: filename=library_pipeline-0.1.0-py3-none-any.whl size=5464 sha256=978b231784b6e5b65194f59e6926699831d50214378f61efaf67bf5900720228
  Stored in directory: /tmp/pip-ephem-wheel-cache-5ujt13cd/whe

In [3]:
# Cell 2: Import the ingestion and cleaning helpers
# `data_processing` is presumably the module shipped by the library-pipeline
# package installed in the previous cell — TODO confirm against the package layout.
# NOTE(review): load_csv and load_json are imported but not called in the
# cells visible here.
from data_processing.ingestion import load_csv, load_json
from data_processing.cleaning import (
    remove_duplicates, 
    handle_missing_values, 
    standardize_dates
)

# Reaching this line means every import above resolved without error.
print("✅ Package installed and imported successfully!")

StatementMeta(, 8c008e59-c68d-4cfd-97f6-0062a3cdd199, 17, Finished, Available, Finished)

✅ Package installed and imported successfully!


In [4]:
# Cell 3: Load data from Lakehouse Files
import pandas as pd

# Bronze-layer circulation export, read from the default Lakehouse "Files" path
file_path = "/lakehouse/default/Files/bronze/circulation_data.csv"
df_raw = pd.read_csv(file_path)

# Row count on the first line, followed by a preview of the first records
print(f"Loaded {len(df_raw)} rows", df_raw.head(), sep="\n")

StatementMeta(, 8c008e59-c68d-4cfd-97f6-0062a3cdd199, 18, Finished, Available, Finished)

Loaded 5100 rows
  transaction_id member_id  ... return_date branch_id
0      TXN000000    M93810  ...  2024-08-25     BR012
1      TXN000001    M28289  ...  2024-09-02     BR011
2      TXN000002    M21395  ...         NaN     BR001
3      TXN000003    M38657  ...         NaN     BR010
4      TXN000004    M36062  ...  2025-02-16     BR012

[5 rows x 6 columns]


In [5]:
# Cell 4: Apply your cleaning functions (BRONZE → SILVER)
# Each stage feeds the previous frame into one cleaning helper and reports
# the row count, so the effect of every step is visible in the output.
print("Applying data cleaning pipeline...")

# Stage 1: de-duplicate, keyed on transaction_id
df_clean = df_raw.pipe(remove_duplicates, subset=['transaction_id'])
print(f"After removing duplicates: {len(df_clean)} rows")

# Stage 2: missing values, using the helper's 'drop' strategy
# NOTE(review): the raw preview shows NaN return_date values, which may be
# loans that have not been returned yet — confirm that dropping those rows
# is intended for the silver table.
df_clean = df_clean.pipe(handle_missing_values, strategy='drop')
print(f"After handling missing values: {len(df_clean)} rows")

# Stage 3: standardize both date columns
df_clean = df_clean.pipe(standardize_dates, ['checkout_date', 'return_date'])
print("Dates standardized")

# Final before/after summary
print(f"\n✅ Cleaning complete! {len(df_raw)} → {len(df_clean)} rows")

StatementMeta(, 8c008e59-c68d-4cfd-97f6-0062a3cdd199, 19, Finished, Available, Finished)

Applying data cleaning pipeline...
After removing duplicates: 5000 rows
After handling missing values: 4227 rows
Dates standardized

✅ Cleaning complete! 5100 → 4227 rows


In [6]:
# Cell 5: Save as Delta table (SILVER layer)
# `spark` is not created in this cell; presumably it is the session that the
# notebook runtime provides — TODO confirm before running in another environment.

# Target table for the cleaned circulation data
table_name = "silver_circulation"

# Convert the cleaned pandas frame to a Spark DataFrame
df_spark = spark.createDataFrame(df_clean)

# Write it as a Delta table; "overwrite" replaces the table contents on every run
(
    df_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_name)
)

print(f"✅ Created Delta table: {table_name}")

StatementMeta(, 8c008e59-c68d-4cfd-97f6-0062a3cdd199, 20, Finished, Available, Finished)

✅ Created Delta table: silver_circulation


In [7]:
# Cell 6: Query the Delta table
# Summary counts over the table named by `table_name`: total rows plus the
# number of distinct member_id, isbn and branch_id values.
# table_name is interpolated directly into the SQL text; that is acceptable for
# a notebook-defined constant but should never be fed from untrusted input.
query = f"""
SELECT 
    COUNT(*) as total_transactions,
    COUNT(DISTINCT member_id) as unique_members,
    COUNT(DISTINCT isbn) as unique_books,
    COUNT(DISTINCT branch_id) as branches
FROM {table_name}
"""

# Run the query through Spark SQL and print the single-row result as a text table
result = spark.sql(query)
result.show()

print("✅ Silver layer ready for analysis!")

StatementMeta(, 8c008e59-c68d-4cfd-97f6-0062a3cdd199, 21, Finished, Available, Finished)

+------------------+--------------+------------+--------+
|total_transactions|unique_members|unique_books|branches|
+------------------+--------------+------------+--------+
|              4227|          4127|        4227|      30|
+------------------+--------------+------------+--------+

✅ Silver layer ready for analysis!
