# Generate **dimension tables in Gold layer** for Japan Real Estate Dataset

In [2]:
config_path: str = "/Files/config/gold_dim.json"

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 4, Finished, Available, Finished)

In [1]:
import json
import hashlib
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import logging


StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 3, Finished, Available, Finished)

In [3]:

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 5, Finished, Available, Finished)

In [4]:
try:
    config_path = "/lakehouse/default/" + config_path if config_path.find("lakehouse/default") < 0 else config_path
    with open(config_path, 'r') as file:
        config = json.load(file)
    logger.info("Configuration loaded successfully")
except Exception as e:
    logger.error(f"Failed to load configuration: {str(e)}")
    raise

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 6, Finished, Available, Finished)

INFO:__main__:Configuration loaded successfully


In [5]:
source_lakehouse = config["Gold_Dim_Configuration"]["SourceLakehouse"]
silver_table = config["Gold_Dim_Configuration"]["SilverTableName"]
# tables_config = json.dumps(config["Tables"], indent = 4)
tables_config = config["Tables"]


print(f"Source Lakehouse: \n {source_lakehouse}")
print(f"Silver Table Name: \n {silver_table}")
print(f"Table Configuration Guide: \n {tables_config}")

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 7, Finished, Available, Finished)

Source Lakehouse: 
 japan_real_estate_lh
Silver Table Name: 
 silver_data
Table Configuration Guide: 
 [{'TableName': 'dim_building', 'TableType': 'Dimension', 'ColumnDefinitions': 'BuildingSK VARCHAR(255), Structure VARCHAR(255), Direction VARCHAR(255), Renovation VARCHAR(255)', 'DataColumns': 'Structure, Direction, Renovation', 'KeyColumnName': 'BuildingSK'}, {'TableName': 'dim_location', 'TableType': 'Dimension', 'ColumnDefinitions': 'LocationSK VARCHAR(255), Prefecture VARCHAR(255), Municipality VARCHAR(255), CityPlanning VARCHAR(255), Classification VARCHAR(255)', 'DataColumns': 'Prefecture, Municipality, CityPlanning, Classification', 'KeyColumnName': 'LocationSK'}, {'TableName': 'dim_land', 'TableType': 'Dimension', 'ColumnDefinitions': 'LandSK VARCHAR(255), Type VARCHAR(255), LandShape VARCHAR(255)', 'DataColumns': 'Type, LandShape', 'KeyColumnName': 'LandSK'}]


In [7]:
print(type(tables_config))

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 9, Finished, Available, Finished)

<class 'list'>


In [8]:
print(json.dumps(tables_config, indent = 4))

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 10, Finished, Available, Finished)

[
    {
        "TableName": "dim_building",
        "TableType": "Dimension",
        "ColumnDefinitions": "BuildingSK VARCHAR(255), Structure VARCHAR(255), Direction VARCHAR(255), Renovation VARCHAR(255)",
        "DataColumns": "Structure, Direction, Renovation",
        "KeyColumnName": "BuildingSK"
    },
    {
        "TableName": "dim_location",
        "TableType": "Dimension",
        "ColumnDefinitions": "LocationSK VARCHAR(255), Prefecture VARCHAR(255), Municipality VARCHAR(255), CityPlanning VARCHAR(255), Classification VARCHAR(255)",
        "DataColumns": "Prefecture, Municipality, CityPlanning, Classification",
        "KeyColumnName": "LocationSK"
    },
    {
        "TableName": "dim_land",
        "TableType": "Dimension",
        "ColumnDefinitions": "LandSK VARCHAR(255), Type VARCHAR(255), LandShape VARCHAR(255)",
        "DataColumns": "Type, LandShape",
        "KeyColumnName": "LandSK"
    }
]


## load silver table

In [9]:
from pyspark.sql import SparkSession


spark = SparkSession.builder.appName("GenerateDimTables").getOrCreate()


StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 11, Finished, Available, Finished)

In [10]:
silver_df = spark.read.format("delta").options(header=True, inferSchema=True).load("Tables/dbo/silver_data")

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 12, Finished, Available, Finished)

In [11]:
silver_df.printSchema()

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 13, Finished, Available, Finished)

root
 |-- Type: string (nullable = true)
 |-- Prefecture: string (nullable = true)
 |-- Municipality: string (nullable = true)
 |-- TradePrice: double (nullable = true)
 |-- LandShape: string (nullable = false)
 |-- TotalFloorArea: double (nullable = true)
 |-- BuildingYear: integer (nullable = true)
 |-- Structure: string (nullable = false)
 |-- Use: string (nullable = false)
 |-- Direction: string (nullable = false)
 |-- Classification: string (nullable = false)
 |-- CityPlanning: string (nullable = false)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Renovation: string (nullable = false)
 |-- ID: string (nullable = true)



In [12]:
display(silver_df.limit(3))

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a3b0be9c-9b1c-4b46-88e6-ed40602d5e05)

In [13]:
# from pyspark.sql.functions import col, split, explode, trim, collect_set
# from pyspark.sql.types import StringType

# unique_uses = (df_fillna_cat
#                .select(explode(split(col("Use"), ",")).alias("individual_use"))
#                .select(trim(col("individual_use")).alias("use_trimmed"))
#             #    .filter(col("use_trimmed") != "")
#                .select(collect_set("use_trimmed").alias("unique_uses"))
#                .collect()[0]["unique_uses"])

StatementMeta(, 8e2318db-aef5-4365-86fb-e852c3b63fad, 15, Finished, Available, Finished)

## Execute column validation before creating dim tables

In [20]:
logging.info(f"Starting column validation for '{silver_table}' in '{source_lakehouse}'...")

silver_df_columns = [col for col in silver_df.columns]

logging.info(f"Actual columns in {silver_table}: {silver_df_columns} \n length: {len(silver_df_columns)}")

config_column_list = []
for dim_table_config in tables_config:
    data_columns = dim_table_config["DataColumns"].split(",")
    config_column_list.extend(data_columns)

logging.info(f"Columns in config file: {config_column_list} \n length: {len(config_column_list)}")



StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 22, Finished, Available, Finished)

INFO:root:Starting column validation for 'silver_data' in 'japan_real_estate_lh'...
INFO:root:Actual columns in silver_data: ['Type', 'Prefecture', 'Municipality', 'TradePrice', 'LandShape', 'TotalFloorArea', 'BuildingYear', 'Structure', 'Use', 'Direction', 'Classification', 'CityPlanning', 'Year', 'Quarter', 'Renovation', 'ID'] 
 length: 16
INFO:root:Columns in config file: ['Structure', ' Direction', ' Renovation', 'Prefecture', ' Municipality', ' CityPlanning', ' Classification', 'Type', ' LandShape'] 
 length: 9


## Dim tables generator

In [25]:

from concurrent.futures import ThreadPoolExecutor, as_completed
from pyspark.sql.functions import col, monotonically_increasing_id, row_number, concat_ws, lit
from pyspark.sql.window import Window
from datetime import datetime
import logging


class DimTableGenerator:
    def __init__(self, spark_session, tables_config):
        self.spark = spark_session
        self.config = tables_config
        self.source_lakehouse = self.config["Gold_Dim_Configuration"]["SourceLakehouse"]
        self.silver_table = self.config["Gold_Dim_Configuration"]["SilverTableName"]
        self.silver_df = self.spark.read.format("delta").options(header=True, inferSchema=True).load(f"Tables/dbo/{self.silver_table}")
        
    def generate_monotonic_id(self, df, key_column_name, prefix=""):
        window_spec = Window.orderBy(monotonically_increasing_id())
        
        df_with_id = df.withColumn(
            "temp_row_num", 
            row_number().over(window_spec)
        )
        
        # add prefix if spec, example "land_1"
        if prefix:
            df_with_id = df_with_id.withColumn(
                key_column_name,
                concat_ws("_", lit(prefix), col("temp_row_num").cast("string"))
            )
        else:
            df_with_id = df_with_id.withColumn(
                key_column_name,
                col("temp_row_num").cast("string")
            )
        
        return df_with_id.drop("temp_row_num")
    
    def create_dimension_table(self, table_info):
        """Create a single dimension table"""
        try:
            table_name = table_info["TableName"]
            data_columns = [col.strip() for col in table_info["DataColumns"].split(",")]
            key_column = table_info["KeyColumnName"]
            
            logger.info(f"Starting creation of {table_name}")

            # Get distinct combinations of the dimension attributes
            dim_df = self.silver_df.select(*data_columns).distinct()
            
            dim_df_with_id = self.generate_monotonic_id(
                dim_df, 
                key_column, 
                prefix=table_name.replace("dim_", "").upper()
            )
            
            columns_ordered = [key_column] + data_columns
            dim_df_final = dim_df_with_id.select(*columns_ordered)
            
            dim_df_final.write \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .saveAsTable(table_name)
            
            row_count = dim_df_final.count()
            
            logger.info(f"Successfully created {table_name} with {row_count} rows")
            
            return {
                "table_name": table_name,
                "status": "success",
                "row_count": row_count,
                "columns": columns_ordered
            }
            
        except Exception as e:
            logger.error(f"Error creating {table_name}: {str(e)}")
            return {
                "table_name": table_name,
                "status": "error",
                "error_message": str(e)
            }
    
    def create_all_dimension_tables_concurrent(self, max_workers=3):
        tables_to_create = self.config["Tables"]
        results = []
        
        
        logger.info(f"Starting concurrent creation of {len(tables_to_create)} dimension tables")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # submit concurrent tasks
            future_to_table = {
                executor.submit(self.create_dimension_table, table_info): table_info["TableName"]
                for table_info in tables_to_create
            }
            
            # Process completed tasks
            for future in as_completed(future_to_table):
                table_name = future_to_table[future]
                try:
                    result = future.result()
                    results.append(result)
                    
                    if result["status"] == "success":
                        logger.info(f"SUCCESS: {table_name} completed")
                    else:
                        logger.error(f"FAILURE: {table_name} failed with the below error: \n {result['error_message']}")
                        
                except Exception as e:
                    logger.error(f"FAILURE: {table_name} failed with exception: \n {str(e)}")
                    results.append({
                        "table_name": table_name,
                        "status": "error",
                        "error_message": str(e)
                    })
        
        return results
    
    def print_summary(self, results):
        successful = [r for r in results if r["status"] == "success"]
        failed = [r for r in results if r["status"] == "error"]
        
        print("\n" + "="*60)
        print("DIMENSION TABLE CREATION SUMMARY")
        print("="*60)
        print(f"Total tables: {len(results)}")
        print(f"Successful: {len(successful)}")
        print(f"Failed: {len(failed)}")
        
        if successful:
            print("\n SUCCESSFUL TABLES:")
            for result in successful:
                print(f"  - {result['table_name']}: {result['row_count']} rows")
        
        if failed:
            print("\n FAILED TABLES:")
            for result in failed:
                print(f"  - {result['table_name']}: {result['error_message']}")
        
        print("="*60)

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 27, Finished, Available, Finished)

In [26]:
dim_generator = DimTableGenerator(spark, config)

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 28, Finished, Available, Finished)

In [27]:
num_workers = len(tables_config)
print(num_workers)

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 29, Finished, Available, Finished)

3


In [28]:
results = dim_generator.create_all_dimension_tables_concurrent(max_workers=num_workers)

StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 30, Finished, Available, Finished)

INFO:__main__:Starting concurrent creation of 3 dimension tables
INFO:__main__:Starting creation of dim_building
INFO:__main__:Starting creation of dim_location
INFO:__main__:Starting creation of dim_land
INFO:__main__:Successfully created dim_building with 278 rows
INFO:__main__:SUCCESS: dim_building completed
INFO:__main__:Successfully created dim_land with 23 rows
INFO:__main__:SUCCESS: dim_land completed
INFO:__main__:Successfully created dim_location with 78413 rows
INFO:__main__:SUCCESS: dim_location completed


In [29]:
dim_generator.print_summary(results)


StatementMeta(, b675733d-6f45-45ea-ae46-ee025fb85f4a, 31, Finished, Available, Finished)


DIMENSION TABLE CREATION SUMMARY
Total tables: 3
Successful: 3
Failed: 0

 SUCCESSFUL TABLES:
  - dim_building: 278 rows
  - dim_land: 23 rows
  - dim_location: 78413 rows
