# Transform and cleanse data for **Silver** table


In [1]:
BRONZE_DATA_PATH: str = "Files/bronze/nishiodens/japan-real-estate-transaction-prices/trade_prices"

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 3, Finished, Available, Finished)

In [2]:
print(BRONZE_DATA_PATH)

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 4, Finished, Available, Finished)

Files/bronze/nishiodens/japan-real-estate-transaction-prices/trade_prices


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 5, Finished, Available, Finished)

In [4]:
from pyspark.sql.types import *

bronze_schema = StructType([
    StructField("No", IntegerType(), True),
    StructField("Type", StringType(), True),
    StructField("Region", StringType(), True),
    StructField("MunicipalityCode", StringType(), True),
    StructField("Prefecture", StringType(), True),
    StructField("Municipality", StringType(), True),
    StructField("DistrictName", StringType(), True),
    StructField("NearestStation", StringType(), True),
    StructField("TimeToNearestStation", StringType(), True),
    StructField("MinTimeToNearestStation", IntegerType(), True),
    StructField("MaxTimeToNearestStation", IntegerType(), True),
    StructField("TradePrice", LongType(), True),
    StructField("FloorPlan", StringType(), True),
    StructField("Area", DoubleType(), True),
    StructField("AreaIsGreaterFlag", BooleanType(), True),
    StructField("UnitPrice", LongType(), True),
    StructField("PricePerTsubo", DoubleType(), True),
    StructField("LandShape", StringType(), True),
    StructField("Frontage", DoubleType(), True),
    StructField("FrontageIsGreaterFlag", BooleanType(), True),
    StructField("TotalFloorArea", DoubleType(), True),
    StructField("TotalFloorAreaIsGreaterFlag", BooleanType(), True),
    StructField("BuildingYear", IntegerType(), True),
    StructField("PrewarBuilding", BooleanType(), True),
    StructField("Structure", StringType(), True),
    StructField("Use", StringType(), True),
    StructField("Purpose", StringType(), True),
    StructField("Direction", StringType(), True),
    StructField("Classification", StringType(), True),
    StructField("Breadth", DoubleType(), True),
    StructField("CityPlanning", StringType(), True),
    StructField("CoverageRatio", DoubleType(), True),
    StructField("FloorAreaRatio", DoubleType(), True),
    StructField("Period", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Quarter", StringType(), True),
    StructField("Renovation", StringType(), True),
    StructField("Remarks", StringType(), True)
])


StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 6, Finished, Available, Finished)

In [5]:
df_bronze = spark.read.format("csv").options(header=True, schema=bronze_schema).load(f"{BRONZE_DATA_PATH}/*.csv")

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 7, Finished, Available, Finished)

In [8]:
display(df_bronze.limit(3))

display(df_bronze.count())

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b9329cff-1bda-4b02-9245-6810aeaf315e)

3906518

In [9]:
df_bronze.printSchema()

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 11, Finished, Available, Finished)

root
 |-- No: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- MunicipalityCode: string (nullable = true)
 |-- Prefecture: string (nullable = true)
 |-- Municipality: string (nullable = true)
 |-- DistrictName: string (nullable = true)
 |-- NearestStation: string (nullable = true)
 |-- TimeToNearestStation: string (nullable = true)
 |-- MinTimeToNearestStation: string (nullable = true)
 |-- MaxTimeToNearestStation: string (nullable = true)
 |-- TradePrice: string (nullable = true)
 |-- FloorPlan: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- AreaIsGreaterFlag: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- PricePerTsubo: string (nullable = true)
 |-- LandShape: string (nullable = true)
 |-- Frontage: string (nullable = true)
 |-- FrontageIsGreaterFlag: string (nullable = true)
 |-- TotalFloorArea: string (nullable = true)
 |-- TotalFloorAreaIsGreaterFlag: string (nullable = true)
 |-- Bui

## Create stats table for EDA

In [10]:
from pyspark.sql.functions import col

total_rows = df_bronze.count()
print(f"The bronze df has {total_rows} rows in total.")


StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 12, Finished, Available, Finished)

The bronze df has 3906518 rows in total.


In [15]:
from pyspark.sql.functions import col, when, isnan, isnull

def calc_stats(df):
    """
    This function is used to summarize useful statistical columns for data in a spark df
    """
    stats_list = []
    for col_name in df.columns:
        
        empty_string_count = df.filter(col(col_name) == "").count()
        null_count = df.filter(col(col_name).isNull()).count()

        missing_count = empty_string_count + null_count
        missing_percentage = (missing_count / total_rows) * 100

        unique_count = df.select(col_name).distinct().count()
        
        data_type = dict(df.dtypes)[col_name]
        

        stats_list.append((
            col_name,
            data_type,
            missing_count,
            missing_percentage,
            null_count,
            empty_string_count,
            unique_count
        ))

    # Create DataFrame with comprehensive statistics
    stats_df = spark.createDataFrame(stats_list, [
            "col_name",
            "data_type",
            "missing_count",
            "missing_percentage",
            "null_count",
            "empty_string_count",
            "unique_count"
    ])
    return stats_df

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 17, Finished, Available, Finished)

In [16]:
bronze_stats_df = calc_stats(df_bronze)

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 18, Finished, Available, Finished)

In [17]:
from pyspark.sql.functions import desc
display(bronze_stats_df.orderBy(desc("missing_percentage")))

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 19, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a1d2b0f8-8f2f-47ed-aa78-3c28452ab8d2)

In [35]:
bronze_stats_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("bronze_profile")

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 37, Finished, Available, Finished)

In [26]:
columns_to_drop = []
missing_data = bronze_stats_df.select(col("col_name"), col("missing_percentage"))

print("Column with high missing value percentage:")
columns_to_drop = missing_data.filter(col("missing_percentage") >= 60.0)
display(columns_to_drop.orderBy(desc("missing_percentage")))

# display(columns_to_drop.select("col_name"))

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 28, Finished, Available, Finished)

Column with high missing value percentage:


SynapseWidget(Synapse.DataFrame, 97047189-e4dc-4050-bb7c-7343c5fcdbef)

In [31]:

df_dropped_missing = df_bronze.drop(*[row.col_name for row in columns_to_drop.select("col_name").collect()])
display(df_dropped_missing.limit(2))

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 33, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4247212a-9643-43a0-87bf-dd77fded967e)

## Manually drop some unused columns

In [32]:
df_dropped_missing.printSchema()

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 34, Finished, Available, Finished)

root
 |-- No: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- MunicipalityCode: string (nullable = true)
 |-- Prefecture: string (nullable = true)
 |-- Municipality: string (nullable = true)
 |-- DistrictName: string (nullable = true)
 |-- NearestStation: string (nullable = true)
 |-- TimeToNearestStation: string (nullable = true)
 |-- MinTimeToNearestStation: string (nullable = true)
 |-- MaxTimeToNearestStation: string (nullable = true)
 |-- TradePrice: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- AreaIsGreaterFlag: string (nullable = true)
 |-- LandShape: string (nullable = true)
 |-- Frontage: string (nullable = true)
 |-- FrontageIsGreaterFlag: string (nullable = true)
 |-- TotalFloorAreaIsGreaterFlag: string (nullable = true)
 |-- BuildingYear: string (nullable = true)
 |-- PrewarBuilding: string (nullable = true)
 |-- Structure: string (nullable = true)
 |-- Use: string (nullable = true)
 |-- Direction:

In [36]:
columns_not_included = [
    "No",
    "Region",
    "MunicipalityCode",
    "DistrictName",
    "NearestStation",
    "TimeToNearestStation",
    "MinTimeToNearestStation",
    "MaxTimeToNearestStation",
    "Area",
    "AreaIsGreaterFlag",
    "Frontage",
    "FrontageIsGreaterFlag",
    "TotalFloorAreaIsGreaterFlag",
    "PrewarBuilding",
    "Breadth",
    "CoverageRatio",
    "FloorAreaRatio",
    "Period"
]

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 38, Finished, Available, Finished)

In [37]:
df_distilled = df_dropped_missing.drop(*columns_not_included)
display(df_distilled.orderBy(["Year", "Quarter"], ascending=[False, False]))

StatementMeta(, 6f4bcf72-ab39-4697-a02b-4836d8a46c43, 39, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 307eee8e-7c51-4f60-b037-1aa5037b76bb)