In [2]:
import pandas as pd
import numpy as np
import findspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import udf
import io
import re
import distutils
import pyarrow
import us
# import reverse_geocoder as rg
# import country_converter as coco

## Constants

In [3]:
# --- Numeric precision limits -------------------------------------------------
FLOAT_DIGIT_LIMIT = 7
DOUBLE_DIGIT_LIMIT = 15

# --- Sampling -----------------------------------------------------------------
RATIO_SAMPLE = 1.0

# --- Input / output locations -------------------------------------------------
ZILLOW_PATH = 'zillow/json_zillow'
REALTOR_PATH = 'realtor/json_realtor'
ATTOM_SCHEMA_PATH = 'schema_template.xlsx'
OUTPUT_PATH = 'result_table.xlsx'
ZIP_FIPS_PATH = 'ZIP_COUNTY_122023.xlsx'

# --- Deduplication ------------------------------------------------------------
# Number of decimal places latitude/longitude are rounded to before the two
# sources are matched; per the original author's estimate, properties within
# about 80 meters of each other are then treated as the same property.
HAVERSINE_PRECISION = 3

# Reverse-geocoder setup, currently disabled:
# REVERSE_GEOCODER_PATH = 'rg_cities1000.csv'
# geo = rg.RGeocoder(mode=2, verbose=True, stream=io.StringIO(open(REVERSE_GEOCODER_PATH, encoding='utf-8').read()))

## Spark Initialization

In [4]:
# Locate the local Spark installation and make pyspark importable.
findspark.init()

# Build (or reuse) the session first and derive the context from it.
# Calling SparkContext() directly raises "Cannot run multiple SparkContexts at
# once" when this cell is re-run, and a context created before the builder
# runs keeps its default application name instead of "property_etl".
spark = (
    SparkSession.builder
    .appName("property_etl")
    .getOrCreate()
)
sc = spark.sparkContext

## Zillow Loading

In [5]:
# Explicit read schema for the Zillow JSON files. Only the fields used by the
# pipeline are listed; every field is nullable.

# Nested "address" object.
_zillow_address_schema = (
    StructType()
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("streetAddress", StringType(), True)
    .add("zipcode", StringType(), True)
)

# One entry of the "taxHistory" array.
_zillow_tax_entry_schema = (
    StructType()
    .add("time", LongType(), True)
    # Assuming taxPaid is an integer, use the appropriate type if different
    .add("taxPaid", IntegerType(), True)
    .add("value", LongType(), True)
)

# Nested "resoFacts" object.
_zillow_reso_facts_schema = (
    StructType()
    .add("yearBuilt", LongType(), True)
    .add("lotSize", StringType(), True)
    .add("hasAttachedGarage", BooleanType(), True)
    .add("hasCooling", BooleanType(), True)
    .add("hasHeating", BooleanType(), True)
    .add("hasPrivatePool", BooleanType(), True)
)

zillow_read_schema = (
    StructType()
    ## address cols
    .add("address", _zillow_address_schema, True)
    .add("countyFIPS", StringType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    ## tax history cols
    .add("taxHistory", ArrayType(_zillow_tax_entry_schema), True)
    ## description cols
    .add("homeType", StringType(), True)
    .add("homeStatus", StringType(), True)
    .add("description", StringType(), True)
    .add("datePosted", DateType(), True)
    .add("yearBuilt", LongType(), True)
    .add("price", LongType(), True)
    .add("livingAreaValue", StringType(), True)
    .add("livingAreaUnits", StringType(), True)
    ## utility cols
    .add("bathrooms", IntegerType(), True)
    .add("bedrooms", IntegerType(), True)
    ## resoFacts
    .add("resoFacts", _zillow_reso_facts_schema, True)
)


In [6]:
# Read the raw Zillow JSON documents (multi-line JSON) with the explicit schema.
df_z_raw = (
    spark.read
    .option("mergeSchema", "true")
    .option("multiLine", True)
    .schema(zillow_read_schema)
    .json(ZILLOW_PATH)
)

In [7]:
# Flatten the nested Zillow record into the common column layout shared with
# the Realtor data, then sort the columns alphabetically.
_zillow_projection = [
    ## address cols
    col('address.state').alias('state'),
    col('countyFIPS').alias('countyFIPS'),
    col('address.city').alias('city'),
    col('address.streetAddress').alias('streetAddress'),
    col('address.zipcode').alias('zipcode'),
    col('latitude').alias('latitude'),
    col('longitude').alias('longitude'),
    ## tax history cols
    col('taxHistory').alias('taxHistory'),
    ## description
    col('homeType').alias('homeType'),
    col('homeStatus').alias('homeStatus'),
    col('description').alias('textDescription'),
    col('datePosted').alias('datePosted'),
    # Top-level yearBuilt wins; resoFacts.yearBuilt is the fallback.
    ifnull(col('yearBuilt'), col('resoFacts.yearBuilt')).alias('yearBuilt'),
    # A price of 0 is treated as "no price".
    when(col('price') != lit(0), col('price')).otherwise(lit(None)).alias('price'),
    concat(col('livingAreaValue'), lit(' '), col('livingAreaUnits')).alias('livingArea'),
    # Strip thousands separators from the lot size text.
    regexp_replace(col('resoFacts.lotSize'), lit(','), lit('')).alias('lotSize'),
    ## utility features
    col('bathrooms').alias('bathrooms'),
    col('bedrooms').alias('bedrooms'),
    # Missing feature flags default to False.
    coalesce(col('resoFacts.hasAttachedGarage'), lit(False)).alias('hasGarage'),
    coalesce(col('resoFacts.hasCooling'), lit(False)).alias('hasCooling'),
    coalesce(col('resoFacts.hasHeating'), lit(False)).alias('hasHeating'),
    coalesce(col('resoFacts.hasPrivatePool'), lit(False)).alias('hasPrivatePool'),
]

df_z_filtered = df_z_raw.select(*_zillow_projection)
df_z_filtered = df_z_filtered.select(sorted(df_z_filtered.columns))

## Realtor Loading

In [8]:
# Explicit read schema for the Realtor JSON files, assembled bottom-up from the
# nested objects. Every field is nullable.

# data.location.address.coordinate
_realtor_coordinate_schema = (
    StructType()
    .add("lon", DoubleType(), True)
    .add("lat", DoubleType(), True)
)

# data.location.address
_realtor_address_schema = (
    StructType()
    .add("state_code", StringType(), True)
    .add("city", StringType(), True)
    .add("postal_code", StringType(), True)
    .add("coordinate", _realtor_coordinate_schema, True)
    .add("line", StringType(), True)
)

# data.location
_realtor_location_schema = (
    StructType()
    .add("address", _realtor_address_schema, True)
    .add("county", StructType().add("fips_code", StringType(), True), True)
)

# One entry of data.tax_history
_realtor_tax_entry_schema = (
    StructType()
    .add("assessment", StructType().add("total", StringType(), True), True)
    .add("tax", StringType(), True)
    .add("year", IntegerType(), True)
)

# data.description
_realtor_description_schema = (
    StructType()
    .add("cooling", BooleanType(), True)
    .add("sqft", StringType(), True)
    .add("beds", LongType(), True)
    .add("heating", BooleanType(), True)
    .add("lot_sqft", StringType(), True)
    .add("units", IntegerType(), True)
    .add("garage", BooleanType(), True)
    .add("pool", BooleanType(), True)
    .add("text", StringType(), True)
    .add("year_built", LongType(), True)
    .add("baths", LongType(), True)
    .add("type", StringType(), True)
)

# One entry of data.units[].photos
_realtor_unit_photo_schema = (
    StructType()
    .add("type", StringType(), True)
    .add("href", StringType(), True)
)

# data.units[].availability
_realtor_unit_availability_schema = (
    StructType()
    .add("date", StringType(), True)
    .add("available", StringType(), True)
)

# data.units[].description
_realtor_unit_description_schema = (
    StructType()
    .add("beds_min", StringType(), True)
    .add("baths_min", StringType(), True)
    .add("name", StringType(), True)
    .add("baths_half", IntegerType(), True)
    .add("baths_full_calc", StringType(), True)
    .add("baths", IntegerType(), True)
    .add("sqft", IntegerType(), True)
    .add("baths_max", StringType(), True)
    .add("beds_max", StringType(), True)
    .add("beds", IntegerType(), True)
    .add("baths_partial_calc", IntegerType(), True)
)

# One entry of data.units
_realtor_unit_schema = (
    StructType()
    .add("photos", ArrayType(_realtor_unit_photo_schema), True)
    .add("list_price", IntegerType(), True)
    .add("plan_id", StringType(), True)
    .add("availability", _realtor_unit_availability_schema, True)
    .add("description", _realtor_unit_description_schema, True)
)

# data
_realtor_data_schema = (
    StructType()
    ## address cols
    .add("location", _realtor_location_schema, True)
    ## tax history cols
    .add("tax_history", ArrayType(_realtor_tax_entry_schema), True)
    ## description cols
    .add("create_date", DateType(), True)
    .add("status", StringType(), True)
    .add("list_price", LongType(), True)
    .add("description", _realtor_description_schema, True)
    ## units
    .add("units", ArrayType(_realtor_unit_schema), True)
)

realtor_read_schema = StructType().add('data', _realtor_data_schema, True)

In [9]:
# Read the raw Realtor JSON documents (multi-line JSON) with the explicit schema.
df_r_raw = (
    spark.read
    .option("mergeSchema", "true")
    .option("multiLine", True)
    .schema(realtor_read_schema)
    .json(REALTOR_PATH)
)

In [10]:
# Realtor listings WITHOUT a `units` array: flatten the nested record into the
# common column layout, then sort the columns alphabetically.
_realtor_single_projection = [
    ## address cols
    col('data.location.address.state_code').alias('state'),
    col('data.location.county.fips_code').alias('countyFIPS'),
    col('data.location.address.city').alias('city'),
    col('data.location.address.line').alias('streetAddress'),
    col('data.location.address.postal_code').alias('zipcode'),
    col('data.location.address.coordinate.lat').alias('latitude'),
    col('data.location.address.coordinate.lon').alias('longitude'),
    ## tax cols
    col('data.tax_history').alias('taxHistory'),
    ## description
    col('data.description.type').alias('homeType'),
    col('data.status').alias('homeStatus'),
    col('data.description.text').alias('textDescription'),
    col('data.create_date').alias('datePosted'),
    col('data.description.year_built').alias('yearBuilt'),
    # A list price of 0 is treated as "no price".
    when(col('data.list_price') != lit(0), col('data.list_price')).otherwise(lit(None)).alias('price'),
    concat(col('data.description.sqft'), lit(' sqft')).alias('livingArea'),
    concat(col('data.description.lot_sqft'), lit(' sqft')).alias('lotSize'),
    ## utility
    col('data.description.baths').alias('bathrooms'),
    col('data.description.beds').alias('bedrooms'),
    # Missing feature flags default to False.
    coalesce(col('data.description.garage'), lit(False)).alias('hasGarage'),
    coalesce(col('data.description.cooling'), lit(False)).alias('hasCooling'),
    coalesce(col('data.description.heating'), lit(False)).alias('hasHeating'),
    coalesce(col('data.description.pool'), lit(False)).alias('hasPrivatePool'),
]

df_r_single = (
    df_r_raw
    .where(col('data.units').isNull())
    .select(*_realtor_single_projection)
)
df_r_single = df_r_single.select(sorted(df_r_single.columns))

In [11]:
def _sum_over_units(units, pick):
    """Sum one numeric field across the structs of the `units` array column.

    Args:
        units: Column holding an array of unit structs.
        pick: function mapping a single unit Column to the field to be summed.

    Returns:
        A bigint Column with the total, where missing field values count as 0.
        The result is NULL when the array is empty, which matches what the
        previous explode + groupBy + left-join implementation produced.
    """
    zero = lit(0).cast(LongType())
    total = aggregate(
        units,
        zero,
        lambda acc, unit: acc + coalesce(pick(unit).cast(LongType()), zero),
    )
    return when(size(units) > 0, total)


# Realtor listings WITH a `units` array (multi-unit properties).
#
# Per-listing totals are computed directly on each row's own array. The earlier
# version numbered rows with row_number() over Window.orderBy(lit(0)) and then
# joined the exploded/aggregated units back on that number. That ordering is
# not defined, so the two lazily evaluated branches were free to number rows
# differently and attach unit totals to the wrong listing.
_units = col('data.units')

df_r_agg = df_r_raw.where(_units.isNotNull()) \
                .select(
                        ## address cols
                        col('data').location.address.state_code.alias('state'),
                        col('data').location.county.fips_code.alias('countyFIPS'),
                        col('data').location.address.city.alias('city'),
                        col('data').location.address.line.alias('streetAddress'),
                        col('data').location.address.postal_code.alias('zipcode'),
                        col('data').location.address.coordinate.lat.alias('latitude'),
                        col('data').location.address.coordinate.lon.alias('longitude'),
                        ## tax cols
                        col('data').tax_history.alias('taxHistory'),
                        ## description
                        col('data').description.type.alias('homeType'),
                        col('data').status.alias('homeStatus'),
                        col('data').description.text.alias('textDescription'),
                        col('data').create_date.alias('datePosted'),
                        col('data').description.year_built.alias('yearBuilt'),
                        concat(col('data.description.lot_sqft'), lit(' sqft')).alias('lotSize'),
                        ## utility
                        coalesce(col('data').description.garage, lit(False)).alias('hasGarage'),
                        coalesce(col('data').description.cooling, lit(False)).alias('hasCooling'),
                        coalesce(col('data').description.heating, lit(False)).alias('hasHeating'),
                        coalesce(col('data').description.pool, lit(False)).alias('hasPrivatePool'),
                        ## totals over all units of the listing
                        _sum_over_units(_units, lambda unit: unit['description']['beds']).alias('bedrooms'),
                        _sum_over_units(_units, lambda unit: unit['description']['baths']).alias('bathrooms'),
                        _sum_over_units(_units, lambda unit: unit['list_price']).alias('price'),
                        concat(
                            _sum_over_units(_units, lambda unit: unit['description']['sqft']).cast(StringType()),
                            lit(' sqft')
                        ).alias('livingArea'),
                       )
# Same alphabetical column order as df_r_single so the two frames line up.
df_r_agg = df_r_agg.select(sorted(df_r_agg.columns))

In [12]:
# Stack single-unit and multi-unit Realtor listings.
# unionByName matches columns by name, so the result no longer depends on both
# inputs happening to share the same column order (plain union is positional
# and would silently mix up columns if the orders ever diverged).
df_r_filtered = df_r_single.unionByName(df_r_agg)

# Bedroom/bathroom counts are cast to integer, the type used by the Zillow schema.
df_r_filtered = df_r_filtered.withColumn('bedrooms', col('bedrooms').cast(IntegerType())) \
                        .withColumn('bathrooms', col('bathrooms').cast(IntegerType()))

# Sort by this frame's own columns rather than by another frame's column list.
df_r_filtered = df_r_filtered.select(sorted(df_r_filtered.columns))

## Combine and Deduplicate

In [13]:
r_cols = df_r_filtered.columns
z_cols = df_z_filtered.columns


def _with_rounded_coordinates(frame):
    """Return `frame` with latitude/longitude rounded to HAVERSINE_PRECISION
    decimals in two extra columns, `latitude_approx` and `longitude_approx`."""
    rounded_lat = round(col('latitude'), HAVERSINE_PRECISION)
    rounded_lon = round(col('longitude'), HAVERSINE_PRECISION)
    return (
        frame
        .withColumn('latitude_approx', rounded_lat)
        .withColumn('longitude_approx', rounded_lon)
    )


df_r_join = _with_rounded_coordinates(df_r_filtered)
df_z_join = _with_rounded_coordinates(df_z_filtered)

# Suffix every original Zillow column with "_2" so it cannot collide with the
# Realtor column of the same name after the join; the two *_approx join keys
# keep their names.
df_z_join = df_z_join.toDF(
    *[name + '_2' if name in z_cols else name for name in df_z_join.columns]
)

In [16]:
# ZIP -> county FIPS lookup table, read as strings so leading zeros survive.
# NOTE(review): if the crosswalk lists a ZIP under several counties, dict()
# keeps only the last row for that ZIP — confirm this is acceptable.
zip_fips_df = pd.read_excel(ZIP_FIPS_PATH, dtype={'ZIP': str, 'COUNTY': str})
zip_to_fips = dict(zip(zip_fips_df['ZIP'], zip_fips_df['COUNTY']))


@pandas_udf(StringType())
def get_fips_code(zip_codes: pd.Series) -> pd.Series:
    """Look up the county FIPS code for each ZIP code.

    Args:
        zip_codes: ZIP codes as strings (may contain nulls).

    Returns:
        A Series of FIPS code strings, with None where the ZIP is missing or
        not present in the lookup table.
    """
    matched = zip_codes.map(zip_to_fips)
    # Do NOT use .astype(str) here: it turns unmatched entries (NaN) into the
    # literal text 'nan', which would then be stored as a county FIPS code.
    return matched.astype(object).where(matched.notna(), None)


In [17]:
# Full outer join of the two sources on the rounded coordinates: a Realtor and
# a Zillow record falling into the same rounded cell are treated as one property.
df_combined = df_r_join.join(df_z_join, \
                                  how='outer', \
                                  on=['latitude_approx', 'longitude_approx'])

# Reshape the Zillow tax history entries into the Realtor layout
# (assessment.total / tax / year) so either array can be used downstream.
# `time` is divided by 1000 before from_unixtime, i.e. it is handled as epoch
# milliseconds.
df_combined = df_combined.withColumn("taxHistory_2", transform(
    col("taxHistory_2"),
    lambda x: struct(
        struct(x["value"].cast("string").alias("total")).alias("assessment"),
        x["taxPaid"].cast("string").alias("tax"),
        year(from_unixtime(x["time"] / 1000)).cast("integer").alias("year")
    )
))

# Values kept as-is after lower-casing; anything else is mapped to 'other'.
_KNOWN_HOME_STATUSES = ['for_rent', 'for_sale', 'sold']
_KNOWN_HOME_TYPES = [
    'apartment', 'multi_family', 'single_family', 'house', 'farm', 'mobile',
    'land', 'duplex_triplex', 'condo', 'lots/land', 'manufactured',
]

# propertyId is numbered in a column-based order instead of
# Window.orderBy(lit(0)). The latter has no defined order, so every
# re-evaluation of this lazy plan (each downstream action / export) could hand
# out different ids and break the keys between the fact and dimension tables.
_property_id_order = Window.orderBy(
    'latitude', 'longitude', 'streetAddress', 'zipcode', 'city', 'state',
    'price', 'yearBuilt', 'livingArea', 'lotSize',
)

# Realtor values win; Zillow ("_2") values fill the gaps.
df_combined = df_combined.select(
                                coalesce('bathrooms', 'bathrooms_2', lit(0)).alias('bathrooms'),
                                coalesce('bedrooms', 'bedrooms_2', lit(0)).alias('bedrooms'),
                                coalesce('city', 'city_2', lit(None)).alias('city'),
                                coalesce('countyFIPS', 'countyFIPS_2', lit(None)).alias('countyFIPS'),
                                coalesce('hasCooling', 'hasCooling_2', lit(False)).alias('hasCooling'),
                                coalesce('hasGarage', 'hasGarage_2', lit(False)).alias('hasGarage'),
                                coalesce('hasHeating', 'hasHeating_2', lit(False)).alias('hasHeating'),
                                coalesce('hasPrivatePool', 'hasPrivatePool_2', lit(False)).alias('hasPrivatePool'),
                                coalesce('homeStatus', 'homeStatus_2', lit('other')).alias('homeStatus'),
                                coalesce('homeType', 'homeType_2', lit('other')).alias('homeType'),
                                coalesce('latitude', 'latitude_2', lit(None)).alias('latitude'),
                                coalesce('livingArea', 'livingArea_2', lit(None)).alias('livingArea'),
                                coalesce('longitude', 'longitude_2', lit(None)).alias('longitude'),
                                coalesce('lotSize', 'lotSize_2', lit(None)).alias('lotSize'),
                                coalesce('price', 'price_2', lit(None)).alias('price'),
                                coalesce('state', 'state_2', lit(None)).alias('state'),
                                coalesce('streetAddress', 'streetAddress_2', lit(None)).alias('streetAddress'),
                                coalesce('textDescription', 'textDescription_2', lit(None)).alias('textDescription'),
                                coalesce('yearBuilt', 'yearBuilt_2', lit(None)).alias('yearBuilt'),
                                coalesce('zipcode', 'zipcode_2', lit(None)).alias('zipcode'),
                                when(col('taxHistory').isNotNull(), col('taxHistory')).otherwise(col('taxHistory_2')).alias('taxHistory')
                                ) \
                        .withColumn('homeStatus', lower(col('homeStatus'))) \
                        .withColumn('homeType', lower(col('homeType'))) \
                        .withColumn('homeStatus', when(
                            col('homeStatus').isin(_KNOWN_HOME_STATUSES),
                            col('homeStatus')
                        ).otherwise(lit('other'))) \
                        .withColumn('homeType', when(
                            col('homeType').isin(_KNOWN_HOME_TYPES),
                            col('homeType')
                        ).otherwise(lit('other'))) \
                        .withColumn('propertyId', row_number().over(_property_id_order)) \
                        .withColumn('countyFIPS', when(col('countyFIPS').isNotNull(), col('countyFIPS')).otherwise(get_fips_code(col('zipcode')))) \
                        .persist()
# persist() keeps the materialised rows (and therefore the assigned ids) for
# the later cells, which also covers rows that tie on every ordering column.

## Create Fact & Subdimensional & Main Dimensional Tables

In [18]:
# Columns that make up each sub-dimension table.
address_cols = [
    'state',
    'countyFIPS',
    'city',
    'streetAddress',
    'zipcode',
    'latitude',
    'longitude',
]

description_cols = [
    'homeType',
    'homeStatus',
    'textDescription',
    'yearBuilt',
    'price',
    'livingArea',
    'lotSize',
]

utility_cols = [
    'bathrooms',
    'bedrooms',
    'hasGarage',
    'hasCooling',
    'hasHeating',
    'hasPrivatePool',
]

In [19]:
### Sub dimension address
df_address = df_combined.select(address_cols).distinct() \
                            .withColumn('addressId', row_number().over(Window.orderBy(lit(0)))) \

### Sub dimension description            
df_description = df_combined.select(description_cols).distinct() \
                            .withColumn('descriptionId', row_number().over(Window.orderBy(lit(0)))) \

### Sub dimension utility
df_utility = df_combined.select(utility_cols).distinct() \
                            .withColumn('utilityId',row_number().over(Window.orderBy(lit(0)))) 

In [20]:
### combined condition of eqNullSafe for setting the main dimensional table
combined_cond_address = lit(True)
for element in address_cols:
    combined_cond_address &= (df_combined[element].eqNullSafe(df_address[element]))

combined_cond_description = lit(True)
for element in description_cols:
    combined_cond_description &= (df_combined[element].eqNullSafe(df_description[element]))

combined_cond_utility = lit(True)
for element in utility_cols:
    combined_cond_utility &= (df_combined[element].eqNullSafe(df_utility[element]))

In [21]:
# Main dimensional table: attach the three sub-dimension keys to every property
# and keep only the key columns.
_with_address = df_combined.join(df_address, on=combined_cond_address, how='left')
_with_description = _with_address.join(df_description, on=combined_cond_description, how='left')
_with_utility = _with_description.join(df_utility, on=combined_cond_utility, how='left')

df_main = _with_utility.select('propertyID', 'addressId', 'descriptionID', 'utilityID')

In [22]:
# Fact table: one row per (property, tax record).
# explode() yields one row per array entry; properties whose taxHistory is
# NULL or empty produce no rows.
df_tax_exploded = df_combined.select('propertyId', 'taxHistory') \
                                .withColumn('taxRecords', explode(col('taxHistory'))) \
                                .drop('taxHistory')

# taxHistoryId follows property (ascending) and tax year (descending), with
# taxPaid/value as tie-breakers for years that appear more than once. The
# ordering lives in the window itself: a preceding DataFrame.orderBy is not
# guaranteed to survive a window that orders by a constant.
_tax_history_id_order = Window.orderBy(
    asc('propertyId'), desc('taxYear'), desc('taxPaid'), desc('value')
)

df_taxHistory = df_tax_exploded.select(col('propertyId'),
                                           col('taxRecords').year.alias('taxYear'),
                                           col('taxRecords').tax.cast(LongType()).alias('taxPaid'),
                                           col('taxRecords').assessment.total.cast(LongType()).alias('value')
                                          ).distinct() \
                                    .withColumn('taxPaid', coalesce('taxPaid', lit(0))) \
                                    .withColumn('taxHistoryId', row_number().over(_tax_history_id_order))

In [23]:
# Preview the main dimensional table (property -> sub-dimension keys).
df_main.show()

+----------+---------+-------------+---------+
|propertyID|addressId|descriptionID|utilityID|
+----------+---------+-------------+---------+
|         1|        4|            5|        1|
|         2|       13|           13|        5|
|         3|        2|           10|        9|
|         4|       12|           14|        8|
|         5|        1|           16|        7|
|         6|        8|            3|       10|
|         7|        9|            6|        3|
|         8|       11|            7|       13|
|         9|       15|            9|        2|
|        10|       14|           11|       11|
|        11|       17|            8|       10|
|        12|        7|            1|        4|
|        13|        3|            2|       15|
|        14|        6|           17|       12|
|        15|       10|            4|       14|
|        16|        5|           15|        6|
|        17|       16|           12|        2|
+----------+---------+-------------+---------+



In [24]:
# Preview the address sub-dimension.
df_address.show()

+-----+----------+------+--------------------+-------+---------+----------+---------+
|state|countyFIPS|  city|       streetAddress|zipcode| latitude| longitude|addressId|
+-----+----------+------+--------------------+-------+---------+----------+---------+
|   TX|     48453|Austin|8715 W Highway 71...|  78735|30.250511| -97.89649|        1|
|   TX|     48453|Austin|6508 Steep Cactus...|  78735|30.244486| -97.87705|        2|
|   TX|     48453|Austin|2601 Scofield Rid...|  78727| 30.42851|-97.697044|        3|
|   TX|     48453|Austin|   7603 Woodstone Cv|  78749|30.206296|-97.835752|        4|
|   TX|     48491|Austin| 10800 Lakeline Blvd|  78717|30.482453| -97.79353|        5|
|   TX|     48453|Austin|2601 Scofield Rid...|  78727|30.430732|-97.697007|        6|
|   TX|     48453|Austin|9906 Hundred Oaks...|  78750|30.421557|-97.800574|        7|
|   TX|     48453|Austin|403 E 32nd St Unit B|  78705|30.294802| -97.73362|        8|
|   TX|     48453|Austin|  6505 Cat Creek Trl|  78731|

In [25]:
# Preview the description sub-dimension.
df_description.show()

+--------------+----------+--------------------+---------+---------+-----------+-----------+-------------+
|      homeType|homeStatus|     textDescription|yearBuilt|    price| livingArea|    lotSize|descriptionId|
+--------------+----------+--------------------+---------+---------+-----------+-----------+-------------+
|duplex_triplex|  for_rent|Rare find of a 3 ...|     1983|     1725|  1914 sqft| 25700 sqft|            1|
|     apartment|     other|Great location: c...|     1999|117594538|   711 sqft|26.37 Acres|            2|
| single_family|  for_rent|Available NOW! LE...|     1934|     1600|   630 sqft|  7492 sqft|            3|
|     apartment|  for_rent|Better-than-new a...|     2015|     7925|  4344 sqft|16.96 Acres|            4|
|  multi_family|      sold|This is a 2070 sq...|     1983|   542231|  2070 sqft|  9091 sqft|            5|
|duplex_triplex|  for_rent|West side duplex ...|     1983|     1995|  1178 sqft| 17454 sqft|            6|
|     apartment|  for_rent|Welcome to

In [26]:
# Preview the utility sub-dimension.
df_utility.show()

+---------+--------+---------+----------+----------+--------------+---------+
|bathrooms|bedrooms|hasGarage|hasCooling|hasHeating|hasPrivatePool|utilityId|
+---------+--------+---------+----------+----------+--------------+---------+
|        2|       0|    false|     false|     false|         false|        1|
|        0|       0|    false|     false|     false|         false|        2|
|        2|       2|    false|     false|     false|         false|        3|
|        2|       3|    false|     false|     false|         false|        4|
|        0|       9|    false|     false|     false|         false|        5|
|       11|      11|    false|     false|     false|         false|        6|
|        2|       2|    false|      true|      true|         false|        7|
|        1|       1|    false|     false|     false|         false|        8|
|        0|       3|    false|      true|     false|         false|        9|
|        1|       2|    false|     false|     false|         fal

In [27]:
# Preview the tax history fact table.
df_taxHistory.show()

+----------+-------+-------+------+------------+
|propertyId|taxYear|taxPaid| value|taxHistoryId|
+----------+-------+-------+------+------------+
|         1|   2023|  11392|456311|           1|
|         1|   2022|  11671|377301|           2|
|         1|   2021|   8401|316422|           3|
|         1|   2020|   6786|316422|           4|
|         1|   2019|   6950|330851|           5|
|         1|   2018|   7324|205371|           6|
|         1|   2017|   4580|205371|           7|
|         1|   2016|   4715|227736|           8|
|         1|   2014|   5060|190834|           9|
|         1|   2014|   4889|205442|          10|
|         1|   2013|   4616|190834|          11|
|         1|   2012|   4546|190834|          12|
|         1|   2011|   4421|190834|          13|
|         1|   2010|   4210|190834|          14|
|         1|   2009|   4157|185787|          15|
|         1|   2008|   4000|184946|          16|
|         7|   2023|  19367|946104|          17|
|         7|   2022|

## Save as Excel Files

In a production setting these tables would be inserted or upserted into the data warehouse; for this case study it is more convenient to save them as sheets of a single Excel file.

In [29]:
# Sheet name -> Spark DataFrame, in the order the sheets are written.
_sheets = {
    'factTaxHistory': df_taxHistory,
    'dimProperty': df_main,
    'dimAddress': df_address,
    'dimDescription': df_description,
    'dimUtility': df_utility,
}

# Collect each table to pandas and write it to its own sheet of the workbook.
with pd.ExcelWriter(OUTPUT_PATH) as writer:
    for _sheet_name, _frame in _sheets.items():
        _frame.toPandas().to_excel(writer, sheet_name=_sheet_name)