In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, StringType
import requests
import time
from pyspark.sql.functions import when, col, isnull
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, current_timestamp, lit, col
from pyspark.sql import functions as F
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import *

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 14, Finished, Available)

Bronze - Silver: SCD Type 2.

The tables below are not SCD Type 2 tables, so skip the SCD handling and just load them to Silver.

In [2]:
def get_lat_long(city, country, retries=3, delay=2, timeout=10):
    """Geocode a city/country pair with the OpenStreetMap Nominatim search API.

    Args:
        city: City name. ``None`` or blank values are left out of the query.
        country: Country name. ``None`` or blank values are left out of the query.
        retries: Maximum number of HTTP attempts.
        delay: Seconds to sleep between attempts.
        timeout: Seconds to wait for each HTTP response before giving up on
            that attempt.

    Returns:
        A ``(latitude, longitude)`` tuple of floats taken from the first search
        hit, or ``(None, None)`` when there is nothing to query, the search has
        no hits, the response cannot be parsed, the server answers with a
        non-retryable status, or every attempt fails.
    """
    # Drop missing parts so a null column value is not sent as the text "None".
    parts = [str(part).strip() for part in (city, country) if part is not None]
    parts = [part for part in parts if part]
    if not parts:
        return None, None

    url = "https://nominatim.openstreetmap.org/search"
    # Passing the query through `params` lets requests URL-encode spaces, '&',
    # '#', and non-ASCII characters that would corrupt a hand-built URL.
    params = {"q": ", ".join(parts), "format": "json"}
    # The client is identified by the User-Agent *header*; a query-string
    # parameter with that name has no effect.
    headers = {"User-Agent": "MyApplication"}
    # Statuses treated as transient (blocked / rate limited / unavailable).
    retryable_statuses = (403, 429, 503)

    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts are retried instead of propagating.
            failure = f"Request failed ({exc})"
        else:
            status = response.status_code
            if status == 200:
                try:
                    data = response.json()
                    if not data:
                        return None, None  # No results found
                    return float(data[0]["lat"]), float(data[0]["lon"])
                except (ValueError, KeyError, IndexError, TypeError) as exc:
                    # Body was not JSON, or not the expected list of hits.
                    print(f"Unexpected response format: {exc}")
                    return None, None
            if status not in retryable_statuses:
                print(f"Error: {status}")
                return None, None
            failure = f"HTTP {status} encountered"

        if attempt + 1 < retries:
            print(f"{failure}. Attempt {attempt + 1} of {retries}. Retrying in {delay} seconds...")
            time.sleep(delay)
        else:
            # No point sleeping after the final attempt.
            print(f"{failure}. Attempt {attempt + 1} of {retries}.")

    print("Failed to retrieve coordinates after several attempts.")
    return None, None

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 4, Finished, Available)

In [3]:
# Smoke test: geocode one well-known city and report the outcome.
city, country = "Ho Chi Minh City", "Vietnam"
latitude, longitude = get_lat_long(city, country)

if latitude is None or longitude is None:
    print("Could not retrieve coordinates.")
else:
    print(f"Latitude: {latitude}, Longitude: {longitude}")

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 5, Finished, Available)

Latitude: 10.7763897, Longitude: 106.7011391


## **Customer Region**

In [4]:
# Read the full Silver customer-region table and show its schema.
customer_region = spark.table("LTT_SilverLakehouse.customer_region")
customer_region.printSchema()

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 6, Finished, Available)

root
 |-- concat_customer_region: string (nullable = true)
 |-- customer_country: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



In [5]:
# Preview the first five rows (head(n) delegates to take(n)).
display(customer_region.take(5))

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1e4b991c-cfb0-4649-b736-9fc1f7f2bf05)

In [16]:
# Build the customer-region dimension with an integer surrogate key.
#
# monotonically_increasing_id() stores the partition index in the upper bits of
# a 64-bit value, so casting it to int wraps (or fails under ANSI mode) as soon
# as the data spans more than one partition, producing duplicate keys.
# row_number() over a single ordered window yields gap-free 1..N keys that
# always fit an int.
# NOTE(review): ordering by concat_customer_region assumes it is the natural key
# of this table -- confirm. An un-partitioned window pulls all rows into one
# partition, which is presumably fine for a small dimension table.
customer_region_table = (
    customer_region
    .withColumn(
        "customer_region_key",
        F.row_number().over(Window.orderBy("concat_customer_region")).cast("int"),
    )
    .select(
        "customer_region_key",
        "concat_customer_region",
        "customer_country",
        "customer_state",
        "customer_city",
        "customer_street",
        "longitude",
        "latitude",
    )
)

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 18, Finished, Available)

In [17]:
# Derive store_table from customer_region_table by renaming its columns;
# columns not listed in the map keep their names.
store_column_names = {
    'customer_region_key': 'store_key',
    'customer_country': 'store_country',
    'customer_state': 'store_state',
    'customer_city': 'store_city',
    'customer_street': 'store_name',
}
store_table = customer_region_table.toDF(
    *[store_column_names.get(name, name) for name in customer_region_table.columns]
)

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 19, Finished, Available)

## **Destination Order**

In [7]:
# Read the full Silver destination-order table and show its schema.
destination = spark.table("LTT_SilverLakehouse.desti_order")
destination.printSchema()

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 9, Finished, Available)

root
 |-- concat_destination_address: string (nullable = true)
 |-- market: string (nullable = true)
 |-- order_region: string (nullable = true)
 |-- order_country: string (nullable = true)
 |-- order_state: string (nullable = true)
 |-- order_city: string (nullable = true)



In [8]:
# Latitude-only wrapper around the geocoder, used as the body of a Spark UDF.
def get_lat(city, country):
    """Return the first element (latitude) of ``get_lat_long(city, country)``."""
    return get_lat_long(city, country)[0]

# Longitude-only wrapper around the geocoder, used as the body of a Spark UDF.
def get_lon(city, country):
    """Return the second element (longitude) of ``get_lat_long(city, country)``."""
    return get_lat_long(city, country)[1]

# Wrap both helpers as Spark UDFs that return a double.
get_lat_udf, get_lon_udf = (udf(fn, DoubleType()) for fn in (get_lat, get_lon))

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 10, Finished, Available)

In [9]:
# Add latitude/longitude to each destination row.
#
# Calling separate latitude and longitude UDFs makes every row hit the geocoding
# service twice with the same query. A single struct-returning UDF performs one
# lookup per row; both coordinates are then read from its result.
# asNondeterministic() marks the call as non-repeatable (it depends on a remote
# service) so the optimizer does not duplicate it when the fields are projected.
# NOTE(review): Spark evaluates this lazily, so each action on `destination`
# (preview, write) re-issues the lookups -- consider caching if that matters.
geocode_udf = F.udf(
    get_lat_long, "struct<latitude:double,longitude:double>"
).asNondeterministic()

destination = (
    destination
    .withColumn('_coords', geocode_udf(destination['order_city'], destination['order_country']))
    .withColumn('latitude', F.col('_coords.latitude'))
    .withColumn('longitude', F.col('_coords.longitude'))
    .drop('_coords')
)

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 11, Finished, Available)

In [10]:
# Preview the first five geocoded rows (head(n) delegates to take(n)).
display(destination.take(5))

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, 4050c832-4f43-4328-84e4-76b233f4c2cf)

In [14]:
# Shape the destination dimension: add an integer surrogate key, rename the
# order_* / market columns to desti_*, and fix the output column order.
#
# monotonically_increasing_id() stores the partition index in the upper bits of
# a 64-bit value, so casting it to int wraps (or fails under ANSI mode) once the
# data spans more than one partition, producing duplicate keys. row_number()
# over a single ordered window yields gap-free 1..N keys that always fit an int.
# NOTE(review): ordering by concat_destination_address assumes it is the natural
# key of this table -- confirm. An un-partitioned window pulls all rows into one
# partition, which is presumably fine for a small dimension table.
destination = (
    destination
    .withColumn(
        "desti_key",
        F.row_number().over(Window.orderBy("concat_destination_address")).cast("int"),
    )
    .withColumnRenamed("market", "desti_market")
    .withColumnRenamed("order_region", "desti_region")
    .withColumnRenamed("order_country", "desti_country")
    .withColumnRenamed("order_state", "desti_state")
    .withColumnRenamed("order_city", "desti_city")
    .select([
        "desti_key",
        "concat_destination_address",
        "desti_city",
        "desti_state",
        "desti_country",
        "desti_region",
        "desti_market",
        "longitude",
        "latitude",
    ])
)

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 16, Finished, Available)

## **Load To Gold Lakehouse**

In [19]:
# Persist both dimensions as Delta tables in the Gold lakehouse, replacing any
# existing contents.
store_table.write.saveAsTable('LTT_GoldLakehouse.dim_store', format='delta', mode='overwrite')
destination.write.saveAsTable('LTT_GoldLakehouse.dim_destination', format='delta', mode='overwrite')

StatementMeta(, 209bd953-8373-42d4-ae8d-8e27d8710ab3, 21, Finished, Available)