In [0]:
pip install faker

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting faker
  Downloading Faker-23.1.0-py3-none-any.whl (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 13.5 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-23.1.0
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from faker import Faker
import random
from datetime import datetime, timedelta, date

# Notebook parameters (widgets / job arguments).
snapshot_date_key = getArgument('snapshot_date_key')
trunc = getArgument('trunc')  # read by the truncate cell further down

# Create a Faker instance
fake = Faker()

# Set seeds for reproducibility.
# Faker draws from its own shared random.Random instance, so seeding only the
# stdlib `random` module leaves every fake.* value unseeded; both are seeded.
# NOTE: the timestamps are still relative to 'now', so they shift between runs
# executed at different times.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Define probabilities for Division_Code_Key values
# NOTE(review): not referenced anywhere in this cell -- Division_Code_Key is
# taken from the sampled consumer row instead. Kept in case another cell uses it.
division_code_probabilities = [0.95, 0.0003, 0.0123, 0.0146, 0.0228]

# Get data from dimensions: (Consumer_Key, Division_Code_Key) pairs, collected
# to the driver so they can be sampled in the Python loop below.
consumers = (
    spark.read.table('catalog_westeurope_dpn01_de_dev.db_dtc_core_data_prep_cbrep.cfdm_consumer_dimension')
    .select('Consumer_Key', 'Division_Code_Key')
    .collect()
)
if not consumers:
    # Fail with a clear message instead of an IndexError from the sampler.
    raise ValueError('cfdm_consumer_dimension returned no rows; cannot generate synthetic consumer transactions')

# Define the number of rows for your synthetic data
num_rows = 20000

# Generate synthetic data: one tuple per fact row, in the same order as `schema`.
data = []
for fact_key in range(num_rows):
    # Pick one existing consumer uniformly at random so that Consumer_Key and
    # Division_Code_Key come from the same dimension row.
    consumer = random.choice(consumers)
    consumer_key = consumer.Consumer_Key
    division_code_key = consumer.Division_Code_Key
    # `modified` is drawn from [created, now], so it is never earlier than `created`.
    created = fake.date_time_between(start_date='-3y', end_date='now')
    modified = fake.date_time_between(start_date=created, end_date='now')
    data.append((
        fact_key,                                                               # Consumer_Transaction_Fact_Key
        10,                                                                     # Region_Code_Key
        fake.random_int(min=100000, max=999999),                                # Affiliate_Code_Key
        1,                                                                      # Market_Code_Key
        division_code_key,                                                      # Division_Code_Key
        1,                                                                      # Currency_Code_Key
        consumer_key,                                                           # Consumer_Key
        fake.random_int(min=1, max=4),                                          # Consumer_Transaction_Channel_Code_Key
        fake.random_int(min=1, max=4),                                          # Product_Application_Hierarchy_Key
        snapshot_date_key,                                                      # Snapshot_Date_Key (notebook parameter)
        fake.random_int(min=2020, max=2023),                                    # Consumer_Transaction_Calendar_Year_Number
        fake.random_int(min=1, max=12),                                         # Consumer_Transaction_Calendar_Month_Number
        fake.random_int(min=0, max=500),                                        # Inactive_Consumer_Saleable_Transaction_Count
        fake.random_int(min=0, max=500),                                        # Active_Consumer_Saleable_Transaction_Count
        fake.pydecimal(left_digits=12, right_digits=2, positive=True, min_value=1, max_value=400),    # Inactive_Consumer_Monetary_Saleable_Transaction_Amount
        fake.pydecimal(left_digits=12, right_digits=2, positive=True, min_value=1, max_value=800),    # Active_Consumer_Monetary_Saleable_Transaction_Amount
        fake.pydecimal(left_digits=12, right_digits=2, positive=True, min_value=1, max_value=1000),   # Consumer_Total_Transaction_Amount
        fake.random_int(min=0, max=500),                                        # Consumer_Total_Transaction_Count
        fake.random_int(min=1, max=300),                                        # Consumer_Total_Item_Count
        fake.random_int(min=1, max=200),                                        # Consumer_Distinct_Item_Count
        fake.pydecimal(left_digits=12, right_digits=2, positive=True, min_value=1, max_value=900),    # Consumer_Lifetime_Total_Transaction_Amount
        fake.random_int(min=1, max=500),                                        # Consumer_Lifetime_Total_Transaction_Count
        fake.random_int(min=1, max=300),                                        # Consumer_Last12m_Total_Transaction_Count
        fake.random_int(min=1, max=300),                                        # Consumer_Last12m_Total_Item_Count
        fake.pydecimal(left_digits=12, right_digits=2, positive=True, min_value=1, max_value=2000),   # Consumer_Last12m_Total_Transaction_Amount
        created,                                                                # Consumer_Transaction_Fact_Record_Created_Timestamp
        modified                                                                # Consumer_Transaction_Fact_Record_Modified_Timestamp
    ))

# Create DataFrame schema (column names only; Spark infers the types from the data)
schema = [
    "Consumer_Transaction_Fact_Key",
    "Region_Code_Key",
    "Affiliate_Code_Key",
    "Market_Code_Key",
    "Division_Code_Key",
    "Currency_Code_Key",
    "Consumer_Key",
    "Consumer_Transaction_Channel_Code_Key",
    "Product_Application_Hierarchy_Key",
    "Snapshot_Date_Key",
    "Consumer_Transaction_Calendar_Year_Number",
    "Consumer_Transaction_Calendar_Month_Number",
    "Inactive_Consumer_Saleable_Transaction_Count",
    "Active_Consumer_Saleable_Transaction_Count",
    "Inactive_Consumer_Monetary_Saleable_Transaction_Amount",
    "Active_Consumer_Monetary_Saleable_Transaction_Amount",
    "Consumer_Total_Transaction_Amount",
    "Consumer_Total_Transaction_Count",
    "Consumer_Total_Item_Count",
    "Consumer_Distinct_Item_Count",
    "Consumer_Lifetime_Total_Transaction_Amount",
    "Consumer_Lifetime_Total_Transaction_Count",
    "Consumer_Last12m_Total_Transaction_Count",
    "Consumer_Last12m_Total_Item_Count",
    "Consumer_Last12m_Total_Transaction_Amount",
    "Consumer_Transaction_Fact_Record_Created_Timestamp",
    "Consumer_Transaction_Fact_Record_Modified_Timestamp"
]

# Create PySpark DataFrame
df = spark.createDataFrame(data, schema=schema)

# Dropping duplicates
# NOTE(review): Consumer_Transaction_Fact_Key is unique per generated row, so a
# full-row dropDuplicates() cannot remove anything. If one row per business key
# is wanted, deduplicate on a column subset instead, e.g.:
#   df.dropDuplicates(["Region_Code_Key", "Currency_Code_Key", "Market_Code_Key",
#                      "Division_Code_Key", "Consumer_Key"])
result_df = df.dropDuplicates()

# Expose the rows to the %sql cells below and show the result DataFrame
result_df.createOrReplaceTempView('dummy_data')
display(result_df)

Consumer_Transaction_Fact_Key,Region_Code_Key,Affiliate_Code_Key,Market_Code_Key,Division_Code_Key,Currency_Code_Key,Consumer_Key,Consumer_Transaction_Channel_Code_Key,Product_Application_Hierarchy_Key,Snapshot_Date_Key,Consumer_Transaction_Calendar_Year_Number,Consumer_Transaction_Calendar_Month_Number,Inactive_Consumer_Saleable_Transaction_Count,Active_Consumer_Saleable_Transaction_Count,Inactive_Consumer_Monetary_Saleable_Transaction_Amount,Active_Consumer_Monetary_Saleable_Transaction_Amount,Consumer_Total_Transaction_Amount,Consumer_Total_Transaction_Count,Consumer_Total_Item_Count,Consumer_Distinct_Item_Count,Consumer_Lifetime_Total_Transaction_Amount,Consumer_Lifetime_Total_Transaction_Count,Consumer_Last12m_Total_Transaction_Count,Consumer_Last12m_Total_Item_Count,Consumer_Last12m_Total_Transaction_Amount,Consumer_Transaction_Fact_Record_Created_Timestamp,Consumer_Transaction_Fact_Record_Modified_Timestamp
192,10,291088,1,17,1,5223115,4,4,20240212,2023,1,298,123,314.42,109.88,38.85,18,246,177,299.21,370,29,18,1167.75,2023-01-30T09:22:37.615582Z,2023-05-16T01:41:43.991444Z
212,10,409710,1,26,1,2369445,2,4,20240212,2020,6,15,64,288.46,581.14,236.27,164,259,155,62.64,196,54,7,1319.43,2022-05-16T13:15:00.200991Z,2023-10-25T21:50:33.676606Z
213,10,105083,1,21,1,9558557,3,2,20240212,2022,9,466,48,95.24,589.59,963.03,75,77,95,290.47,152,52,77,940.69,2021-11-10T07:14:33.653582Z,2023-06-06T07:55:42.159013Z
339,10,556866,1,12,1,8334376,4,2,20240212,2020,12,485,498,142.79,623.65,726.29,4,172,42,112.35,482,2,187,994.09,2023-01-08T00:39:24.113312Z,2023-12-25T21:52:22.713634Z
349,10,504804,1,22,1,7692818,4,1,20240212,2022,7,262,354,259.5,180.57,49.15,286,50,154,730.69,411,288,119,1610.89,2023-10-14T16:52:26.381215Z,2024-01-02T23:32:57.72506Z
354,10,898450,1,38,1,2256425,4,3,20240212,2023,10,235,121,365.79,101.84,842.6,94,1,109,485.77,392,91,261,1498.17,2023-07-24T21:00:23.829222Z,2024-01-08T06:48:51.790835Z
721,10,977425,1,36,1,5843920,1,1,20240212,2023,12,21,74,63.8,316.76,86.82,312,262,50,270.33,371,32,88,1610.65,2021-12-04T14:31:40.367621Z,2023-06-24T01:19:26.291317Z
801,10,823923,1,12,1,2629614,1,4,20240212,2022,9,126,171,192.29,411.88,209.49,95,222,79,732.8,389,72,97,1466.13,2024-02-05T14:55:19.080086Z,2024-02-12T09:08:39.837894Z
1000,10,624423,1,11,1,7341885,2,4,20240212,2020,4,187,136,307.24,206.69,11.53,103,192,47,562.34,299,117,115,175.11,2023-08-20T03:52:05.147307Z,2023-08-26T06:28:18.543545Z
437,10,585453,1,12,1,1855849,3,3,20240212,2023,5,216,200,253.3,628.33,67.6,355,173,60,74.01,51,174,81,870.1,2021-02-18T11:13:26.6522Z,2022-01-12T15:55:05.635058Z


In [0]:
%sql
-- Make catalog_westeurope_dpn01_de_dev the current catalog for later statements.
USE CATALOG catalog_westeurope_dpn01_de_dev

In [0]:
# Optionally empty the target fact table before loading the synthetic rows.
# `trunc` is a string notebook parameter; compare case-insensitively and ignore
# surrounding whitespace so that 'true' / 'TRUE' / ' True' do not silently skip
# the truncate and leave old rows in place before the insert below.
if str(trunc).strip().lower() == 'true':
    spark.sql('TRUNCATE TABLE catalog_westeurope_dpn01_de_dev.db_dtc_core_data_prep_cbrep.cfdm_consumer_transaction_fact')


In [0]:
%sql
-- Append every row of the dummy_data temp view to the consumer transaction fact table.
-- SELECT * maps columns by position, so the view's column order must match the table's.
INSERT INTO catalog_westeurope_dpn01_de_dev.db_dtc_core_data_prep_cbrep.cfdm_consumer_transaction_fact
SELECT * FROM dummy_data;

num_affected_rows,num_inserted_rows
20000,20000


-- Read back the consumer transaction fact table; the row limit is commented out.
select * from catalog_westeurope_dpn01_de_dev.db_dtc_core_data_prep_cbrep.cfdm_consumer_transaction_fact --limit 10;