In [0]:
%sql

--DESCRIBE CATALOG workspace;
--show tables in default  like  'transactions';

--select * from information_schema.tables where table_name like 'transactions'

--DESCRIBE EXTENDED workspace.default.transactions;

/*
select
input_file_name,
*
from transactions
limit 10;
*/

--DESCRIBE DETAIL transactions;

--SHOW PARTITIONS transactions;

--LIST '/user/hive/workspace';



In [0]:
#dbutils.fs.ls("/user/hive/warehouse/your_table_name.db/your_table_name")
#dbutils.fs.ls("/user/hive/workspace/default")

#ls "/user/hive/workspace/default"

In [0]:
%sh

# Environment probe for the shell that backs this notebook.
# Print the Databricks CLI version.
databricks -v
# Show the shell's working directory and what it contains.
pwd
ls
# List catalogs through the Databricks CLI.
databricks catalogs list

In [0]:
from pyspark.sql.functions import current_timestamp, lit, year, month, dayofmonth, round, rand, col, from_unixtime
from datetime import datetime
from pyspark.sql.types import StringType, TimestampType, BooleanType, StructType, StructField, DoubleType, LongType


class hp:
    """Notebook helper: resolves the target catalogue/database and decorates DataFrames.

    On construction (through :meth:`setup`) it

    * reads the ``catalogue_bronze_name``, ``database_bronze_name`` and
      ``display_target`` notebook widgets via ``dbutils.widgets.get``,
    * lets explicit, non-None arguments override the widget values,
    * issues ``USE CATALOG`` / ``USE DATABASE`` for the resolved names, and
    * re-declares three SQL variables, named like the widgets, that hold the
      resolved values.

    Relies on the notebook-provided globals ``spark`` and ``dbutils``.
    """

    # Fallback values used when neither a widget nor an explicit argument
    # supplies a setting.
    catalogue = "workspace"
    database = "default"
    display_target = "development"

    def __init__(self, catalogue = None, database = None, display_target = None):
        """Create the helper and immediately run :meth:`setup`.

        All arguments default to None so that widget values (or the class-level
        fallbacks) are used unless the caller overrides them explicitly.
        """
        self.setup(catalogue = catalogue, database = database, display_target = display_target)

    @staticmethod
    def _widget_value(name):
        """Return the value of notebook widget ``name``, or None if it cannot be read."""
        try:
            return dbutils.widgets.get(name)
        except Exception:
            # Deliberately best-effort: a missing widget is not an error.
            # ``Exception`` (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            print(f'No {name} parameter was found')
            return None

    @staticmethod
    def _sql_string_literal(value):
        """Render ``value`` as a single-quoted SQL string literal.

        Backslashes and single quotes are backslash-escaped so the value cannot
        terminate the literal early. Values without those characters produce
        exactly the same text as plain interpolation.
        """
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    @classmethod
    def _publish_variable(cls, name, value):
        """Drop, re-declare (as string) and assign the SQL variable ``name``."""
        spark.sql(f"drop temporary variable if exists {name};")
        spark.sql(f"declare variable {name} string;")
        spark.sql(f"set variable {name}={cls._sql_string_literal(value)};")

    def setup(self, catalogue = "workspace", database = "default", display_target = "development"):
        """Resolve the settings, switch catalogue/database and publish SQL variables.

        Precedence, lowest to highest: current attribute value, widget value,
        explicit non-None argument.

        Note that the defaults of this method are not None, so calling
        ``setup()`` directly without arguments overrides any widget values with
        "workspace" / "default" / "development". ``__init__`` passes None for
        every argument, which is what lets widget values take effect.

        Args:
            catalogue: Catalogue name, or None to keep the widget/current value.
            database: Database name, or None to keep the widget/current value.
            display_target: Display target, or None to keep the widget/current value.
        """
        # (attribute on self, widget name, explicit argument)
        settings = (
            ('catalogue', 'catalogue_bronze_name', catalogue),
            ('database', 'database_bronze_name', database),
            ('display_target', 'display_target', display_target),
        )

        # Pass 1: widgets, read in a fixed order.
        for attribute, widget_name, _ in settings:
            widget_value = self._widget_value(widget_name)
            if widget_value is not None:
                setattr(self, attribute, widget_value)

        # Pass 2: explicit arguments win over widgets.
        for attribute, _, explicit in settings:
            if explicit is not None:
                setattr(self, attribute, explicit)

        # NOTE(review): the names are interpolated as unquoted identifiers;
        # only pass trusted catalogue/database names.
        spark.sql(f'USE CATALOG {self.catalogue}')
        spark.sql(f'USE DATABASE {self.database}')

        # Expose the resolved values to SQL cells under the widget names.
        for attribute, widget_name, _ in settings:
            self._publish_variable(widget_name, getattr(self, attribute))

    def add_standard_columns(self, df,createdBy=None,modifiedBy=None):
        """Append the standard audit columns to ``df`` and return the new DataFrame.

        Columns are added in this order: ``timestamp``, ``modifiedOn``,
        ``modifiedBy``, ``createdOn``, ``createdBy``, ``isCurrent``. The order is
        kept stable because ``DataFrame.union`` matches columns by position.

        Args:
            df: Input DataFrame.
            createdBy: Author recorded in ``createdBy``; when None both
                ``createdOn`` and ``createdBy`` are typed NULLs.
            modifiedBy: Author recorded in ``modifiedBy``; when None both
                ``modifiedOn`` and ``modifiedBy`` are typed NULLs.

        Returns:
            ``df`` with the six columns added (``isCurrent`` is always True).
        """
        def audit_pair(author):
            # (timestamp column, author column) for one audit slot; typed NULLs
            # when no author was given so the schema is the same either way.
            if author is None:
                return lit(None).cast(TimestampType()), lit(None).cast(StringType())
            return current_timestamp().cast(TimestampType()), lit(author).cast(StringType())

        modified_on, modified_by = audit_pair(modifiedBy)
        created_on, created_by = audit_pair(createdBy)

        return (
            df
            .withColumn('timestamp', current_timestamp())
            .withColumn('modifiedOn', modified_on)
            .withColumn('modifiedBy', modified_by)
            .withColumn('createdOn', created_on)
            .withColumn('createdBy', created_by)
            .withColumn('isCurrent', lit(True).cast(BooleanType()))
        )

    def transactions_pt(self, df):
        """Add the columns derived from ``time`` and ``customer_id``.

        Adds ``year`` and ``month`` (taken from the ``time`` column) and
        ``customer_partition`` (``customer_id`` modulo 10).

        Args:
            df: DataFrame that has ``time`` and ``customer_id`` columns.

        Returns:
            ``df`` with the three derived columns added.
        """
        return (
            df
            .withColumn('year', year(df['time']))
            .withColumn('month', month(df['time']))
            .withColumn('customer_partition', df['customer_id']%10)
        )


# Shared helper instance used by the cells below. With no arguments the
# constructor passes None for every setting to setup().
chp = hp()
#chp.setup(catalogue="ops",database="etl")

# Example usage:
# transformed_df = chp.add_standard_columns(some_df, createdBy='etl')
# display(transformed_df)

In [0]:

# Schema of the raw transaction columns; every field is declared non-nullable.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ('id', LongType()),
        ('amount', DoubleType()),
        ('country_id', LongType()),
        ('store_id', LongType()),
        ('customer_id', LongType()),
        ('time', TimestampType()),
    )
])

# Zero-row DataFrame with that schema, extended with the standard columns and
# then with the columns added by transactions_pt.
empty_df = chp.transactions_pt(
    chp.add_standard_columns(
        spark.createDataFrame([], schema=schema)
    )
)

#display(empty_df)

#(
#    empty_df.write.mode("overwrite")
#        .option("overwriteSchema", "true")
#        .partitionBy("year","store_id","customer_partition")
#        .saveAsTable(f"{chp.catalogue}.{chp.database}.transactions")
#)

In [0]:

#150000000000

# Synthetic data generation: ids are produced in windows [i1, i2) of `step`
# rows and appended to the transactions table, one window per iteration.
# NOTE: i2 is not clamped to imax, so the last window may extend past imax
# (with imax=1 exactly one full `step`-sized window is written).
i1=0
step=50000000
#imax=150000000000
imax=1
i=1  # 1-based iteration counter, used only for logging

while i1<imax:
    i2=i1+step

    print("Starting iteration i=",i," i1=",i1,"i2=",i2,"imax=",imax,"step=",step, " current timestamp ",datetime.now())

    # One row per id in the window: random amount and customer_id,
    # country/store derived from the id, and a timestamp of
    # 1701692381 + id interpreted as unix seconds.
    df = (
        spark
            .range(i1,i2,1)#,32)
            .select(
                'id',
                round(rand()*1000,2).alias('amount'),
                (col('id')%10).alias('country_id'),
                (col('id')%100).alias('store_id'),
                round(rand()*100000000,0).cast(LongType()).alias('customer_id'),
                from_unixtime(lit(1701692381+col('id'))).cast(TimestampType()).alias('time'),

            )
    )

    # Two variants of the same window: created-only rows, and rows that also
    # carry a modifier.
    df1 = chp.add_standard_columns(df,'etl')
    df2 = chp.add_standard_columns(df,'etl','mml2')

    # union() matches columns by position; both variants come from the same
    # helper, so their column order is identical.
    df1 = df1.union(df2)

    df1 = chp.transactions_pt(df1)

    #display(df1)

    (
        df1.write.mode("append")
            .option("mergeSchema", "false")
            .partitionBy("year","store_id","customer_partition")
            .saveAsTable(f"{chp.catalogue}.{chp.database}.transactions")
    )

    # The counter is printed right after its "i=" label; previously it
    # appeared after " is over".
    print("Iteration i=",i," is over"," i1=",i1,"i2=",i2,"imax=",imax,"step=",step, " current timestamp ",datetime.now())

    i=i+1
    i1=i1+step


In [0]:
%sql
-- Row count of the transactions table; the name is unqualified, so it is
-- resolved against the current catalog and schema.
select count(*) as rccount from transactions