In [1]:
# Source of the Spark job; a later cell writes this text verbatim to ./main.py.
# The script sits inside a regular (non-raw) triple-quoted string, so it
# deliberately contains no backslashes and uses ''' for its own docstrings.
mainfile = """\
'''Load delimited text files from Azure storage (abfss://) into a Delta table.'''
import configparser
import logging
import re

import click
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType

logging.basicConfig(level="INFO")
logger = logging.getLogger(__name__)

CONFIG_PATH = "config.cfg"

# Extra Maven packages Spark must download in addition to Delta itself.
# They are handed to configure_spark_with_delta_pip(extra_packages=...) because
# that helper sets spark.jars.packages on its own; setting the key by hand
# (twice, as before) left only one coordinate and then lost that one as well.
# NOTE(review): abfss:// is normally served by the hadoop-azure artifact, not
# hadoop-azure-datalake -- confirm the coordinate and match its version to the
# Hadoop build bundled with the Spark distribution in use.
EXTRA_SPARK_PACKAGES = ["org.apache.hadoop:hadoop-azure-datalake:3.1.1"]

# Column names of the raw files, in file order. Every column is read as text.
DOCAI_COLUMNS = [
    "S.no",
    "Item 1",
    "Item 1A",
    "Item 2",
    "Item 5",
    "Item 6",
    "Item 7",
    "Item 7a",
    "Item 8",
    "DIRECTORS, EXECUTIVE OFFICERS AND CORPORATE GOVERNANCE (References only)",
    "CERTAIN RELATIONSHIPS AND RELATED TRANSACTIONS, AND DIRECTOR INDEPENDENCE (References only)",
    "Item 15",
]
docai_schema = StructType([StructField(name, StringType()) for name in DOCAI_COLUMNS])


def load_azure_settings(path=CONFIG_PATH):
    '''Return (container_name, storage_account_name, account_access_key).

    Raises FileNotFoundError when the config file is missing; ConfigParser.read
    would otherwise skip it silently and fail later with an unhelpful KeyError.
    '''
    config = configparser.ConfigParser()
    if not config.read(path, encoding="utf-8-sig"):
        raise FileNotFoundError(f"Configuration file not found: {path}")
    azure = config["AZURE"]
    return (
        azure["CONTAINER_NAME"],
        azure["STORAGE_ACCOUNT_NAME"],
        azure["ACCOUNT_ACCESS_KEY"],
    )


def delta_safe_column_name(name):
    '''Replace every run of non-alphanumeric characters in name with one underscore.

    Delta rejects column names containing spaces, commas, parentheses and
    similar characters unless column mapping is enabled, so the raw headers
    cannot be written unchanged.
    '''
    return re.sub("[^0-9A-Za-z]+", "_", name).strip("_")


def build_spark_session(storage_account_name, account_access_key):
    '''Create a local Spark session with Delta enabled and shared-key Azure auth.'''
    account_host = f"{storage_account_name}.dfs.core.windows.net"
    builder = (
        SparkSession.builder
        .appName("documentai")
        .master("local[*]")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config(f"spark.hadoop.fs.azure.account.auth.type.{account_host}", "SharedKey")
        .config(f"spark.hadoop.fs.azure.account.key.{account_host}", account_access_key)
    )
    return configure_spark_with_delta_pip(
        builder, extra_packages=EXTRA_SPARK_PACKAGES
    ).getOrCreate()


@click.command()
@click.option('--raw_data_bucket', prompt='Azure raw bucket-name', help='Name of Raw data Bucket.')
@click.option('--start_date', prompt='Enter Start date', help='The start date to get data.')
@click.option('--end_date', prompt='Enter end date', help='The end date to get data.')
@click.option('--delta_table', prompt='Enter delta lake table name', help='The delta table name.')
def session(raw_data_bucket, start_date, end_date, delta_table):
    '''Read the raw files, overwrite the Delta table with them and show the result.'''
    click.echo(f"This is the Azure bucket to be used {raw_data_bucket}!")
    click.echo(f"This is the start date {start_date}!")
    click.echo(f"This is the end date {end_date}!")
    click.echo(f"This is the delta lake table {delta_table}!")
    # NOTE(review): start_date and end_date are only echoed; no filtering by
    # date was implemented in the original job either.

    container_name, storage_account_name, account_access_key = load_azure_settings()
    base_uri = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
    raw_data_glob = f"{base_uri}/{raw_data_bucket}/*.txt"
    table_path = f"{base_uri}/{delta_table}"

    try:
        spark = build_spark_session(storage_account_name, account_access_key)
    except Exception:
        # Log with traceback and re-raise: nothing below can run without Spark.
        logger.exception("Spark session could not be started")
        raise

    try:
        # NOTE(review): the 12-column schema implies delimited files, so they
        # are read as CSV with a header row -- confirm against the real data.
        # (DataFrameReader.text() yields one 'value' column and accepts
        # neither header= nor schema=.)
        df_raw = (
            spark.read
            .option("header", "true")
            .schema(docai_schema)
            .csv(raw_data_glob)
        )
        df_raw.show(5)

        df_docai = df_raw.toDF(*[delta_safe_column_name(c) for c in df_raw.columns])

        # mode("overwrite") replaces the previous run's table contents, so no
        # separate clean-up step is needed (shutil cannot delete abfss:// paths).
        df_docai.write.format("delta").mode("overwrite").save(table_path)

        # Read back through the DataFrame reader. Delta stores its own schema,
        # so none is supplied here.
        spark.read.format("delta").load(table_path).show(10)

        # Read the same table through the DeltaTable API.
        DeltaTable.forPath(spark, table_path).toDF().show()
    finally:
        # Runs even when click exits via SystemExit or a step above fails.
        spark.stop()


if __name__ == "__main__":
    session()
"""

In [2]:
# Write mainfile: save the Spark job source held in `mainfile` as ./main.py.
# The encoding is pinned so the bytes on disk do not depend on the host's
# locale (open() otherwise uses the platform default encoding).

mainpy_path = './main.py'
with open(mainpy_path, 'w', encoding='utf-8') as f:
    f.write(mainfile)

In [3]:
# Dockerfile text for the Spark job image; a later cell writes it to ./Dockerfile.
# Kept free of backslashes because this is a regular (non-raw) string literal.
dockerfile = """\
FROM gcr.io/spark-operator/spark-py:v3.1.1

# Switch to root so we can install packages and add files to the image.
USER root

WORKDIR /app

# Refresh the package index and pass -y: the build is non-interactive, so a
# bare "apt-get install" would stop at the confirmation prompt.
RUN apt-get update && apt-get install -y python3-pip && rm -rf /var/lib/apt/lists/*

COPY requirements.txt /app/
RUN python3 -m pip install --no-cache-dir -r requirements.txt

# The notebook writes the job as main.py, so that is the file to copy.
# main.py reads config.cfg from the working directory; it is not copied here
# and has to be provided at runtime (for example as a mounted file).
COPY main.py /app/

ENTRYPOINT [ "/opt/entrypoint.sh" ]
"""

In [4]:
# Save the Dockerfile text held in `dockerfile` as ./Dockerfile.
# The encoding is pinned so the bytes on disk do not depend on the host's locale.
docker_path = './Dockerfile'
with open(docker_path, 'w', encoding='utf-8') as f:
    f.write(dockerfile)

In [6]:
# Python dependencies for the image; the Dockerfile installs them with
# `pip install -r requirements.txt`.
# NOTE(review): none of the versions are pinned, so every build resolves
# whatever is current. pyspark and delta-spark have to be a compatible pair
# and should match the Spark version of the base image -- confirm and pin.
requirements = """\
pyspark
delta-spark
click
# wget
"""

In [7]:
# Save the dependency list held in `requirements` as ./requirements.txt.
# The encoding is pinned so the bytes on disk do not depend on the host's locale.
text_path = './requirements.txt'
with open(text_path, 'w', encoding='utf-8') as f:
    f.write(requirements)

In [10]:
# Build docker image
# Uses the Dockerfile in the current directory as build context and tags the
# result `sparkrun`. Needs a running Docker daemon: the recorded output of this
# cell is the error Docker prints when the daemon cannot be reached.

!docker build -t sparkrun .

The system cannot find the path specified.
error during connect: This error may indicate that the docker daemon is not running.: Post "http://%2F%2F.%2Fpipe%2Fdocker_engine/v1.24/build?buildargs=%7B%7D&cachefrom=%5B%5D&cgroupparent=&cpuperiod=0&cpuquota=0&cpusetcpus=&cpusetmems=&cpushares=0&dockerfile=Dockerfile&labels=%7B%7D&memory=0&memswap=0&networkmode=default&rm=1&shmsize=0&t=sparkrun&target=&ulimits=null&version=1": open //./pipe/docker_engine: The system cannot find the file specified.


In [None]:
# List local images; `sparkrun` should appear here once the build has succeeded.
!docker images

In [None]:
# Azure deployment settings; the shell cells read them through IPython's
# $name expansion, so the variable names must stay exactly as they are.
resourceGroupName = "BMA-nlp-infra"
location = "canadacentral"
acrName = "infranlpacr"
# These two look like placeholders -- replace with the real ids before `az login`.
tenant_id = subscription_id = "XXXX"

In [None]:
# Sanity check: print the settings as the shell cells will see them after
# IPython expands $name from the notebook namespace.
!echo $resourceGroupName
!echo $location
# Was `$acrNameb`, a typo for a variable that does not exist.
!echo $acrName

In [None]:
# Sign in to Azure. The leading `$` makes IPython substitute the Python
# variables; without it the literal words "tenant_id" and "subscription_id"
# were sent to the CLI.
!az login --tenant $tenant_id --subscription $subscription_id

In [None]:
# Log the local Docker client in to the container registry named by `acrName`.
!az acr login --name $acrName

In [None]:
# Give the local `sparkrun` image the registry-qualified name
# <acrName>.azurecr.io/sparkrun:v1 so it can be pushed.
!docker tag sparkrun $acrName".azurecr.io/sparkrun:v1"

In [None]:
# List local images again; the registry-qualified tag should now be present.
!docker images

In [None]:
# Upload the tagged image to the registry (requires the `az acr login` cell to have succeeded).
!docker push $acrName".azurecr.io/sparkrun:v1"

In [None]:
# Show the repositories stored in the registry, formatted as a table.
!az acr repository list --name $acrName --output table