PREREQUISITES
- paramiko
- python-gnupg
- com.microsoft.azure:spark-mssql-connector_2.12:1.2.0

In [0]:
# Workflow inputs: register each widget together with its default value
for widget_name, widget_default in (("book", "default"), ("previous_month", "1")):
    dbutils.widgets.text(widget_name, widget_default)

# Read the values supplied by the workflow (or the defaults above)
book = dbutils.widgets.get("book")

# previous_month is the number of months to step back from the current month
# when deriving the AS OF date; the default of 1 means the month before this one.
# Examples, assuming the current month is May:
#   previous_month = 1 -> AS OF date is April 30th
#   previous_month = 2 -> AS OF date is March 31st
#   previous_month = 5 -> AS OF date is December 31st (previous year)
previous_month = int(dbutils.widgets.get("previous_month"))

In [0]:
import paramiko
import os
import datetime
import calendar
import fnmatch  # For pattern matching
import tempfile
import subprocess
import gnupg

# Secret scope that holds the credentials; the name is the same in every environment
secret_scope = "nip-sas-scope"

# SFTP endpoint and login (username and password are read from the secret scope)
sftp_host, sftp_port = "sftp.nipgroup.com", 22
sftp_username = dbutils.secrets.get(scope=secret_scope, key="nip-sftp-username")
sftp_password = dbutils.secrets.get(scope=secret_scope, key="nip-sftp-password")

# Directory on the SFTP server that is listed for incoming files
remote_directory = "/incoming/"

# Local path of the DBFS mount that receives the downloaded files
mnt_sftp_path = "/dbfs/mnt/sftp/"

# Number of bytes requested per read while downloading (1 MiB)
CHUNK_SIZE = 1 << 20

# Tokens substituted into the filename pattern when matching remote files
patterns = ["CLAIM", "EXPOSURE", "ORGANIZATION", "POLICY", "PREMIUM"]

# Resolve the AS OF period: the month lying `previous_month` months before today.
current_date = datetime.datetime.now()
# Work on a running month count (year * 12 + zero-based month) so that stepping
# back past January rolls into the previous year instead of producing month <= 0,
# which calendar.monthrange rejects.
months_total = current_date.year * 12 + (current_date.month - 1) - previous_month
current_year, current_month = divmod(months_total, 12)
current_month += 1  # back to calendar numbering 1..12
month_name = calendar.month_name[current_month]
# Last calendar day of the AS OF month (leap years handled by monthrange)
last_day = calendar.monthrange(current_year, current_month)[1]
# YYYYMMDD of the AS OF month end, used to match the remote filenames
date_suffix = f"{current_year:04d}{current_month:02d}{last_day:02d}"

# Connect to SFTP, pick the files for the AS OF date and download them to the mount.
#
# Raises:
#   Exception: if the connection cannot be established, or if no remote file
#              matches the expected name pattern.
#   Errors raised while listing or downloading propagate unchanged so the
#   traceback points at the real cause instead of a generic connection message.

# Initialised up front so the cleanup below only ever closes handles created by
# this run (a leftover variable from an earlier run of the cell is ignored).
transport = None
sftp = None
sftp_files_path = []  # Local paths of the downloaded encrypted files, used by a later cell

# The filename template depends only on the book parameter, so resolve it once
if book == "jif":
    file_pattern = "I2I_JIF_{pattern}_NIP_{date_suffix}*"
elif book == "program":
    file_pattern = "I2I_Programs_{pattern}_NIP_{date_suffix}*"
else:
    file_pattern = "I2I_{pattern}_NIP_{date_suffix}*"

# One glob per data set (CLAIM, EXPOSURE, ...) for the AS OF date
wanted_globs = [
    file_pattern.format(pattern=pattern, date_suffix=date_suffix)
    for pattern in patterns
]

try:
    try:
        # NOTE(review): connect() is called without a hostkey argument, so the
        # server identity does not appear to be verified here - confirm this is acceptable.
        transport = paramiko.Transport((sftp_host, sftp_port))
        transport.connect(username=sftp_username, password=sftp_password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        transport.set_keepalive(30)  # Keep connection alive every 30 seconds
    except Exception as e:
        raise Exception(f"Error connecting to SFTP server: {e}") from e

    # List the remote directory and keep only the names matching one of the globs
    files = sftp.listdir(remote_directory)
    filtered_files = [
        name for name in files
        if any(fnmatch.fnmatch(name, glob) for glob in wanted_globs)
    ]

    # Stop early when nothing was delivered for the AS OF date
    if not filtered_files:
        raise Exception("No files found for the current month and year.")

    # Pair every file with its remote size and download the smallest first
    files_to_download = [
        (name, sftp.stat(os.path.join(remote_directory, name)).st_size)
        for name in filtered_files
    ]
    files_to_download.sort(key=lambda x: x[1])

    for file, _size in files_to_download:
        sftp_path = os.path.join(mnt_sftp_path, file)  # Blob container path, mounted
        remote_file_path = os.path.join(remote_directory, file)  # Remote file path

        # Stream the file in CHUNK_SIZE pieces; the local name equals the remote name
        with sftp.open(remote_file_path, "rb") as remote_file, open(sftp_path, "wb") as downloadable_file:
            while True:
                data = remote_file.read(CHUNK_SIZE)
                if not data:
                    break
                downloadable_file.write(data)

        # Recorded only once the file has been written completely
        sftp_files_path.append(sftp_path)
finally:
    # Always release the connection, whether or not the download succeeded
    if sftp is not None:
        sftp.close()
    if transport is not None:
        transport.close()

In [0]:
# Decrypt every file downloaded by the SFTP cell into the SAS mount, using a
# throwaway GnuPG home that only contains the private key from the secret scope.

# PGP secret
pgp_secret = dbutils.secrets.get(scope=secret_scope, key="nip-pgp-secret")
# Destination for the decrypted files; the encrypted inputs live under mnt_sftp_path
mnt_sas_path = "/dbfs/mnt/sas/"

# Rebuild the ASCII-armored key block from the stored secret value.
# NOTE(review): the last 4 characters are dropped and every space becomes a
# newline - presumably the secret is stored flattened with a short trailer;
# confirm against how the secret was uploaded.
pgp_len = len(pgp_secret)
pgp_key = pgp_secret[:(pgp_len - 4)]
begin = "-----BEGIN PGP PRIVATE KEY BLOCK-----"
end = "-----END PGP PRIVATE KEY BLOCK-----"
formatted = pgp_key.replace(begin, "").replace(end, "").replace(" ", "\n")
pgp_secret = f"{begin}{formatted}{end}"

with tempfile.TemporaryDirectory() as gnupghome:
    # The gpg subprocess below locates the temporary keyring through GNUPGHOME.
    # Remember the previous value so the variable does not keep pointing at a
    # directory that is deleted when this block exits.
    previous_gnupghome = os.environ.get("GNUPGHOME")
    os.environ["GNUPGHOME"] = gnupghome
    try:
        gpg = gnupg.GPG(gnupghome=gnupghome)

        # Import the PGP secret and fail with a clear message if nothing was imported
        import_result = gpg.import_keys(pgp_secret)
        if not import_result.fingerprints:
            raise Exception("PGP private key import failed. Ensure the correct PGP private key is stored in the secret scope.")
        keyid = import_result.fingerprints[0]
        gpg.trust_keys(keyid, "TRUST_ULTIMATE")

        # List imported keys, do a check to ensure the secret key is present
        imported_keys = gpg.list_keys(secret=True)
        if not imported_keys:
            dbutils.notebook.exit("No secret key found. Ensure the correct PGP private key is imported.")

        # sftp_files_path holds the local paths of the encrypted files written by the SFTP cell
        for sftp_file in sftp_files_path:
            # Same file name under the SAS mount, without the encryption extension
            sas_file = sftp_file.replace(mnt_sftp_path, mnt_sas_path).replace(".gpg", "").replace(".pgp", "")

            # --batch/--yes keep gpg non-interactive and let a re-run overwrite an
            # existing output instead of failing on the overwrite prompt
            gpg_command = ['gpg', '--batch', '--yes', '--output', sas_file, '--decrypt', sftp_file]

            try:
                subprocess.run(gpg_command, check=True, capture_output=True, text=True)
            except subprocess.CalledProcessError as e:
                # Best effort: report the failure (with gpg's own diagnostics) and continue
                print(f"Decryption failed for file: {sftp_file}.")
                print(e.stderr)
    finally:
        # Put GNUPGHOME back the way it was before this cell ran
        if previous_gnupghome is None:
            os.environ.pop("GNUPGHOME", None)
        else:
            os.environ["GNUPGHOME"] = previous_gnupghome