PREREQUISITES
- paramiko
- python-gnupg
- com.microsoft.azure:spark-mssql-connector_2.12:1.2.0

In [0]:
# Workflow input: declare the "book" text widget, defaulting to "default"
dbutils.widgets.text("book", "default")

# Read the widget's current value; later cells branch on it to choose the
# file-name template ("jif", "program", or anything else)
book = dbutils.widgets.get("book")

In [0]:
import paramiko
import os
import datetime
import calendar
import fnmatch  # For pattern matching
import tempfile
import subprocess
import gnupg

# --- SFTP connection settings ---
# Host and port are fixed; the username and password are read from the
# "nip-scope-dev" secret scope.
sftp_host, sftp_port = "sftp.nipgroup.com", 22
sftp_username = dbutils.secrets.get(scope="nip-scope-dev", key="nip-sftp-dev-username")
sftp_password = dbutils.secrets.get(scope="nip-scope-dev", key="nip-sftp-dev-password")

# --- Locations ---
remote_directory = "/incoming/"    # Directory listed on the SFTP server
mnt_sftp_path = "/dbfs/mnt/sftp/"  # DBFS mount folder that receives the downloads

# Number of bytes read per download chunk (1 MiB)
CHUNK_SIZE = 1 << 20

# Data-set keywords substituted into the file-name template
patterns = [
    "CLAIM",
    "EXPOSURE",
    "ORGANIZATION",
    "POLICY",
    "PREMIUM",
]

def previous_month_end(today):
    """Return the last day of the month preceding *today*.

    Args:
        today: A ``datetime.date`` or ``datetime.datetime``.

    Returns:
        A value of the same type as *today*, moved to the final day of the
        previous month (any time-of-day fields are left unchanged).
    """
    # Stepping back one day from the 1st always lands on the previous month's
    # last day, and rolls over to December of the prior year in January.
    return today.replace(day=1) - datetime.timedelta(days=1)


# Reporting period: the data pushed covers the previous month, so every value
# below (month, year, last day) is derived from the previous month's last day.
# Subtracting 1 from the month number directly breaks in January (month 0).
current_date = datetime.datetime.now()
period_end = previous_month_end(current_date)
current_month = period_end.month
month_name = calendar.month_name[current_month]
current_year = period_end.year
last_day = period_end.day
# YYYYMMDD of the period's last day; used as the date part of the file names
date_suffix = f"{current_year:04d}{current_month:02d}{last_day:02d}"

# Connect to SFTP, select the files for the reporting period and the chosen
# book, and download them (smallest first) into the mounted SFTP folder.
#
# Outputs:
#   sftp_files_path - local paths of the files that were fully downloaded;
#                     read by the decryption cell.
#
# The handles are initialised up front so that cleanup never depends on names
# left over from an earlier run of this cell.
transport = None
sftp = None
sftp_files_path = []  # Store the downloaded encrypted file paths for later use
exit_message = None   # Set when the notebook must stop after cleanup

try:
    # The file-name template depends only on the book parameter, so it is
    # resolved once rather than for every listed file.
    if book == "jif":
        file_pattern = "I2I_JIF_{pattern}_NIP_{date_suffix}*"
    elif book == "program":
        file_pattern = "I2I_Programs_{pattern}_NIP_{date_suffix}*"
    else:
        file_pattern = "I2I_{pattern}_NIP_{date_suffix}*"
    wanted_globs = [
        file_pattern.format(pattern=pattern, date_suffix=date_suffix)
        for pattern in patterns
    ]

    # Create an SFTP client
    transport = paramiko.Transport((sftp_host, sftp_port))
    transport.connect(username=sftp_username, password=sftp_password)
    sftp = paramiko.SFTPClient.from_transport(transport)
    transport.set_keepalive(30)  # Keep connection alive every 30 seconds
    print("Successfully connected to SFTP server.\n")

    # List the remote directory and keep the names matching any wanted glob
    files = sftp.listdir(remote_directory)
    filtered_files = [
        file
        for file in files
        if any(fnmatch.fnmatch(file, wanted) for wanted in wanted_globs)
    ]

    if not filtered_files:
        exit_message = "No files found for the current month and year."
    else:
        # Pair each file with its remote size and sort ascending by size
        files_to_download = [
            (file, sftp.stat(os.path.join(remote_directory, file)).st_size)
            for file in filtered_files
        ]
        files_to_download.sort(key=lambda x: x[1])

        for file, _size in files_to_download:
            sftp_path = os.path.join(mnt_sftp_path, file)  # Blob container path, mounted
            remote_file_path = os.path.join(remote_directory, file)  # Remote file path

            # Stream the file in CHUNK_SIZE pieces to a local file with the
            # same name as the remote file
            with sftp.open(remote_file_path, "rb") as remote_file, open(sftp_path, "wb") as downloadable_file:
                while True:
                    data = remote_file.read(CHUNK_SIZE)
                    if not data:
                        break
                    downloadable_file.write(data)

            # Record the path only once the download has completed, so a
            # partial file is never handed to the decryption step
            sftp_files_path.append(sftp_path)
            print(f"File {file} downloaded successfully.")

except Exception as e:
    exit_message = f"Error connecting to SFTP: {e}"
finally:
    # Single cleanup point: runs on success and on failure alike
    if sftp is not None:
        sftp.close()
    if transport is not None:
        transport.close()

# The exit call is issued after cleanup and outside the try statement, so it
# is never subject to the except handler above.
if exit_message is not None:
    dbutils.notebook.exit(exit_message)

In [0]:
# Import the PGP private key into a throw-away GnuPG home and decrypt every
# downloaded file from the SFTP mount into the SAS mount.

# Path to encrypted files and decryption output
mnt_pgp_path = "/dbfs/mnt/pgp/"
mnt_sas_path = "/dbfs/mnt/sas/"

with tempfile.TemporaryDirectory() as gnupghome:
    # Point GnuPG (both python-gnupg and the gpg subprocess below) at the
    # temporary keyring. The previous value is restored afterwards so the
    # variable never keeps referring to the deleted directory.
    previous_gnupghome = os.environ.get("GNUPGHOME")
    os.environ["GNUPGHOME"] = gnupghome
    try:
        gpg = gnupg.GPG()

        # Locate the private key file; if several match, the last one listed wins
        pgps = os.listdir(mnt_pgp_path)
        gpg_key_path = None
        for pgp_file in pgps:
            if pgp_file.endswith("SECRET.asc"):
                gpg_key_path = os.path.join(mnt_pgp_path, pgp_file)

        if gpg_key_path is None:
            raise FileNotFoundError(
                f"No private key file ending in 'SECRET.asc' found in {mnt_pgp_path}."
            )

        # Read and import the GPG key
        with open(gpg_key_path, "r") as key_file:
            key_data = key_file.read()
        import_result = gpg.import_keys(key_data)
        if not import_result.fingerprints:
            # Nothing was imported, so there is no fingerprint to trust
            raise Exception("No secret key found. Ensure the correct PGP private key is imported.")
        keyid = import_result.fingerprints[0]
        gpg.trust_keys(keyid, "TRUST_ULTIMATE")

        # List imported keys
        imported_keys = gpg.list_keys(secret=True)
        if not imported_keys:
            raise Exception("No secret key found. Ensure the correct PGP private key is imported.")
        print("Secret key successfully imported.")

        # 'sftp_files_path' holds the encrypted file paths stored by the download cell
        for sftp_file in sftp_files_path:
            # Same file name under the SAS mount, with the encryption extension removed
            sas_file = sftp_file.replace(mnt_sftp_path, mnt_sas_path).replace(".gpg", "").replace(".pgp", "")

            gpg_command = [
                'gpg',
                '--batch',  # Never wait for interactive input
                '--yes',    # Overwrite an existing output file, so re-runs succeed
                '--output', sas_file,
                '--decrypt',
                sftp_file
            ]

            try:
                subprocess.run(gpg_command, check=True, capture_output=True, text=True)
                print(f"Decryption successful for file: {sftp_file}.")
                print(f"Decrypted file is in: {sas_file}.\n")
            except subprocess.CalledProcessError as e:
                # Best effort: report the failure with gpg's own diagnostics
                # and carry on with the remaining files
                print(f"Decryption failed for file: {sftp_file}.")
                print(f"gpg exited with code {e.returncode}: {e.stderr}")
    finally:
        if previous_gnupghome is None:
            os.environ.pop("GNUPGHOME", None)
        else:
            os.environ["GNUPGHOME"] = previous_gnupghome