In [25]:
# General Imports

import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import os,sys,datetime,time,math, warnings,itertools

import pandas as pd
from contextlib import contextmanager

# Snowpark imports
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.window import Window

In [62]:
def read_single_lineage_extract(excel_path):
    """
    Read one Informatica Lineage Excel extract and keep its SF_CRM_DataMarts assets.

    The workbook is loaded with pandas, reduced to the rows whose 'Resource Name'
    equals 'SF_CRM_DataMarts', and trimmed to the lineage columns used downstream.
    A 'DP Table' column is added that identifies the extract the rows came from.

    Args:
        excel_path (str): The file path to the Informatica Lineage Excel extract.

    Returns:
        pandas.DataFrame: One row per matching asset with the columns
            - 'Asset Name'
            - 'Business_Terms'
            - 'Path'
            - 'DP Table' (``excel_path`` without a trailing '.xls' / '.xlsx' extension)
        Columns among the first three that are missing in the workbook are silently
        omitted, because ``DataFrame.filter`` ignores unknown labels.

    Raises:
        KeyError: If the workbook has no 'Resource Name' column.

    Note:
        The workbook is read with ``skiprows=2, header=1``.
        NOTE(review): an earlier version of this docstring said the third row is the
        header; with these arguments pandas presumably takes the header from the row
        after that - confirm against a real extract.

    Example:
        excel_path = 'path/to/lineage_extract.xlsx'
        lineage_df = read_single_lineage_extract(excel_path)
    """
    # Expects file structure as defined by Informatica Lineage output
    raw = pd.read_excel(excel_path, skiprows=2, header=1)

    # Postprocessing defined for this specific file type
    is_martlayer_asset = raw['Resource Name'] == 'SF_CRM_DataMarts'
    lineage = raw[is_martlayer_asset].filter(
        items=['Asset Name', 'Business_Terms', 'Path'],
        axis=1,
    )

    # Strip only a trailing Excel extension. Chaining str.replace('.xls', '') and
    # str.replace('.xlsx', '') left a stray 'x' behind for '.xlsx' files, because
    # the first replacement already consumed the '.xls' prefix of the extension.
    stem, extension = os.path.splitext(excel_path)
    dp_table = stem if extension.lower() in ('.xls', '.xlsx') else excel_path

    # assign() returns a new frame, so the filtered selection is never mutated
    return lineage.assign(**{'DP Table': dp_table})

In [63]:
@contextmanager
def cwd(path):
    """
    Run the body of a ``with`` block inside another working directory.

    On entry the process-wide current working directory is switched to ``path``.
    On exit - whether the block finishes normally or raises - the directory that
    was current before entry is restored.

    Args:
        path (str): Directory to switch to for the duration of the block.

    Yields:
        None

    Example:
        with cwd('/home/user/documents'):
            file_list = os.listdir()  # lists '/home/user/documents'
        # the previous working directory is active again here
    """
    previous_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Executed on normal exit and when the block raises
        os.chdir(previous_dir)

In [130]:
# Path settings
# The directories are relative and written with forward slashes, so they resolve
# on every platform. They are defined unconditionally: guarding them behind a
# Windows-only check left the names undefined elsewhere and made the following
# cells fail with a NameError.
# All paths are relative to the working directory the notebook is started from.

# Folder holding one Informatica Lineage extract per table of the data product
excel_input_directory = '../../TF_Data/Dropbox/PhD Prep/DQ Framework - Clustering/Data Products/Informatica Lineage Status Funnel Management 11.08.2023/'
# Folder the resulting workbook is written to
excel_output_directory = '../../TF_Data/Dropbox/PhD Prep/DQ Framework - Clustering/Data Products/'
# Folder containing the 'DQ Flags in CDC.xlsx' workbook
shsdq_directory = '../../TF_Data/Dropbox/PhD Prep/SHS DQ/'

In [119]:
# Read Excel files to construct columns in data product
with cwd(excel_input_directory):
    # Only Excel workbooks are read; Excel lock files ("~$...") are skipped.
    # Sorting makes the row order reproducible, since os.listdir() returns
    # entries in arbitrary order.
    dp_members = sorted(
        name for name in os.listdir()
        if name.lower().endswith(('.xls', '.xlsx')) and not name.startswith('~$')
    )
    if not dp_members:
        raise FileNotFoundError(
            'No Excel lineage extracts found in ' + repr(os.getcwd())
        )
    # Iterate tables in Data Product; concatenate once instead of growing the
    # frame inside the loop (which re-copies all previous rows on every pass)
    df = pd.concat(
        [read_single_lineage_extract(dp_member) for dp_member in dp_members],
        axis=0,
        ignore_index=True,
    )

# Post-processing
# 'Path' is expected to look like '<resource>/<catalog>/<schema>/<table>/', i.e.
# exactly five '/'-separated segments of which the first and last are discarded.
path_parts = df['Path'].str.split('/', expand=True)
if path_parts.shape[1] != 5:
    raise ValueError(
        "Expected 'Path' values with 5 '/'-separated segments, got "
        + str(path_parts.shape[1])
    )
df = df.drop(columns=['Path'])
df[['TABLE_CATALOG','TABLE_SCHEMA','TABLE_NAME']] = path_parts[[1, 2, 3]]
# Fully qualified, double-quoted identifier: "CATALOG"."SCHEMA"."TABLE".
# Built with vectorized string concatenation rather than a row-wise apply with
# positional x[0] access, which pandas deprecates for label-indexed Series.
df['TABLE_PATH'] = (
    '\"' + df['TABLE_CATALOG'] + '\".\"' + df['TABLE_SCHEMA'] + '\".\"' + df['TABLE_NAME'] + '\"'
)

In [120]:
# Determine centrality of each Martlayer table
# Number of distinct data product tables across all lineage extracts
total_dp_tables = df['DP Table'].nunique()

# Per Martlayer table: how many distinct data product tables reference it
ddf = (
    df.groupby(['TABLE_PATH'], as_index=False)['DP Table']
    .nunique()
    .rename(columns={'DP Table': 'DP Tables using TABLE'})
    .assign(**{'Total DP Tables': total_dp_tables})
)

# Calculate Centrality Factor
# Share of data product tables that use the Martlayer table. Computed as a
# column-wise division; the previous row-wise apply indexed the row Series
# positionally (x[0] / x[1]), which pandas deprecates for label-indexed Series.
ddf['Centrality Factor'] = ddf['DP Tables using TABLE'] / ddf['Total DP Tables']

In [122]:
# Connection Parameters
# NOTE(review): account, user, role and warehouse are hard-coded here; consider
# moving them to a configuration cell or environment variables before sharing.
connection_parameters = {
    'account':'shsitdl.west-europe.azure',
    'user':'jan-lucas.deinhard@siemens-healthineers.com',
    'authenticator':'externalbrowser',
    'role':'FR_CRMCLOUD_DEV',
    'database':'MARTLAYER',
    'schema':'INFORMATION_SCHEMA',
    'warehouse':'W_CRMCLOUD_P'
}

# Establish Connection
session = Session.builder.configs(connection_parameters).create()
try:
    # "COLUMNS" is resolved against the database/schema set in connection_parameters
    cC = session.table("COLUMNS")
    # Pull the complete result to the client as a pandas DataFrame
    C = pd.DataFrame(cC.collect())
finally:
    # Always release the session, also when the query or the download fails
    session.close()

# Merge all columns into the DP-relevant tables
# Left join: every lineage row is kept and repeated once per column of its table;
# tables without a match in COLUMNS keep a single row with a missing COLUMN_NAME.
dq = df.merge(
    C[['TABLE_CATALOG','TABLE_SCHEMA','TABLE_NAME','COLUMN_NAME']].drop_duplicates(),
    on=['TABLE_CATALOG','TABLE_SCHEMA','TABLE_NAME'],
    how='left'
)

In [126]:
# Load DQ Flags file
with cwd(shsdq_directory):
    dqf = pd.read_excel('DQ Flags in CDC.xlsx')

# Filter the columns to the DQ Flags
# Inner join against the distinct flag names: only rows whose COLUMN_NAME
# appears in the 'Name' column of the DQ Flags workbook are kept.
dq = pd.merge(
    dq,
    dqf[['Name']].rename(columns={'Name': 'COLUMN_NAME'}).drop_duplicates(),
    how='inner',
    on='COLUMN_NAME',
)

In [133]:
# Write result to Excel file
# One workbook with three sheets, created inside the output directory.
with cwd(excel_output_directory):
    with pd.ExcelWriter('Funnel Management Data Product.xlsx') as writer:
        # Lineage rows: one per Martlayer asset and data product table
        df.to_excel(
            writer,
            index=False,
            sheet_name='Data Assets per Data Product',
        )
        # Centrality factor per Martlayer table
        ddf.to_excel(
            writer,
            index=False,
            sheet_name='Relevance per Data Asset',
        )
        # Lineage rows expanded to the DQ flag columns of each table
        dq.to_excel(
            writer,
            index=False,
            sheet_name='DQ Flags per Data Product',
        )