# Snowflake CDC DQ Investigation

The goal of this notebook is to derive, at a very high level, simple KPIs that gauge data quality issues introduced inside the CRM Data Cloud.

In [1]:
import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import warnings,datetime,time,math,itertools,os,sys

import torch
import torch.nn as nn
from torch.nn import functional as F

import pandas as pd
import plotly.express as px

from sklearn.preprocessing import LabelEncoder

import networkx as nx

import snowflake.connector

  warn_incompatible_dep(


In [2]:
# Extend the module search path with the helper directory (relative to the
# notebook's working directory) before importing from it.
# NOTE(review): CRMDB is not referenced in the visible part of this notebook —
# confirm the import is still needed.
sys.path.append('./../7_HELPERFUNCTIONS/')
from WorldSimulators.CRMDatabase import CRMDB

## Connect to CDC and fetch Metadata

Build connection framework and fetch Metadata Information.

In [3]:
# Open a Snowflake session using browser-based SSO ('externalbrowser').
# Login name and account can be overridden through the SNOWFLAKE_USER /
# SNOWFLAKE_ACCOUNT environment variables so that other people can run the
# notebook without editing this cell; the previous hard-coded values remain
# the defaults.
conn = snowflake.connector.connect(
    user=os.environ.get('SNOWFLAKE_USER', 'jan-lucas.deinhard@siemens-healthineers.com'),
    account=os.environ.get('SNOWFLAKE_ACCOUNT', 'shsitdl.west-europe.azure'),
    authenticator='externalbrowser'
)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [4]:
# CRM Data Cloud Metadata
# The later cells rely on the database_name, schema_name, table_name and
# column_name fields of this result.
# NOTE(review): Snowflake may cap the number of rows a SHOW command returns —
# confirm the metadata is complete.
metadata_query = "SHOW COLUMNS"
info = pd.read_sql(sql=metadata_query, con=conn)

In [5]:
# Close the Snowflake session; the fetched metadata stays available in `info`.
conn.close()

# Metadata Preprocessing

The following information is added to the metadata:
* Flagging of columns that belong to test schemas
* Full Name Creation
* In-database Name Creation
* Database hierarchy mapping
* First Occurrence of table in Cloud

In [28]:
# 'Y' for every metadata row whose schema name contains '_TEST', otherwise 'N'.
info['Test Schema'] = ['Y' if '_TEST' in schema else 'N' for schema in info['schema_name']]

In [41]:
# Fully qualified table name: <database>.<schema>.<table>.
# Vectorized string concatenation replaces the row-wise apply(axis=1), which
# calls a Python lambda once per metadata row.
info['Full Tablename'] = info['database_name'] + '.' + info['schema_name'] + '.' + info['table_name']

In [81]:
# Table name without the database prefix: <schema>.<table>.
# Vectorized string concatenation replaces the row-wise apply(axis=1), which
# calls a Python lambda once per metadata row.
info['In-Database Name'] = info['schema_name'] + '.' + info['table_name']

In [82]:
# Databases listed in hierarchy order; the list position becomes the rank
# (ACCESSLAYER -> 0 ... MARTLAYER -> 5).
_hierarchy_order = [
    'ACCESSLAYER',
    'CRMCL_CHECKIN',
    'CRMCL_SCREENING',
    'CRMCL_BOARDING',
    'CRMCL_TOPICAREA',
    'MARTLAYER',
]
database_hierarchy = {name: rank for rank, name in enumerate(_hierarchy_order)}

# Databases that are not part of the hierarchy are marked with -1.
info['database_hierarchy'] = [database_hierarchy.get(name, -1) for name in info['database_name']]

In [None]:
# For every (schema, table) pair, find the database with the lowest hierarchy
# rank that contains it and store it as 'Table First Occurrence'.
#
# NOTE(review): databases missing from `database_hierarchy` carry rank -1 and
# therefore sort *before* ACCESSLAYER, i.e. an unmapped database wins the
# "first occurrence" whenever it holds the table — confirm this is intended.
_table_keys = ['schema_name', 'table_name']

_first_occurrence = (
    info[_table_keys + ['database_hierarchy', 'database_name']]
    .drop_duplicates()
    .sort_values(by=_table_keys + ['database_hierarchy'], ascending=True)
    .drop(columns=['database_hierarchy'])
    .groupby(_table_keys, as_index=False)
    .first()
    # Rename before merging so no *_x / *_y suffix juggling is needed.
    .rename(columns={'database_name': 'Table First Occurrence'})
)

# Drop the derived columns first so that re-running this cell replaces them
# instead of producing duplicate 'Table First Occurrence' columns.
info = (
    info
    .drop(columns=['Table First Occurrence', 'Full Tablename First Occurrence'], errors='ignore')
    .merge(_first_occurrence, on=_table_keys, how='left')
)

# Address the columns by label: integer keys on a label-indexed row Series
# (x[1], x[0]) are deprecated positional lookups in recent pandas versions.
info['Full Tablename First Occurrence'] = info['Table First Occurrence'] + '.' + info['In-Database Name']

# Analysis part

Define tables to be checked for comparison, compare metadata for this set and set thresholds.

In [209]:
# One row per non-test table of the topic-area database, together with the
# fully qualified name of its first occurrence.
_is_topic_area = info['database_name'] == 'CRMCL_TOPICAREA'
_is_not_test = info['Test Schema'] == 'N'
_topic_tables = info.loc[
    _is_topic_area & _is_not_test,
    ['Full Tablename', 'Full Tablename First Occurrence'],
].drop_duplicates()

# Number of distinct column names per fully qualified table; used for both sides
# of the comparison.
_column_counts = info[['Full Tablename', 'column_name']].groupby(['Full Tablename'], as_index=False).nunique()

df0 = (
    _topic_tables
    # Column count of the topic-area table itself.
    .merge(
        _column_counts.rename(columns={'column_name': 'TopicArea Column Count'}),
        on='Full Tablename',
        how='left',
    )
    # Column count of the table at its first occurrence.
    .merge(
        _column_counts.rename(columns={
            'Full Tablename': 'Full Tablename First Occurrence',
            'column_name': 'First Occurrence Column Count',
        }),
        on='Full Tablename First Occurrence',
        how='left',
    )
)

In [210]:
# Relative change in column count between the first occurrence and the topic area
# (0 = unchanged, 1 = twice as many columns).
# Computed column-wise by label: the former row-wise lambda used x[0] / x[1],
# i.e. integer keys on a label-indexed Series, which recent pandas versions
# deprecate as positional lookups.
df0['Column Count Delta'] = df0['TopicArea Column Count'] / df0['First Occurrence Column Count'] - 1

In [211]:
# Tables with the largest relative growth in column count first.
df0.sort_values('Column Count Delta', ascending=False)

Unnamed: 0,Full Tablename,Full Tablename First Occurrence,TopicArea Column Count,First Occurrence Column Count,Column Count Delta
42,CRMCL_TOPICAREA.AUXFILES.Shipment_Test,CRMCL_CHECKIN.AUXFILES.Shipment_Test,4,1,3.000000
154,CRMCL_TOPICAREA.OSC.Opportunities,CRMCL_CHECKIN.OSC.Opportunities,93,37,1.513514
62,CRMCL_TOPICAREA.CPQ.IN_VITRO_Additional_Items,CRMCL_CHECKIN.CPQ.IN_VITRO_Additional_Items,51,21,1.428571
157,CRMCL_TOPICAREA.OSC.Projects,CRMCL_CHECKIN.OSC.Projects,115,51,1.254902
152,CRMCL_TOPICAREA.OSC.OLIs,CRMCL_CHECKIN.OSC.OLIs,133,63,1.111111
...,...,...,...,...,...
19,CRMCL_TOPICAREA.AUXFILES.Manual Adjustments,CRMCL_CHECKIN.AUXFILES.Manual Adjustments,14,14,0.000000
18,CRMCL_TOPICAREA.AUXFILES.LC_Currencies,CRMCL_SCREENING.AUXFILES.LC_Currencies,6,6,0.000000
0,CRMCL_TOPICAREA.ADOBE_ANALYTICS.ADOBE_WEB_ANAL...,CRMCL_CHECKIN.ADOBE_ANALYTICS.ADOBE_WEB_ANALYTICS,7,7,0.000000
90,CRMCL_TOPICAREA.GCR.Revenue_New_Orders,CRMCL_CHECKIN.GCR.Revenue_New_Orders,49,50,-0.020000


In [212]:
# Display the comparison table in its stored order (the rendering is truncated by pandas).
df0

Unnamed: 0,Full Tablename,Full Tablename First Occurrence,TopicArea Column Count,First Occurrence Column Count,Column Count Delta
0,CRMCL_TOPICAREA.ADOBE_ANALYTICS.ADOBE_WEB_ANAL...,CRMCL_CHECKIN.ADOBE_ANALYTICS.ADOBE_WEB_ANALYTICS,7,7,0.000000
1,CRMCL_TOPICAREA.AUXFILES.Account_Address_Geolo...,CRMCL_CHECKIN.AUXFILES.Account_Address_Geoloca...,19,19,0.000000
2,CRMCL_TOPICAREA.AUXFILES.Account_DefaultPerCou...,CRMCL_CHECKIN.AUXFILES.Account_DefaultPerCountry,2,2,0.000000
3,CRMCL_TOPICAREA.AUXFILES.Account_FreseniusSett...,CRMCL_CHECKIN.AUXFILES.Account_FreseniusSettings,5,5,0.000000
4,CRMCL_TOPICAREA.AUXFILES.Account_IB Factors,CRMCL_CHECKIN.AUXFILES.Account_IB Factors,6,6,0.000000
...,...,...,...,...,...
248,CRMCL_TOPICAREA.VARIAN.VarianAccountTerritorie...,CRMCL_BOARDING.VARIAN.VarianAccountTerritoriesMap,5,4,0.250000
249,CRMCL_TOPICAREA.VARIAN.VarianAccounts,CRMCL_CHECKIN.VARIAN.VarianAccounts,37,34,0.088235
250,CRMCL_TOPICAREA.VARIAN.VarianAssets,CRMCL_CHECKIN.VARIAN.VarianAssets,39,31,0.258065
251,CRMCL_TOPICAREA.VARIAN.VarianOpportunities,CRMCL_CHECKIN.VARIAN.VarianOpportunities,81,67,0.208955


In [213]:
# Re-open the Snowflake session (browser-based SSO) for the row-count queries.
# Login name and account can be overridden through the SNOWFLAKE_USER /
# SNOWFLAKE_ACCOUNT environment variables; the previous hard-coded values
# remain the defaults.
conn = snowflake.connector.connect(
    user=os.environ.get('SNOWFLAKE_USER', 'jan-lucas.deinhard@siemens-healthineers.com'),
    account=os.environ.get('SNOWFLAKE_ACCOUNT', 'shsitdl.west-europe.azure'),
    authenticator='externalbrowser'
)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [241]:
size_query = '''
SELECT count(*) as ROWNUM FROM {0}
'''


def _quote_table_name(full_tablename):
    """Turn 'DATABASE.SCHEMA.TABLE' into a double-quoted identifier path.

    Only the first two dots are treated as separators, so dots inside the
    table name are kept, and embedded double quotes are escaped by doubling.
    """
    parts = full_tablename.split('.', 2)
    return '.'.join('"' + part.replace('"', '""') + '"' for part in parts)


def _count_rows(full_tablename):
    """Return the row count of one table, or NaN when it cannot be determined.

    Best effort on purpose: a failing query (or a missing table name) must not
    abort the whole scan. Only `Exception` is caught, so Ctrl+C still stops
    the loop, and the failure is reported through a warning instead of being
    silently swallowed.
    """
    try:
        query = size_query.format(_quote_table_name(full_tablename))
        return pd.read_sql(query, con=conn).iat[0, 0]
    except Exception as exc:
        warnings.warn('Row count failed for {0!r}: {1}'.format(full_tablename, exc))
        return np.nan


# One [topic-area count, first-occurrence count] pair per row of df0, in df0's
# row order. Each table is counted independently, so one failing query no
# longer discards the count of the other table in the pair.
L = [
    [_count_rows(topic_table), _count_rows(first_table)]
    for topic_table, first_table in zip(df0['Full Tablename'], df0['Full Tablename First Occurrence'])
]

In [252]:
# Close the Snowflake session; the collected row counts are held in `L`.
conn.close()

In [251]:
# Attach the row counts collected in `L` (one pair per df0 row) as columns 0 and 1.
# * index=df0.index ties each pair to its row by position; a plain
#   pd.DataFrame(L) would be aligned on 0..n-1 and silently misalign (or raise
#   nothing) if df0's index is not a fresh RangeIndex. A length mismatch now
#   raises instead of padding with NaN.
# * Previously attached count columns are dropped first so that re-running the
#   cell replaces them rather than adding duplicates.
_row_counts = pd.DataFrame(L, index=df0.index, columns=[0, 1])
df0 = pd.concat(
    [
        df0.drop(
            columns=[0, 1, 'TopicArea Row Count', 'First Occurrence Row Count'],
            errors='ignore',
        ),
        _row_counts,
    ],
    axis=1,
)

In [255]:
# Give the two positional count columns descriptive names.
_row_count_labels = dict(zip(
    (0, 1),
    ('TopicArea Row Count', 'First Occurrence Row Count'),
))
df0 = df0.rename(columns=_row_count_labels)

In [260]:
# Display the comparison table including the row counts (the rendering is truncated by pandas).
df0

Unnamed: 0,Full Tablename,Full Tablename First Occurrence,TopicArea Column Count,First Occurrence Column Count,Column Count Delta,TopicArea Row Count,First Occurrence Row Count
0,CRMCL_TOPICAREA.ADOBE_ANALYTICS.ADOBE_WEB_ANAL...,CRMCL_CHECKIN.ADOBE_ANALYTICS.ADOBE_WEB_ANALYTICS,7,7,0.000000,4274906.0,4289639.0
1,CRMCL_TOPICAREA.AUXFILES.Account_Address_Geolo...,CRMCL_CHECKIN.AUXFILES.Account_Address_Geoloca...,19,19,0.000000,419844.0,419844.0
2,CRMCL_TOPICAREA.AUXFILES.Account_DefaultPerCou...,CRMCL_CHECKIN.AUXFILES.Account_DefaultPerCountry,2,2,0.000000,286.0,286.0
3,CRMCL_TOPICAREA.AUXFILES.Account_FreseniusSett...,CRMCL_CHECKIN.AUXFILES.Account_FreseniusSettings,5,5,0.000000,206.0,206.0
4,CRMCL_TOPICAREA.AUXFILES.Account_IB Factors,CRMCL_CHECKIN.AUXFILES.Account_IB Factors,6,6,0.000000,647.0,647.0
...,...,...,...,...,...,...,...
248,CRMCL_TOPICAREA.VARIAN.VarianAccountTerritorie...,CRMCL_BOARDING.VARIAN.VarianAccountTerritoriesMap,5,4,0.250000,3270171.0,3270171.0
249,CRMCL_TOPICAREA.VARIAN.VarianAccounts,CRMCL_CHECKIN.VARIAN.VarianAccounts,37,34,0.088235,21365.0,21365.0
250,CRMCL_TOPICAREA.VARIAN.VarianAssets,CRMCL_CHECKIN.VARIAN.VarianAssets,39,31,0.258065,211422.0,211176.0
251,CRMCL_TOPICAREA.VARIAN.VarianOpportunities,CRMCL_CHECKIN.VARIAN.VarianOpportunities,81,67,0.208955,235231.0,235231.0


In [None]:
# Relative change in row count between the first occurrence and the topic area
# (0 = unchanged). A zero first-occurrence count yields +/-inf or NaN here.
# Computed column-wise by label: the former row-wise lambda used x[0] / x[1],
# i.e. integer keys on a label-indexed Series, which recent pandas versions
# deprecate as positional lookups.
df0['Row Count Delta'] = df0['TopicArea Row Count'] / df0['First Occurrence Row Count'] - 1

In [265]:
# Treat infinite ratios as missing.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df0['Row Count Delta'] is chained assignment, which operates on a
# temporary object and leaves df0 unchanged under pandas Copy-on-Write.
df0['Row Count Delta'] = df0['Row Count Delta'].replace([np.inf, -np.inf], np.nan)

In [268]:
# The 25 tables with the largest relative row-count change; rows without a delta are left out.
(
    df0
    .dropna(subset=['Row Count Delta'])
    .sort_values('Row Count Delta', ascending=False)
    .head(25)
)

Unnamed: 0,Full Tablename,Full Tablename First Occurrence,TopicArea Column Count,First Occurrence Column Count,Column Count Delta,TopicArea Row Count,First Occurrence Row Count,Row Count Delta
98,CRMCL_TOPICAREA.INFORMATION_SCHEMA.FILE_FORMATS,ANALYTICSLAYER.INFORMATION_SCHEMA.FILE_FORMATS,22,22,0.0,26.0,1.0,25.0
107,CRMCL_TOPICAREA.INFORMATION_SCHEMA.SCHEMATA,ANALYTICSLAYER.INFORMATION_SCHEMA.SCHEMATA,13,13,0.0,36.0,2.0,17.0
109,CRMCL_TOPICAREA.INFORMATION_SCHEMA.STAGES,ANALYTICSLAYER.INFORMATION_SCHEMA.STAGES,10,10,0.0,13.0,1.0,12.0
115,CRMCL_TOPICAREA.INFORMATION_SCHEMA.VIEWS,ANALYTICSLAYER.INFORMATION_SCHEMA.VIEWS,12,12,0.0,74.0,23.0,2.217391
127,CRMCL_TOPICAREA.OSC.Accounts,CRMCL_CHECKIN.OSC.Accounts,107,93,0.150538,589467.0,588225.0,0.002111
250,CRMCL_TOPICAREA.VARIAN.VarianAssets,CRMCL_CHECKIN.VARIAN.VarianAssets,39,31,0.258065,211422.0,211176.0,0.001165
136,CRMCL_TOPICAREA.OSC.Business_Plan_Objective,CRMCL_CHECKIN.OSC.Business_Plan_Objective,10,10,0.0,47082.0,47082.0,0.0
129,CRMCL_TOPICAREA.OSC.Activity_Assignees,CRMCL_CHECKIN.OSC.Activity_Assignees,10,10,0.0,2218625.0,2218625.0,0.0
130,CRMCL_TOPICAREA.OSC.Activity_Contacts,CRMCL_CHECKIN.OSC.Activity_Contacts,9,9,0.0,738909.0,738909.0,0.0
131,CRMCL_TOPICAREA.OSC.Activity_Objectives,CRMCL_CHECKIN.OSC.Activity_Objectives,8,8,0.0,71.0,71.0,0.0
