# Snowflake Data Exploration

The goal of this notebook is to support the design of a research hypothesis suitable for the CRM Data Cloud.

In [2]:
import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import warnings,datetime,time,math,itertools,os,sys

import torch
import torch.nn as nn
from torch.nn import functional as F

import pandas as pd
import plotly.express as px

from sklearn.preprocessing import LabelEncoder

import networkx as nx

import snowflake.connector

  warn_incompatible_dep(


In [3]:
conn = snowflake.connector.connect(
    user='jan-lucas.deinhard@siemens-healthineers.com',
    account='shsitdl.west-europe.azure',
    authenticator='externalbrowser'
)

In [5]:
metadataQuery = '''
SELECT *
FROM "{0}"."INFORMATION_SCHEMA"."COLUMNS" 
'''

In [6]:
ZONES = [
    'CRMCL_CHECKIN',
    'CRMCL_SCREENING',
    'CRMCL_BOARDING',
    'CRMCL_TOPICAREA'
]

df = pd.DataFrame()

for item in ZONES:
    tf = pd.read_sql(
        sql=metadataQuery.format(item),
        con=conn
    )
    df = pd.concat([df,tf])

In [7]:
df.to_excel('C:\\Users\\z003mxpm\\Desktop\\extract\\metadata.xlsx')

In [8]:
np.sort(df[df['TABLE_SCHEMA'].apply(lambda x: 'TEST' not in x)]['TABLE_SCHEMA'].unique())

array(['ADOBE_ANALYTICS', 'AUXFILES', 'BLISS', 'CCT', 'CDCADMIN', 'CNJ',
       'CPQ', 'DOMO_RAW', 'ELOQUA', 'GAMA', 'GCR', 'INFORMATION_SCHEMA',
       'MARKETING', 'MCMN', 'ONELIBRARY', 'ONE_LIBRARY', 'OPAL', 'OSC',
       'PLAYGROUND', 'PRM', 'RLS', 'SDTB', 'SHARE', 'VARIAN'],
      dtype=object)

In [9]:
SCHEMAS = [
    'ADOBE_ANALYTICS', 
    'AUXFILES', 
    'BLISS', 
    #'CCT', 
    #'CDCADMIN', 
    #'CNJ',
    'CPQ', 
    'DOMO_RAW', 
    'ELOQUA', 
    #'GAMA', 
    'GCR', 
    #'INFORMATION_SCHEMA',
    #'MARKETING', 
    'MCMN', 
    'ONELIBRARY', 
    'ONE_LIBRARY', 
    #'OPAL', 
    'OSC',
    #'PLAYGROUND', 
    #'PRM', 
    #'RLS', 
    'SDTB', 
    'SHARE', 
    'VARIAN'
]

In [10]:
TABLES = [
    'ACTIVE_QUOTES_HEADER', 
    'IN_VITRO_Price_Reagents',
    'IN_VIVO_Quote_Details', 
    'New_Eloqua_Contact',
    'Account to Account Relationships', 
    'Contacts',
    'Accounts',
    'Opportunities',
    'SDTB_IBD_Complete',
    'SDTB_IBD_Notification', 
    'IN_VITRO_Contract_KPIs',
    'IN_VITRO_Diagnostic_Parameters',
    'IN_VIVO_QuoteItem_Costs', 
    'Order_Details',
    'Account_Customer_Operating_Budget',
    'Installed Base', 
    'Leads',
    'CS Installed Base (Merged with OSC)',
    'VarianAccounts', 
    'Projects',
    'VarianAssets', 
    'VarianOpportunities',
    'Regional_Hierarchy', 
    'CS_Quote_Details', 
    'Revenue_New_Orders',
    'OLIs',
    'Products', 
    'ADOBE_WEB_ANALYTICS', 
    'ALL_QUOTES_HEADER',
    'IN_VITRO_Systems_per_Enduser', 
    'IN_VITRO_Instrument_Automation_to_Systems',
    'IN_VITRO_Price_Consumables', 
    'IN_VITRO_Additional_Items',
    'Eloqua_LinkedIn_Activity',
    'OneLibrary_Activities',
    'Contact to Account Relationships','Partners',
    'SDTB_IBD',
    'IN_VITRO_GPO_Reagent_Test_Vol_Tier_Setup',
    'IN_VITRO_Price_Service', 
    'Account_All_Addresses',
    'Contacts_US', 
    'IN_VITRO_Price_Equipment_Software', 
    'IN_VITRO_Contract', 
    'ELOQUA_OSC_MATCHING',
    'Account_Share_Of_Wallet',
    'Eloqua_Activities',
    'Account_All_Phones',
    'Manual Adjustments',
    'Assets',
    'Account_Address_Geolocations',
    'IN_VITRO_Reagents_per_Enduser',
    'Organizational_Mapping',
    'Account_Contact_Relationships',
    'Employees',
    'MVP_Consent_Forms',
    'Orders',
    'AccountsStructure',
    'IN_VITRO_SurchargeFactor',
    'IN_VITRO_Consumables_per_Enduser',
    'IN_VITRO_Atellica_Solution_Configurations',
    'Eloqua_OSC_MVPspecific_ManualMatches',
    'Account_ShipToUsage', 'Account_All_EMails',
    'Account to Contact Relationships',
    'Contact_Marketing_Attributes',
    'VARIAN_SHS_MATCHING',
    'TerritoryAccountRelationship',
    'Account_All_URLs',
    'Accounts_ExcelDefinedAccounts',
    'IN_VITRO_AdditionalItemsFactor',
    'Contact to Contact Relationships',
    'Varian_Account_Address_Geolocations', 
    'Account_Marketing_Attributes', 
    'Account_InternationalLogic',
    'Finance_FX_Rates'
]

In [11]:
FULLPATH_LIST = ['\"'+k+'\"' for k in df[df['TABLE_CATALOG'].isin(['CRMCL_TOPICAREA'])&df['TABLE_SCHEMA'].isin(SCHEMAS)&df['TABLE_NAME'].isin(TABLES)][['TABLE_CATALOG','TABLE_SCHEMA','TABLE_NAME']].agg('"."'.join,axis=1).tolist()]

In [12]:
sqlQuery = '''
    SELECT * FROM {0} SAMPLE ROW (1000 rows)
'''

In [13]:
len(FULLPATH_LIST)

3426

In [14]:
ctr = 1

for item in FULLPATH_LIST:
    if ctr%100==0: print('Processed {0}/{1} items, current item -> {2}'.format(ctr,len(FULLPATH_LIST),item))
    ctr += 1
    try:
        tf = pd.read_sql(
            sql=sqlQuery.format(item),
            con=conn
        )
        tf.to_excel('C:\\Users\\z003mxpm\\Desktop\\extract\\'+item.replace('"."','_').replace('"','')+'.xlsx')
    except:
        with open('C:\\Users\\z003mxpm\\Desktop\\extract\\skipped.txt','a') as f:
            f.write(item)

Processed 100/3426 items, current item -> "CRMCL_TOPICAREA"."OSC"."Assets"
Processed 200/3426 items, current item -> "CRMCL_TOPICAREA"."CPQ"."IN_VIVO_Quote_Details"
Processed 300/3426 items, current item -> "CRMCL_TOPICAREA"."CPQ"."IN_VITRO_Price_Consumables"
Processed 400/3426 items, current item -> "CRMCL_TOPICAREA"."OSC"."Accounts"
Processed 500/3426 items, current item -> "CRMCL_TOPICAREA"."CPQ"."Orders"
Processed 600/3426 items, current item -> "CRMCL_TOPICAREA"."OSC"."Accounts"
Processed 700/3426 items, current item -> "CRMCL_TOPICAREA"."OSC"."Account_All_EMails"
Processed 800/3426 items, current item -> "CRMCL_TOPICAREA"."CPQ"."IN_VITRO_Additional_Items"
Processed 900/3426 items, current item -> "CRMCL_TOPICAREA"."CPQ"."CS_Quote_Details"
Processed 1000/3426 items, current item -> "CRMCL_TOPICAREA"."CPQ"."IN_VITRO_Price_Service"
Processed 1100/3426 items, current item -> "CRMCL_TOPICAREA"."OSC"."Contacts"
Processed 1200/3426 items, current item -> "CRMCL_TOPICAREA"."OSC"."Account