# Snowflake Data Exploration

The goal of this notebook is to support the design of a research hypothesis suitable for the CRM Data Cloud.

In [2]:
import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import warnings,datetime,time,math,itertools,os,sys

import torch
import torch.nn as nn
from torch.nn import functional as F

import pandas as pd
import plotly as pl

import networkx as nx
import seaborn as sns

import snowflake.connector

  warn_incompatible_dep(


## Connect to Snowflake

Build connection framework for pulling data from Snowflake.

In [3]:
conn = snowflake.connector.connect(
    user='jan-lucas.deinhard@siemens-healthineers.com',
    account='shsitdl.west-europe.azure',
    authenticator='externalbrowser'
)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [4]:
# CRM Data Cloud Metadata

info = pd.read_sql("SHOW COLUMNS",con=conn)

### Let's select a subset of CKA accounts of interest:

In [5]:
# Construct query
accID_query_ = '''
SELECT DISTINCT "ACCOUNT_ID" 
FROM CRMCL_TOPICAREA.OSC."Accounts" 
WHERE "ACCOUNT_ID (MainAccount)" IN (
SELECT DISTINCT TOP 5 "ACCOUNT_ID (MainAccount)" 
FROM CRMCL_TOPICAREA.OSC."Accounts" WHERE "CKAP (Flag)" =\'Y\'
)
'''

# Run query
acc_subset_ = '\''+'\',\''.join([str(k) for k in pd.read_sql(accID_query_,con=conn)['ACCOUNT_ID'].tolist()])+'\''

### Select other entities
* Further accounts info
* Opptys
* OLIs
* Leads
* Vitro Quotes
* Vivo Quotes
* Services Quotes
* Assets IB
* SDTB IB
* GCR Revenue Data
* Orders

In [6]:
acc_sql = '''
SELECT
"ACCOUNT_ID",
"ACCOUNT_ID (MainAccount)",
"ACCOUNT_ID (Parent)",
"Account Address",
"Account Country",
"Account Name",
"Account Name (MainAccount)",
"Account Postal Code",
"Blocking Status",
"CKAP (Flag)",
"Is Partner (Flag)",
"Legal Structure",
"OSC Account Number",
"RKAP (Flag)",
"SAP-ID",
"SAP-ID In Vitro"
FROM CRMCL_TOPICAREA.OSC."Accounts" 
WHERE "ACCOUNT_ID" IN (
{0}
)
'''.format(acc_subset_)
acc_frame = pd.read_sql(acc_sql,con=conn)

In [7]:
oppty_sql = '''
SELECT
"ACCOUNT_ID",
"Calculated Expected Turnover Date (earliest)",
"Calculated Expected Turnover Date (latest)",
"Calculated Revenue Start Date (earliest)",
"Calculated Revenue Start Date (latest)",
"Calculated Shipment Date (earliest)",
"Calculated Shipment Date (latest)",
"Direct/Partner Business",
"Indirect Business Type",
"OLI.Close Date (earliest)",
"OLI.Close Date (latest)",
"OPPORTUNITY_ID",
"OPPORTUNITY_ID (Alternate Opportunity)",
"OPPORTUNITY_ID (Linked Opportunity)",
"Opportunity Expired (Flag)",
"Oppty Name",
"Oppty Number",
"Oppty Type",
"Oppty.Close Date",
"Oppty.Created Timestamp",
"Oppty.Forecast Status (Consolidated)",
"Oppty.IT/Automation Included (Flag)",
"Oppty.Lease End Date",
"Oppty.Project Progress",
"Oppty.Quote Number",
"Oppty.Status (Consolidated)",
"Part of ES Project (Flag)",
"Sale Type (Oppty Level)",
"Sales Program (Oppty Level)"
FROM CRMCL_TOPICAREA.OSC."Opportunities" 
WHERE "ACCOUNT_ID" IN (
{0}
)
'''.format(acc_subset_)
oppty_frame = pd.read_sql(oppty_sql,con=conn)

In [8]:
oli_sql = '''
SELECT
"ACCOUNT_ID",
"ACCOUNT_ID (End User)",
"Alternate (Flag)",
"Calculated Expected Turnover Date",
"Calculated Revenue Start Date",
"Calculated Shipment Date",
"Capital Funnel Value (GC)",
"Contract Duration (Months)",
"Contract Effective Funnel Value (GC)",
"DQ Issue (Comment)",
"DQ Issue (Flag)",
"Decision Date",
"Expected Delivery Date",
"Expected Installation Date",
"Expected Shipment Date",
"Expected Turnover Date",
"Expired (Flag)",
"FV Units",
"Finance Type",
"Funnel Status",
"Funnel Value (GC)",
"Funnel Value (LC)",
"Ghost (Flag)",
"Include in Forecast (Flag)",
"Incumbent Vendor",
"Lease End Date",
"Lost w FC (Flag)",
"Monthly Recurring Funnel Value (GC)",
"Multiple Quotes attached (Flag)",
"OLI.Business Area",
"OLI.Business Line",
"OLI.Close Date",
"OLI.Created Timestamp",
"OLI.Quantity",
"OLI.Status",
"OLI.Status (Grouped)",
"OLI_ID",
"OPPORTUNITY_ID",
"Prior Year Revenue Run Rate (GC)",
"Product Number",
"Recurring Expected (Flag)",
"Recurring Revenue Funnel (GC)",
"Sale Type (Grouped)",
"Sales Channel",
"Sales Program",
"Siemens Probability"
FROM CRMCL_TOPICAREA.OSC."OLIs" 
WHERE "ACCOUNT_ID" IN (
{0}
)
'''.format(acc_subset_)
oli_frame = pd.read_sql(oli_sql,con=conn)

In [9]:
leads_sql = '''
SELECT
"ACCOUNT_ID",
"ASSET_ID",
"Is Partner (Flag)",
"LEAD_ID",
"Last Automatic Assignment Date",
"Lead Age in Days",
"Lead Origin",
"Lead.Close Date",
"Lead.Contact Email",
"Lead.Contact First Name",
"Lead.Contact Job Role",
"Lead.Contact Job Title",
"Lead.Contact Last Name",
"Lead.Contact Postal Code",
"Lead.Country",
"Lead.Created Timestamp",
"Lead.FL Number",
"Lead.IB Vendor",
"Lead.Last Updated Timestamp",
"Lead.Opportunity Type",
"Lead.Sale Type",
"Lead.Sales Product Line",
"Lead.Status",
"PRODUCT_ID"
FROM CRMCL_TOPICAREA.OSC."Leads" 
WHERE "ACCOUNT_ID" IN (
{0}
)
'''.format(acc_subset_)
leads_frame = pd.read_sql(leads_sql,con=conn)

In [11]:
qvitro_sql = '''
SELECT
header."ACCOUNT_ID",
header."CPQ.Last Updated Timestamp",
header."CPR (Flag)",
header."DSO (Quote)",
header."DXCON.Approval Level",
header."DXCON.CPR Deal (Flag)",
header."DXCON.Configuration Approved (Flag)",
header."DXCON.Deal Billing Option",
header."DXCON.Deal Finance Option",
header."DXCON.Deal Finance Type",
header."DXCON.Deal Locked (Flag)",
header."DXCON.Last Saved Timestamp",
header."DXCON.Manual Discount (Flag)",
header."DXCON.Pricing Approved (Flag)",
header."DXCON.Sales Contract Duration (in months)",
header."Data Issue Comment",
header."Data Quality Issue (Flag)",
header."Has Automation (Flag)",
header."Has Instrument (Flag)",
header."Implausible Cost (Flag)",
header."Implausible Revenue (Flag)",
header."OPPORTUNITY_ID",
header."Opportunity Expired (Flag)",
header."Possible Test Quote (Flag)",
header."Quote Active (Flag)",
header."Quote Number and Revision",
header."Quote Status",
header."Quote Type",
header."Quote.Business Type",
header."Quote.Created Timestamp",
header."Quote.ISO",
header."Tender Type",
SUM(kpis."Contract Total Product Cost (GC)") AS "Total Cost (GC)",
SUM(kpis."Contract Total Revenue (GC)") AS "Total Revenue (GC)"
FROM CRMCL_TOPICAREA.CPQ.ACTIVE_QUOTES_HEADER header
LEFT JOIN CRMCL_TOPICAREA.CPQ."IN_VITRO_Contract_KPIs" kpis ON header."Quote Number and Revision"=kpis."Quote Number and Revision" 
WHERE header."Quote Type" = 'IN_VITRO'
AND header."ACCOUNT_ID" IN (
{0}
)
GROUP BY 
header."ACCOUNT_ID",
header."CPQ.Last Updated Timestamp",
header."CPR (Flag)",
header."DSO (Quote)",
header."DXCON.Approval Level",
header."DXCON.CPR Deal (Flag)",
header."DXCON.Configuration Approved (Flag)",
header."DXCON.Deal Billing Option",
header."DXCON.Deal Finance Option",
header."DXCON.Deal Finance Type",
header."DXCON.Deal Locked (Flag)",
header."DXCON.Last Saved Timestamp",
header."DXCON.Manual Discount (Flag)",
header."DXCON.Pricing Approved (Flag)",
header."DXCON.Sales Contract Duration (in months)",
header."Data Issue Comment",
header."Data Quality Issue (Flag)",
header."Has Automation (Flag)",
header."Has Instrument (Flag)",
header."Implausible Cost (Flag)",
header."Implausible Revenue (Flag)",
header."OPPORTUNITY_ID",
header."Opportunity Expired (Flag)",
header."Possible Test Quote (Flag)",
header."Quote Active (Flag)",
header."Quote Number and Revision",
header."Quote Status",
header."Quote Type",
header."Quote.Business Type",
header."Quote.Created Timestamp",
header."Quote.ISO",
header."Tender Type"
'''.format(acc_subset_)
qvitro_frame = pd.read_sql(qvitro_sql,con=conn)

In [12]:
qvivo_sql = '''
SELECT
header."ACCOUNT_ID",
header."CPQ.Last Updated Timestamp",
header."CPR (Flag)",
header."DSO (Quote)",
header."DXCON.Approval Level",
header."DXCON.CPR Deal (Flag)",
header."DXCON.Configuration Approved (Flag)",
header."DXCON.Deal Billing Option",
header."DXCON.Deal Finance Option",
header."DXCON.Deal Finance Type",
header."DXCON.Deal Locked (Flag)",
header."DXCON.Last Saved Timestamp",
header."DXCON.Manual Discount (Flag)",
header."DXCON.Pricing Approved (Flag)",
header."DXCON.Sales Contract Duration (in months)",
header."Data Issue Comment",
header."Data Quality Issue (Flag)",
header."Has Automation (Flag)",
header."Has Instrument (Flag)",
header."Implausible Cost (Flag)",
header."Implausible Revenue (Flag)",
header."OPPORTUNITY_ID",
header."Opportunity Expired (Flag)",
header."Possible Test Quote (Flag)",
header."Quote Active (Flag)",
header."Quote Number and Revision",
header."Quote Status",
header."Quote Type",
header."Quote.Business Type",
header."Quote.Created Timestamp",
header."Quote.ISO",
header."Tender Type",
SUM(costs."Amount (GC)"*costs."Quantity") AS "Total Cost (GC)",
SUM(rev."Quote Ext. Net Price (GC)"*rev."Quantity") AS "Total Revenue (GC)"
FROM CRMCL_TOPICAREA.CPQ.ACTIVE_QUOTES_HEADER header
LEFT JOIN CRMCL_TOPICAREA.CPQ."IN_VIVO_Quote_Details" rev ON header."Quote Number and Revision"=rev."Quote Number and Revision" 
LEFT JOIN CRMCL_TOPICAREA.CPQ."IN_VIVO_QuoteItem_Costs" costs ON header."Quote Number and Revision" = costs."Quote Number and Revision" 
WHERE header."Quote Type" = 'IN_VIVO'
AND header."ACCOUNT_ID" IN (
{0}
)
GROUP BY 
header."ACCOUNT_ID",
header."CPQ.Last Updated Timestamp",
header."CPR (Flag)",
header."DSO (Quote)",
header."DXCON.Approval Level",
header."DXCON.CPR Deal (Flag)",
header."DXCON.Configuration Approved (Flag)",
header."DXCON.Deal Billing Option",
header."DXCON.Deal Finance Option",
header."DXCON.Deal Finance Type",
header."DXCON.Deal Locked (Flag)",
header."DXCON.Last Saved Timestamp",
header."DXCON.Manual Discount (Flag)",
header."DXCON.Pricing Approved (Flag)",
header."DXCON.Sales Contract Duration (in months)",
header."Data Issue Comment",
header."Data Quality Issue (Flag)",
header."Has Automation (Flag)",
header."Has Instrument (Flag)",
header."Implausible Cost (Flag)",
header."Implausible Revenue (Flag)",
header."OPPORTUNITY_ID",
header."Opportunity Expired (Flag)",
header."Possible Test Quote (Flag)",
header."Quote Active (Flag)",
header."Quote Number and Revision",
header."Quote Status",
header."Quote Type",
header."Quote.Business Type",
header."Quote.Created Timestamp",
header."Quote.ISO",
header."Tender Type"
'''.format(acc_subset_)
qvivo_frame = pd.read_sql(qvivo_sql,con=conn)

In [26]:
qservice_sql = '''
SELECT
header."ACCOUNT_ID",
header."CPQ.Last Updated Timestamp",
header."CPR (Flag)",
header."DSO (Quote)",
header."DXCON.Approval Level",
header."DXCON.CPR Deal (Flag)",
header."DXCON.Configuration Approved (Flag)",
header."DXCON.Deal Billing Option",
header."DXCON.Deal Finance Option",
header."DXCON.Deal Finance Type",
header."DXCON.Deal Locked (Flag)",
header."DXCON.Last Saved Timestamp",
header."DXCON.Manual Discount (Flag)",
header."DXCON.Pricing Approved (Flag)",
header."DXCON.Sales Contract Duration (in months)",
header."Data Issue Comment",
header."Data Quality Issue (Flag)",
header."Has Automation (Flag)",
header."Has Instrument (Flag)",
header."Implausible Cost (Flag)",
header."Implausible Revenue (Flag)",
header."OPPORTUNITY_ID",
header."Opportunity Expired (Flag)",
header."Possible Test Quote (Flag)",
header."Quote Active (Flag)",
header."Quote Number and Revision",
header."Quote Status",
header."Quote Type",
header."Quote.Business Type",
header."Quote.Created Timestamp",
header."Quote.ISO",
header."Tender Type",
0 AS "Total Cost (GC)",
0 AS "Total Revenue (GC)"
FROM CRMCL_TOPICAREA.CPQ.ACTIVE_QUOTES_HEADER header
LEFT JOIN CRMCL_TOPICAREA.CPQ."IN_VIVO_Quote_Details" rev ON header."Quote Number and Revision"=rev."Quote Number and Revision" 
LEFT JOIN CRMCL_TOPICAREA.CPQ."IN_VIVO_QuoteItem_Costs" costs ON header."Quote Number and Revision" = costs."Quote Number and Revision" 
WHERE header."Quote Type" = 'SERVICES'
AND header."ACCOUNT_ID" IN (
{0}
)
'''.format(acc_subset_)
qservice_sql = pd.read_sql(qservice_sql,con=conn)

In [17]:
ib_asset_sql = '''
SELECT DISTINCT
"ACCOUNT_ID",
"ACCOUNT_ID (End User)",
"ASSET_ID",
"Asset Age (days)",
"Asset Age (months)",
"Asset Age (years)",
"Asset Equipment Eos Date",
"Asset FL Number",
"Asset Install Year",
"Asset Last Service Activity Date",
"Asset Material Number",
"Asset Service Contract End Date",
"Asset Service Contract Start Date",
"Asset Ship-to Account (SAP)",
"Asset Shipped Date",
"Asset System EoS Date",
"Asset Waranty End Date",
"Asset.Business Area",
"Asset.Business Line",
"Asset.Contract (Flag)",
"Asset.Created Timestamp",
"Asset.Last Updated Timestamp",
"Asset.Sales Product Line",
"Asset.Status",
"Competitor Asset (Flag)",
"EoS next 2 years & no Oppty (Flag)",
"EoS planned & no Oppty or no Strategy (Flag)",
"EoS reached & no Oppty (Flag)",
"Equipment Number",
"Evolve Capable (Flag)",
"IB Vendor Name",
"IBR - Expected Replacement Date",
"MTDA (Flag)",
"OLI_ID (Created by)",
"PRODUCT_ID"
FROM CRMCL_TOPICAREA.OSC."Assets" 
WHERE "ACCOUNT_ID" IN (
{0}
)
'''.format(acc_subset_)
ib_asset_frame = pd.read_sql(ib_asset_sql,con=conn)

In [19]:
ib_sdtb_sql = '''
SELECT 
"ACCOUNT_ID",
"Asset FL Number",
YEAR("Asset Installation Date") AS "Asset Install Year",
"Asset Material Number",
"Asset Service Contract End Date",
"Asset Service Contract Start Date",
"Asset System EoS Date",
"Asset Waranty End Date",
"Last Updated Date" AS "Asset.Last Updated Timestamp",
"Equipment Status" AS "Asset.Status",
"Equipment Number",
"IB Product" AS "PRODUCT_ID"
FROM CRMCL_TOPICAREA.SDTB.SDTB_IBD 
WHERE "Deleted Asset (Flag)" <> 'Y' 
AND "Demo (Flag)" <> 'Y'
AND "ACCOUNT_ID" IN (
{0}
)
'''.format(acc_subset_)
ib_sdtb_frame = pd.read_sql(ib_sdtb_sql,con=conn)

In [31]:
gcr_sql = '''
SELECT 
"ACCOUNT_ID",
"ACCOUNT_ID (Payer)",
"ACCOUNT_ID (ShipTo)",
"ACCOUNT_ID (SoldTo)",
"Business Area",
"Business Line",
"Business Type",
"Cross Div. Business",
"Diagnostic Parameter",
"Disease State",
"FY",
"FY Period",
"GCR Reporting Date",
"ISO",
"Product Line",
"Product/Global Material Number",
"Sales Document Number HQ",
"Sales Document Number RC",
"Ship-to ID (SAP)",
"Sold-to ID (SAP)",
SUM("New Orders Outside Healthcare (GC PY Comparable)") AS "New Orders Outside Healthcare (GC PY Comparable)",
SUM("New Orders Outside Healthcare (GC)") AS "New Orders Outside Healthcare (GC)",
SUM("Revenue Outside Healthcare (GC PY Comparable)") AS "Revenue Outside Healthcare (GC PY Comparable)",
SUM("Revenue Outside Healthcare (GC)") AS "Revenue Outside Healthcare (GC)"
FROM CRMCL_TOPICAREA.GCR."Revenue_New_Orders" 
WHERE "ACCOUNT_ID" IN (
{0}
)
GROUP BY 
"ACCOUNT_ID",
"ACCOUNT_ID (Payer)",
"ACCOUNT_ID (ShipTo)",
"ACCOUNT_ID (SoldTo)",
"Business Area",
"Business Line",
"Business Type",
"Cross Div. Business",
"Diagnostic Parameter",
"Disease State",
"FY",
"FY Period",
"GCR Reporting Date",
"ISO",
"Product Line",
"Product/Global Material Number",
"Sales Document Number HQ",
"Sales Document Number RC",
"Ship-to ID (SAP)",
"Sold-to ID (SAP)"
'''.format(acc_subset_)
gcr_frame = pd.read_sql(gcr_sql,con=conn)

In [70]:
orders_sql = '''
SELECT
qheader."ACCOUNT_ID",
header."Active (Flag)",
header."ISO",
header."Order Date",
header."Order Status",
header."Quote Number and Revision",
header."Sales Document Number RC",
header."Type",
CAST(det."ACCOUNT_ID (Bill-To)" AS CHAR(15)) AS "ACCOUNT_ID (Bill-To)",
CAST(det."ACCOUNT_ID (End-User)" AS CHAR(15)) AS "ACCOUNT_ID (End-User)",
CAST(det."ACCOUNT_ID (Pay-To)" AS CHAR(15)) AS "ACCOUNT_ID (Pay-To)",
CAST(det."ACCOUNT_ID (Ship-To)" AS CHAR(15)) AS "ACCOUNT_ID (Ship-To)",
CAST(det."ACCOUNT_ID (Sold-To)" AS CHAR(15)) AS "ACCOUNT_ID (Sold-To)",
det."ERP Order Number",
det."Order Number",
det."SAP Order Number",
det."Order Sale Type",
det."Quote Number and Revision",
SUM(det."Order Total Net Value") AS "Order Total Net Value",
SUM(det."Order Quantity") AS "Order Quantity"
FROM CRMCL_TOPICAREA.CPQ."Order_Details" det 
LEFT JOIN CRMCL_TOPICAREA.CPQ.ACTIVE_QUOTES_HEADER qheader ON det."Quote Number and Revision" = qheader."Quote Number and Revision" 
LEFT JOIN CRMCL_TOPICAREA.CPQ."Orders" header ON det."Quote Number and Revision" = header."Quote Number and Revision" 
WHERE qheader."ACCOUNT_ID" IN (
{0}
)
GROUP BY
qheader."ACCOUNT_ID",
header."Active (Flag)",
header."ISO",
header."Order Date",
header."Order Status",
header."Quote Number and Revision",
header."Sales Document Number RC",
header."Type",
"ACCOUNT_ID (Bill-To)",
"ACCOUNT_ID (End-User)",
"ACCOUNT_ID (Pay-To)",
"ACCOUNT_ID (Ship-To)",
"ACCOUNT_ID (Sold-To)",
det."ERP Order Number",
det."Order Number",
det."SAP Order Number",
det."Order Sale Type",
det."Quote Number and Revision"
'''.format(acc_subset_)
orders_frame = pd.read_sql(orders_sql,con=conn)

In [72]:
conn.close()

In [11]:
df = info[(info['database_name']=='CRMCL_TOPICAREA')&info['schema_name'].apply(lambda x: '_TEST' not in str(x))&info['table_name'].isin([
    'Accounts',
    'ACTIVE_QUOTES_HEADER',
    'Assets',
    'Leads',
    'OLIs',
    'Opportunities',
    'Order_Details',
    'Orders',
    'Revenue_New_Orders',
    'SDTB_IBD'
])][[
    'table_name', 
    'schema_name', 
    'column_name',
    'database_name'
]]

In [22]:
df[df['column_name'].isin([
    'ACCOUNT_ID',
    'OPPORTUNITY_ID',
    'OLI_ID',
    'Quote Number and Revision',
    'LEAD_ID'
])].sort_values(by=['column_name','table_name'])

Unnamed: 0,table_name,schema_name,column_name,database_name
685775,ACTIVE_QUOTES_HEADER,CPQ,ACCOUNT_ID,CRMCL_TOPICAREA
687904,Accounts,OSC,ACCOUNT_ID,CRMCL_TOPICAREA
688107,Assets,OSC,ACCOUNT_ID,CRMCL_TOPICAREA
688427,Leads,OSC,ACCOUNT_ID,CRMCL_TOPICAREA
688497,OLIs,OSC,ACCOUNT_ID,CRMCL_TOPICAREA
688795,Opportunities,OSC,ACCOUNT_ID,CRMCL_TOPICAREA
686948,Order_Details,CPQ,ACCOUNT_ID,CRMCL_TOPICAREA
686972,Orders,CPQ,ACCOUNT_ID,CRMCL_TOPICAREA
687370,Revenue_New_Orders,GCR,ACCOUNT_ID,CRMCL_TOPICAREA
692920,SDTB_IBD,SDTB,ACCOUNT_ID,CRMCL_TOPICAREA


In [20]:
df = info[info['database_name']=='CRMCL_TOPICAREA']

df['table_name'] = df[['schema_name','table_name']].apply(lambda x: x[0]+'.'+x[1],axis=1)
df = df[df['schema_name'].isin([
    'AUXFILES',
    'CNJ',
    'CPQ',
    'GCR',
    'OSC',
    'VARIAN'
])].drop(columns=['data_type','null?','kind','expression','comment','database_name','autoincrement','default','schema_name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['table_name'] = df[['schema_name','table_name']].apply(lambda x: x[0]+'.'+x[1],axis=1)


In [21]:
df

Unnamed: 0,table_name,column_name
627244,AUXFILES.Account_Address_Geolocations,ACCOUNT_ID
627245,AUXFILES.Account_Address_Geolocations,Account Address
627246,AUXFILES.Account_Address_Geolocations,Account City
627247,AUXFILES.Account_Address_Geolocations,Account.Address Country
627248,AUXFILES.Account_Address_Geolocations,Account.Address accuracy
...,...,...
635303,VARIAN.VarianTerritories,Sub-Region
635304,VARIAN.VarianTerritories,Super-Territory
635305,VARIAN.VarianTerritories,TERRITORY_ID
635306,VARIAN.VarianTerritories,Territory


In [22]:
df0 = df.groupby(['column_name'],as_index=False).agg({'table_name':'||'.join})

In [23]:
df0['num_tables_with_key'] = df0['table_name'].apply(lambda x: len(str(x).split('||')))

In [24]:
df0 = df0.sort_values(by='num_tables_with_key',ascending=False)

In [25]:
df0[df0['num_tables_with_key']>1]

Unnamed: 0,column_name,table_name,num_tables_with_key
1,ACCOUNT_ID,AUXFILES.Account_Address_Geolocations||AUXFILE...,54
1784,SECURITY_ID,CNJ.CNJ_MarketShare||CPQ.ACTIVE_QUOTES_HEADER|...,45
1672,Quote Number and Revision,CPQ.ACTIVE_QUOTES_HEADER||CPQ.CS_QUOTEITEM_COS...,22
1002,Group Currency,CPQ.CS_QUOTEITEM_COSTS||CPQ.CS_Quote_Details||...,20
354,Business Line,AUXFILES.Manual Adjustments||CPQ.IN_VITRO_Addi...,18
...,...,...,...
509,Consumable Name,CPQ.IN_VITRO_Consumables_per_Enduser||CPQ.IN_V...,2
241,Asset Age (days),OSC.Assets||VARIAN.VarianAssets,2
238,Assay Name,AUXFILES.IN_VITRO_AssayNameMap_by_PartNumber||...,2
252,Asset Install Year,OSC.Assets||VARIAN.VarianAssets,2


In [31]:
df1 = df[df['column_name'].isin(df0[df0['column_name'].str.contains('_ID')]['column_name'].unique())]

In [112]:
rf = pd.DataFrame()

for ccol in df0[df0['column_name'].str.contains('_ID')]['column_name'].unique():
    connected_cols = df1[df1['table_name'].isin(df1[df1['column_name']==ccol]['table_name'].unique())]['column_name'].unique()
    ff = pd.DataFrame(connected_cols).rename(columns={0:'From'})
    ff['To'] = ccol
    rf = pd.concat([rf,ff])

In [113]:
rf['FromNEW'] = rf.apply(lambda x: sorted([x[0],x[1]])[0],axis=1)
rf['ToNEW'] = rf.apply(lambda x: sorted([x[0],x[1]])[1],axis=1)

In [114]:
rf = rf.drop(columns=['From','To']).rename(columns={'FromNEW':'From','ToNEW':'To'}).drop_duplicates()

In [115]:
rf = rf[~rf.apply(lambda x: x[0]==x[1],axis=1)]

In [116]:
rf

Unnamed: 0,From,To
1,ACCOUNT_ID,VAR_ACCOUNT_ID
2,ACCOUNT_ID,OPPORTUNITY_ID
3,ACCOUNT_ID,PROJECT_ID
4,ACCOUNT_ID,SECURITY_ID
5,ACCOUNT_ID,OLI_ID
...,...,...
9,EMPLOYEE_ID (Account Primary Sales Rep POC),EMPLOYEE_ID (Account Secondary Sales Rep In-Vivo)
8,EMPLOYEE_ID (Account Primary Sales Rep Ultraso...,EMPLOYEE_ID (Account Secondary Sales Rep In-Vi...
9,EMPLOYEE_ID (Account Primary Sales Rep Ultraso...,EMPLOYEE_ID (Account Secondary Sales Rep In-Vivo)
9,EMPLOYEE_ID (Account Secondary Sales Rep In-Vi...,EMPLOYEE_ID (Account Secondary Sales Rep In-Vivo)


In [118]:
G = nx.Graph()

In [122]:
G.add_edges_from(rf[['From','To']].apply(lambda x: (x[0],x[1]),axis=1).tolist())

In [None]:
G.degree