# Snowflake Data Exploration

The goal of this notebook is to support the design of a research hypothesis suitable for the CRM Data Cloud.

In [65]:
import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import warnings,datetime,time,math,itertools,os,sys

import torch
import torch.nn as nn
from torch.nn import functional as F

import pandas as pd
import plotly as pl

from sklearn.preprocessing import LabelEncoder

import networkx as nx

import snowflake.connector

In [2]:
# Extend the module search path so the project-local helper package can be
# imported. The path is relative, so it is resolved against the notebook's
# working directory, and it must be appended before the import below runs.
# NOTE(review): presumably `WorldSimulators` lives in 7_HELPERFUNCTIONS — confirm.
sys.path.append('./../7_HELPERFUNCTIONS/')
from WorldSimulators.CRMDatabase import CRMDB

## Fetch DB information

Open a connection to Snowflake and fetch the column metadata of the CRM Data Cloud.

In [8]:
# Open a Snowflake session using browser-based single sign-on.
# User and account are read from the environment (SNOWFLAKE_USER /
# SNOWFLAKE_ACCOUNT) so the notebook is not tied to one person's identity;
# the previously hard-coded values remain the defaults, so behaviour is
# unchanged when the variables are not set.
conn = snowflake.connector.connect(
    user=os.environ.get('SNOWFLAKE_USER', 'jan-lucas.deinhard@siemens-healthineers.com'),
    account=os.environ.get('SNOWFLAKE_ACCOUNT', 'shsitdl.west-europe.azure'),
    authenticator='externalbrowser'
)

In [9]:
# Pull the column metadata of the CRM Data Cloud into a DataFrame.
metadata_query = "SHOW COLUMNS"
info = pd.read_sql(metadata_query, con=conn)

In [10]:
# Close the Snowflake connection; the metadata was already read into `info`.
conn.close()

## Instantiate the CRM database from its class

Instantiate the CRM database class and populate it. Then dig into its structure.

In [76]:
# Instantiate the project-local CRM database wrapper. Later cells read
# `db.frames` (a mapping from frame name to DataFrame) and `db.metadata`
# (a table with at least the columns 'COLUMN' and 'ISKEY').
# NOTE(review): how CRMDB() populates itself is not visible here — confirm
# the data source before relying on the contents.
db = CRMDB()

## Summarize data into characteristic vector

Create a summary vector `v` for every DB row from all non-key columns; text (object-typed) columns are label-encoded to integers first. Then examine data frames based only on their keys and characteristic vectors.

In [77]:
# Build the characteristic vector 'v' for every row of every frame.
#
# For each frame, every non-key column of dtype object is label-encoded on a
# working copy, and the row-wise array of all non-key values is stored as a
# new column 'v' on the ORIGINAL frame inside `db.frames` (in-place side
# effect that later cells rely on). The original column values are untouched.

# The set of key columns does not depend on the frame, so resolve it once
# instead of re-filtering the metadata for every column of every frame.
key_columns = set(db.metadata.loc[db.metadata['ISKEY'] == 'Y', 'COLUMN'])

for cframe in db.frames.keys():
    # Select current frame: `tdf_init` is the stored frame, `tdf` a scratch copy
    tdf_init = db.frames[cframe]
    tdf = tdf_init.copy()
    # Non-key columns of the current frame. A previously generated 'v' column
    # is skipped so that re-running this cell is idempotent; otherwise the old
    # vectors would be treated as an object-typed feature and break encoding.
    feature_columns = [k for k in tdf.columns.tolist() if k not in key_columns and k != 'v']
    # Label-encode every object-typed feature column on the scratch copy
    for ccol in tdf[feature_columns].select_dtypes(include='object').columns:
        lenc = LabelEncoder()
        tdf[ccol] = lenc.fit_transform(tdf[ccol])
    # Store full vector as new frame entry
    tdf_init['v'] = tdf[feature_columns].apply(lambda x: x.values, axis=1)

In [79]:
# Columns to keep for the Accounts frame: its key columns (in frame order)
# followed by the characteristic vector column.
key_names = db.metadata.loc[db.metadata['ISKEY'] == 'Y', 'COLUMN'].tolist()
accounts_columns = db.frames['Accounts'].columns.tolist()
L = [name for name in accounts_columns if name in key_names] + ['v']

In [81]:
# Restrict the Accounts frame to the columns selected in `L`.
accounts_frame = db.frames['Accounts']
df = accounts_frame.loc[:, L]

In [82]:
# Display the selected Accounts columns (keys plus characteristic vector `v`).
df

Unnamed: 0,ACCOUNT_ID,ACCOUNT_ID (MainAccount),ACCOUNT_ID (Parent),v
0,100000249062544,100000249062371,100000249062371,"[789, 5, 1197, 5, 906, 4, 0, 0, 1, 1265, 0, 15..."
1,100001109138434,920010018478466,100001109135014,"[901, 4, 703, 4, 108, 4, 0, 0, 9, 539, 0, 912,..."
2,100000416312069,910010321446211,910010321446211,"[554, 6, 604, 1, 903, 4, 0, 0, 8, 1356, 0, 399..."
3,100000416655710,910010895254685,910010895254685,"[330, 6, 742, 9, 216, 3, 0, 0, 9, 1708, 0, 176..."
4,100000416479488,910011374025656,910011374025656,"[355, 6, 123, 6, 542, 4, 0, 0, 9, 1805, 0, 563..."
...,...,...,...,...
1822,100000415397891,910010895254685,910010895254685,"[87, 6, 1130, 9, 188, 4, 0, 0, 9, 1630, 0, 78,..."
1823,100000415398800,910010321446211,910010321446211,"[495, 6, 663, 1, 232, 4, 0, 0, 9, 1621, 0, 117..."
1824,100001109135433,920010018478466,100001109127996,"[897, 4, 577, 4, 770, 4, 0, 0, 3, 369, 0, 812,..."
1825,100000416163326,910009252171563,910009252171563,"[141, 6, 810, 7, 670, 3, 0, 0, 3, 1350, 0, 342..."
