In [5]:
# General Imports

import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import os,sys,datetime,time,math, warnings,itertools

import pandas as pd

# Snowpark imports
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.window import Window

# Sklearn and preprocessing imports
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

# Visualization
from sklearn.manifold import TSNE
from plotly import express as px
from seaborn import lmplot

In [6]:
# Connection Parameters
connection_parameters = {
    'account':'shsitdl.west-europe.azure',
    'user':'jan-lucas.deinhard@siemens-healthineers.com',
    'authenticator':'externalbrowser',
    'role':'FR_CRMCLOUD_DEV',
    'database':'ACCESSLAYER',
    'schema':'AC_CRMCLOUD_APP_PRICINGANALYTICS_P',
    'warehouse':'W_CRMCLOUD_SMALL_P'
}

# Establish Connection
session = Session.builder.configs(connection_parameters).create()

cC = session.table("\"In Vivo Quote Pricing Overview Systems (PCI)\"")

cC_sampled = cC.sample(n=10000)
df = pd.DataFrame(cC_sampled.collect())

session.close()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/5dbf1add-202a-4b8d-815b-bf0fb024e033/saml2/?SAMLRequest=nZJfb9owFMW%2FSuQ9J3b%2BsDELqGhpBxLbEIRW4s1JbsDDsTPbIWWffk4AqXtoH%2FZm2efe3%2FE9d3T3WgnvBNpwJccoDAjyQOaq4HI%2FRtv0yR8iz1gmCyaUhDE6g0F3k5FhlajptLEHuYbfDRjruUbS0O5hjBotqWKGGypZBYbanG6m35c0CghlxoC2DoeuJYXhjnWwtqYYt20btHGg9B5HhBBMvmKn6iSf0BtE%2FTGj1sqqXIlbyav70zuIEJOkQziFI6yuhfdcXkbwESW7iAydp%2BnKX%2F3cpMib3n73oKRpKtAb0Ceew3a9vBgwzsFmvlmks2XQurn50GhVQ8D%2BNBoCI1VbCnaEXFV1Y133wJ1wCQUWas%2FdzBazMaqPvBhY8nKUhD3uQPD4FFWH3Ys%2BPer1oByQ3fo83JL5Nvn2fP9rmyPv%2BZZw1CW8MKaBhexyte6KRLEfRn74OY0iGic0SYLh8MsOeTPnj0tm%2B8qb%2Bd5HUPFcK6NKq6TgEnqXgyIrQ1YUvuvI%2FCQbFv4wHGR%2BVpIyI1ECJI5xl16E0WWFaO9ET%2F53MCP8tst1K3%2B4oBazlRI8P3tPSlfMvp9jGIT9DS%2F8spdSqBgX06LQYIzLUwjVPmhg1i2

In [7]:
df['Funnel Value (LC)'] = df['Funnel Value (LC)'].astype(np.double)

In [8]:
coldict = {}

for ccol in df.columns.to_list():
    coldict[ccol]=df[ccol].dtype

In [9]:
encoding_cols = []
standardscaler_cols = []

for ccol in coldict.keys():
    if coldict[ccol]=='object' or coldict[ccol]=='datetime64[ns]':
        encoding_cols.append(ccol)
    else:
        standardscaler_cols.append(ccol)

In [10]:
encoding_cols_value_counts = {}
for_label_encoding = []

for ccol in encoding_cols:
    encoding_cols_value_counts[ccol] = df[ccol].nunique()
    if df[ccol].nunique()<50:
        for_label_encoding.append(ccol)

In [11]:
ef = df.copy()

In [12]:
for ccol in for_label_encoding:
    enc = LabelEncoder()
    ef[ccol] = enc.fit_transform(ef[ccol].values)

In [13]:
for ccol in standardscaler_cols:
    enc = StandardScaler()
    ef[ccol] = enc.fit_transform(ef[ccol].values.reshape(-1,1))

stringcols_highcard = [k for k in list(coldict.keys()) if not ((k in for_label_encoding) or (k in standardscaler_cols))]

In [14]:
corpus = []

for ccol in stringcols_highcard:
    corpus += list(df[ccol].values)
corpus = list(set(corpus))

tokenized_corpus = []

for word in corpus:
    tokenized_corpus.append(str(word).lower().split(' '))

In [15]:
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=5,
    window=5,
    min_count=1,
    workers=4
)

word_vectors = model.wv

In [16]:
def tryencode(x):
    try:
        return word_vectors[str(x).lower().split(' ')][0] 
    except: return np.array([0,0,0,0,0])

In [17]:
for ccol in stringcols_highcard:
    X = np.vstack(ef[ccol].apply(lambda x: tryencode(x)).values)
    ef = ef.assign(**pd.DataFrame(X).add_prefix(ccol+'_')).drop(columns=[ccol])

In [18]:
coldict2 = {}

for ccol in ef.columns.to_list():
    coldict2[ccol]=ef[ccol].dtype

In [19]:
D = ef.fillna(-1).values

tsne = TSNE(n_components=2,random_state=42)
E = tsne.fit_transform(D)

In [20]:
px.scatter(x=E[:,0].reshape(-1),y=E[:,1].reshape(-1))

In [23]:
from ripser import ripser

In [1]:
homology_result = ripser(E)

NameError: name 'ripser' is not defined