In [1]:
# Script to generate a 1024-dimensional embedding filled with random numbers, used as a comparison baseline for the bachelor thesis
# author: leonard Tiling


In [2]:
import pykeen
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import math

In [3]:
# 1. check the value range of the generated numbers
# 2. generate numbers from a uniform distribution or from a normal distribution (Normalverteilung)
# 3. load the connected nodes from the triples factory or from the connected-nodes list
# 4. generate a dictionary that assigns 1024 random values to each node, save as Embedding_dict_random.feather
# save the dict and start the evaluation with Embedding_dict_random.feather

In [3]:
# Try out the two candidate samplers.
# A single draw from the uniform distribution between -1 and 1 (every value equally likely)
print(np.random.uniform(-1, 1))
# Mean and standard deviation of the Gaussian used below
mu, sigma = 0, 1
# Ten draws from that normal distribution
s = np.random.normal(loc=mu, scale=sigma, size=10)
print(s)
# One uniformly distributed value per embedding dimension
rlist = []
for _ in range(1024):
    rlist.append(np.random.uniform(-1, 1))
print(len(rlist))

-0.800907937832261
[ 1.76592599  0.51838503  1.63390117  0.05110471 -1.29494637 -0.61260559
 -0.15761044 -0.03107318  0.88165734 -0.89677404]
1024


In [10]:
# Read the list of connected nodes; later cells attach one random embedding to each row
nodes_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/metadata/connected_nodes_list.feather'
nodes = pd.read_feather(nodes_path)
print(nodes)

                   nodes
0          OMOP_21600001
1          OMOP_21600002
2          OMOP_21600003
3          OMOP_21600004
4          OMOP_21600005
...                  ...
700375  OT_R-HSA-6803207
700376   OT_R-HSA-416993
700377   OT_R-HSA-168181
700378  OT_R-HSA-3642279
700379  OT_R-HSA-9694676

[700380 rows x 1 columns]


In [None]:
# Uniform random baseline: one vector of 1024 values in [-1, 1) per node.
# All values are drawn with a single vectorised call. Calling np.random.uniform
# once per value (len(nodes) * 1024 calls) and collecting the results in nested
# Python lists is far slower and needs several times the memory.
uniform_matrix = np.random.uniform(-1, 1, size=(len(nodes), 1024))
# list(...) splits the matrix into one row array per node, so every DataFrame
# cell holds a complete embedding (same layout as the other random baselines).
random_dict = pd.DataFrame(dict(nodes=nodes['nodes'], embeddings=list(uniform_matrix)))
print(random_dict)

In [None]:
# Build the Gaussian random embedding table:
# - every node gets 1024 values sampled from the normal distribution (mu, sigma)
# - each vector is cast from float64 down to float32
# - node names and vectors are combined into a two-column DataFrame

mu, sigma = 0, 1  # mean and standard deviation
s = np.random.normal(loc=mu, scale=sigma, size=10)  # small sample from the same distribution
ll = []
for _ in range(len(nodes['nodes'])):
    sample = np.random.normal(loc=mu, scale=sigma, size=1024)
    ll.append(sample.astype(dtype='float32', casting='same_kind'))

df_dict = pd.DataFrame({'nodes': nodes['nodes'], 'embeddings': ll})
df_dict.head()
#df_dict.to_feather('/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/embeddings_leonard/Embedding_dict_random_normal.feather')

[-0.17043212  0.71511939  0.1730594   0.84695051 -0.6626444   0.89136413
  1.21229807  0.49837665  0.53491023  1.07542677] <class 'numpy.float64'>
[-0.17043212  0.7151194   0.1730594   0.84695053 -0.6626444   0.89136416
  1.212298    0.49837664  0.5349102   1.0754268 ] <class 'numpy.float32'>


In [5]:
# Load the stored RotatE and ConvE embedding tables to inspect the range of their values
rotate_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/embeddings_leonard/Embedding_dict_RotatE.feather'
conve_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/embeddings_leonard/Embedding_dict_ConvE.feather'

df_rotatE = pd.read_feather(rotate_path)
print(df_rotatE.head())

df_ConvE = pd.read_feather(conve_path)
df_ConvE.head()

          nodes                                         embeddings
0  OMOP_1000560  [0.14839451, -0.14838076, -0.121275395, 0.1185...
1  OMOP_1000577  [-0.20172451, -0.20994017, 0.15138729, 0.07870...
2  OMOP_1000579  [0.026997318, -0.25478494, 0.17429298, -0.1326...
3  OMOP_1000599  [0.07367588, -0.09364811, -0.0073877256, 0.136...
4  OMOP_1000600  [-0.18172097, -0.2223067, -0.03936057, -0.1386...


Unnamed: 0,nodes,embeddings
0,OMOP_1000560,"[0.19152974, -0.012642919, 0.0087412875, -0.11..."
1,OMOP_1000577,"[-0.058635775, -0.10710192, 0.040898025, 0.009..."
2,OMOP_1000579,"[-0.16844621, -0.13485488, -0.10257045, 0.0068..."
3,OMOP_1000599,"[-0.15834698, 0.06615062, 0.06974961, -0.07723..."
4,OMOP_1000600,"[-0.17353916, 0.057508957, -0.1200872, -0.0007..."


In [6]:
# Inspect the RotatE dataframe: shape, column dtypes, emptiness and total number of cells
print(df_rotatE.shape, ' dtype:', df_rotatE.dtypes, ' empty?:', df_rotatE.empty, ' size:', df_rotatE.size, '\n')
df_rotatE.info()

print('###############################################')
# Pull the embeddings column out as a NumPy array with one entry per node
big_np = np.asarray(df_rotatE['embeddings'])
# Standard deviation and mean of the first embedding vector
first_embedding = big_np[0]
print(np.std(first_embedding), np.mean(first_embedding))

(700380, 2)  dtype: nodes         object
embeddings    object
dtype: object  empty?: False  size: 1400760 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700380 entries, 0 to 700379
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   nodes       700380 non-null  object
 1   embeddings  700380 non-null  object
dtypes: object(2)
memory usage: 10.7+ MB
###############################################
0.13381132 -0.0070063


In [None]:
# Plot value histograms of the first 100 RotatE embeddings in a 10x10 grid
x = np.random.normal(size = 1000)  # not used by the plots below; kept in case other cells rely on it

# define subplots
fig, ax = plt.subplots(10, 10, figsize=(25,20))
# create subplots: ax.flat walks the grid row by row, so panel k shows embedding k.
# Indexing the embeddings with i+j (row + column) would repeat the same embedding
# along every anti-diagonal and show only 19 distinct embeddings in 100 panels.
for k, panel in enumerate(ax.flat):
    panel.hist(big_np[k], bins=200)

# Compute the layout only after all panels are drawn so their tick labels are taken into account
fig.tight_layout()
plt.suptitle('Frequency Histogram of embedding values', y=1.05, size=16)
#plt.savefig('Normalverteilung RotatE.png', dpi=400)

In [7]:
# Per-embedding statistics: mean and standard deviation of every node's vector
print(np.std(big_np[0]), np.mean(big_np[0]))
means = [np.mean(embedding) for embedding in big_np]
stds = [np.std(embedding) for embedding in big_np]
print(len(stds), len(means))

0.13381132 -0.0070063
700380 700380


In [8]:
# Summarise the per-embedding statistics: container type, element type, first mean,
# and the mean and standard deviation of the per-embedding standard deviations.
# The histogram plots of means and stds below are currently disabled.
print(type(means), type(means[0]), means[0], np.mean(stds), np.std(stds))
#plt.hist(means, bins=200)
#plt.show()
#plt.hist([np.random.normal(np.mean(stds), np.std(stds), len(means))], bins=200)
#plt.show()

def potenzsum(stds):
    """Return the sum of the squares of the given values.

    Every value is converted to a Python float before squaring and the total
    is accumulated with math.fsum, so the result does not lose precision when
    the inputs are NumPy float32 scalars.

    Args:
        stds: iterable of numbers, e.g. per-embedding standard deviations.

    Returns:
        The sum of the squared values as a float (0.0 for empty input).
    """
    return math.fsum(float(value) ** 2 for value in stds)


# Average standard deviation for samples of equal size:
# https://www.statology.org/averaging-standard-deviations/
def average_sdt(stds):
    """Return the square root of the mean of the squared standard deviations.

    Args:
        stds: sized collection of standard deviations.

    Returns:
        sqrt(sum(s**2 for s in stds) / len(stds)) as a float.

    Raises:
        ZeroDivisionError: if stds is empty.
    """
    return math.sqrt(potenzsum(stds) / len(stds))

# Combine the per-embedding standard deviations into one value;
# it is used as the scale of the random baseline generated further down.
avg_std = average_sdt(stds)
print(avg_std)

<class 'list'> <class 'numpy.float32'> -0.0070063 0.14246032 0.006859155
0.14262535238676427


In [13]:
# Random baseline shaped like the RotatE embeddings: for every node, 1024 values
# sampled from a normal distribution with mean 0 and standard deviation avg_std,
# cast to float32, then written to a feather file
random_rot = []
for _ in range(len(nodes['nodes'])):
    draw = np.random.normal(loc=0, scale=avg_std, size=1024)
    random_rot.append(draw.astype(dtype='float32', casting='same_kind'))
df_rotate = pd.DataFrame({'nodes': nodes['nodes'], 'embeddings': random_rot})
out_path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/220208_graphembeddings/embeddings_leonard/Embedding_dict_random_rotate4.feather'
df_rotate.to_feather(out_path)
df_rotate.head()

Unnamed: 0,nodes,embeddings
0,OMOP_21600001,"[0.059952356, 0.33628994, 0.15441476, 0.014831..."
1,OMOP_21600002,"[-0.022689564, -0.112952635, -0.14282987, 0.01..."
2,OMOP_21600003,"[-0.079462305, -0.24657337, 0.02037129, -0.131..."
3,OMOP_21600004,"[0.13588604, -0.027177274, 0.23015128, -0.0614..."
4,OMOP_21600005,"[0.036219828, -0.08621131, 0.15677688, -0.0843..."
