In [18]:
import os
import numpy as np
import pandas as pd
import glob
from sklearn.decomposition import PCA
from bokeh.models import HoverTool, CategoricalColorMapper,CustomJS
from bokeh.plotting import figure, output_file, show, ColumnDataSource, output_notebook, gridplot
from bokeh.models.glyphs import Circle
output_notebook()

In [36]:
def load_kg_ents(path, ent_links, source):
    """Read a tab-separated entity-id file into a list of plot rows.

    Each non-blank line of ``path`` is expected to hold at least two
    tab-separated fields: the entity name and its id.

    Args:
        path: Path of the entity-id file to read.
        ent_links: Mapping from an entity name to the name of its linked
            counterpart.
        source: Label stored in every produced row (which knowledge graph
            the file belongs to).

    Returns:
        A list of ``[name, id, source, pair]`` lists. ``id`` is kept as the
        string read from the file. Entities without an entry in ``ent_links``
        are only kept when their name starts with ``"http:"`` and does not
        contain ``"scads"``; they are then paired with themselves. All other
        unlinked entities are dropped.
    """
    rows = []
    with open(path) as in_file:
        for line in in_file:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no entity; skip them instead of
                # reporting them as malformed.
                continue
            tup = stripped.split("\t")
            if len(tup) < 2:
                # A non-blank line without a tab-separated id: report it and
                # move on so one bad line does not abort the whole load.
                print("This line is fishy?")
                print(line)
                continue
            name, ent_id = tup[0], tup[1]
            try:
                pair = ent_links[name]
            except KeyError:
                # little hack because we forgot to put types into ent_links
                if name.startswith("http:") and "scads" not in name:
                    # in our datasets type entities are the same in both
                    rows.append([name, ent_id, source, name])
                continue
            rows.append([name, ent_id, source, pair])
    return rows

In [3]:
def load_ent_links(ent_links_path):
    """Load entity links into a bidirectional lookup dictionary.

    Each non-blank line of the file is expected to hold two tab-separated
    entity names that are linked to each other.

    Args:
        ent_links_path: Path of the tab-separated link file.

    Returns:
        A dict that maps the first entity of every line to the second and
        the second back to the first. If an entity occurs on several lines,
        the last occurrence wins.

    Raises:
        IndexError: If a non-blank line has fewer than two tab-separated
            fields.
    """
    ent_links = dict()
    with open(ent_links_path) as in_file:
        for line_no, line in enumerate(in_file, start=1):
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing empty line) instead
                # of failing on the missing second field.
                continue
            tup = stripped.split("\t")
            if len(tup) < 2:
                # Same exception type as before, but say where the problem is.
                raise IndexError(
                    f"{ent_links_path}:{line_no}: expected two tab-separated "
                    f"entities, got {line!r}"
                )
            ent_links[tup[0]] = tup[1]
            ent_links[tup[1]] = tup[0]
    return ent_links

In [4]:
def create_gs(name, ent_links):
    """Look up the linked counterpart of every entity in ``name``.

    Args:
        name: Iterable of entity names.
        ent_links: Mapping from an entity name to its linked entity.

    Returns:
        A list with ``ent_links[n]`` for each ``n`` in ``name``, in order.

    Raises:
        KeyError: If an entity has no entry in ``ent_links``.
    """
    return [ent_links[entity] for entity in name]

In [27]:
def plot_embeddings(embed_base_path, ent_link_path, source_names=("DBpedia", "Wikidata"), random_state=0):
    """Show two linked Bokeh scatter plots of PCA-projected entity embeddings.

    Reads ``kg1_ent_ids``, ``kg2_ent_ids`` and ``ent_embeds.npy`` from
    ``embed_base_path``, projects the embeddings to two dimensions with PCA
    and draws one scatter plot per knowledge graph. Both plots share a single
    data source whose rows are linked entity pairs, so selecting a point in
    one plot highlights its counterpart in the other.

    Args:
        embed_base_path: Directory that contains the two entity-id files and
            the embedding matrix.
        ent_link_path: Path of the tab-separated entity link file.
        source_names: Two labels; the first is used for ``kg1_ent_ids`` and
            the left plot, the second for ``kg2_ent_ids`` and the right plot.
        random_state: Seed passed to PCA so that repeated runs give the same
            projection.

    Returns:
        None. The grid of both plots is displayed via ``show``.
    """
    ent_links = load_ent_links(ent_link_path)
    ents = load_kg_ents(os.path.join(embed_base_path, "kg1_ent_ids"), ent_links, source_names[0])
    ents.extend(load_kg_ents(os.path.join(embed_base_path, "kg2_ent_ids"), ent_links, source_names[1]))
    emb = np.load(os.path.join(embed_base_path, "ent_embeds.npy"))

    # Index the entity table by its numeric id and order it by that id.
    df = pd.DataFrame(ents, columns=["name", "id", "source", "pair"])
    df.index = pd.to_numeric(df["id"])
    df = df.drop("id", axis=1).sort_index()

    # PCA's default solver selection may pick a randomized algorithm, hence
    # the explicit seed.
    emb2d = PCA(n_components=2, random_state=random_state).fit_transform(emb)
    pca = pd.DataFrame(emb2d, columns=["pca1", "pca2"])
    pca.index.name = "id"
    # Joined on the index: row i of the embedding matrix is treated as the
    # embedding of the entity whose id is i.
    df = df.join(pca)

    # Self-merge so every row holds an entity (_x) next to its linked pair
    # (_y); keep only rows whose left side comes from the first source so
    # each pair appears once.
    merged = df.merge(df, left_on="name", right_on="pair")
    merged = merged[merged["source_x"] == source_names[0]]

    # now the fun part with bokeh
    ds = ColumnDataSource(merged)
    hover_left = HoverTool(
        tooltips=[
            ("name", "@name_x"),
            ("pair", "@name_y"),
        ]
    )
    hover_right = HoverTool(
        tooltips=[
            ("name", "@name_y"),
            ("pair", "@name_x"),
        ]
    )
    # if selected only color selected
    selection_glyph_left = Circle(fill_color='blue', line_color=None)
    selection_glyph_right = Circle(fill_color='orange', line_color=None)
    nonselection_glyph = Circle(fill_color='gray', fill_alpha=0.1, line_color=None)

    # create a new plot and add a renderer with appropriate glyph handling
    left = figure(tools=["tap", "box_select", hover_left], width=400, height=400, title=None)
    r_left = left.circle('pca1_x', 'pca2_x', source=ds, fill_color="blue", fill_alpha=.2, line_color=None, legend_label=source_names[0])
    r_left.selection_glyph = selection_glyph_left
    r_left.nonselection_glyph = nonselection_glyph

    # create another new plot and add a renderer with appropriate glyph handling
    right = figure(tools=["tap", "box_select", hover_right], width=400, height=400, title=None)
    r_right = right.circle('pca1_y', 'pca2_y', source=ds, color="orange", fill_alpha=.2, line_color=None, legend_label=source_names[1])
    r_right.selection_glyph = selection_glyph_right
    r_right.nonselection_glyph = nonselection_glyph

    # callback to highlight in both plots: reduce the shared selection to at
    # most its first index. slice(0, 1) yields an empty selection when
    # nothing is selected, instead of an array holding `undefined`.
    callback = CustomJS(
        args=dict(source=ds, plots=[left, right]),
        code="source.selected.indices = source.selected.indices.slice(0, 1);",
    )
    left.js_on_event('tap', callback)
    right.js_on_event('tap', callback)
    # NOTE(review): 'lasso' does not appear to be one of Bokeh's event names
    # and no lasso tool is configured on either figure, so these two
    # registrations presumably never fire - confirm and remove or replace.
    left.js_on_event('lasso', callback)
    right.js_on_event('lasso', callback)

    p = gridplot([[left, right]])

    show(p)

In [39]:
# Root of the data directory, relative to the notebook's working directory;
# the following plotting cells reuse it.
data_path = "../../data/"

# MultiKE embeddings for D_Y_15K_V1: the run directory is whichever match
# glob returns first for fold 1.
plot_embeddings(
    embed_base_path=glob.glob(f"{data_path}Embeddings15K/MultiKE/D_Y_15K_V1/721_5fold/1/*/")[0],
    ent_link_path=data_path + "OpenEA/D_Y_15K_V1/ent_links",
)

In [35]:
# MultiKE embeddings for imdb-tmdb (requires data_path from the cell above).
plot_embeddings(
    embed_base_path=glob.glob(f"{data_path}Embeddings15K/MultiKE/imdb-tmdb/721_5fold/1/*/")[0],
    ent_link_path=data_path + "EA-ScaDS-Datasets/ScadsMB/imdb-tmdb/ent_links",
    source_names=["imdb", "tmdb"],
)

In [33]:
# MultiKE embeddings for dblp-scholar (requires data_path from an earlier cell).
plot_embeddings(
    embed_base_path=glob.glob(f"{data_path}Embeddings15K/MultiKE/dblp-scholar/721_5fold/1/*/")[0],
    ent_link_path=data_path + "EA-ScaDS-Datasets/dblp-scholar/ent_links",
    source_names=["dblp", "scholar"],
)

	17



In [33]:
# Embeddings read from a results directory, plotted against the dblp-scholar
# links (requires data_path from an earlier cell).
# NOTE(review): unlike the other cells, the embedding path is not derived from
# data_path; it is resolved relative to the current working directory.
plot_embeddings(
    embed_base_path="Downloads/git/unsupervised-entity-alignment/results/",
    ent_link_path=data_path + "EA-ScaDS-Datasets/dblp-scholar/ent_links",
    source_names=["dblp", "scholar"],
)

	17

