In [1]:
import os
import sys
# Put the parent directory on the import path (presumably where the local
# `quality` package lives -- confirm against the repository layout).
# Guarded so that re-running the cell does not add duplicate entries.
module_path = os.path.abspath('..')
already_registered = any(entry == module_path for entry in sys.path)
if not already_registered:
    sys.path.append(module_path)

In [None]:
import pandas as pd
import numpy as np
import scipy.stats  as stats
import seaborn as sns
import glob
import matplotlib.pyplot as plt
from quality.analyze_types import create_typed_predictions, get_entity_node_degrees, create_combined_df, create_combined_over_embeddings, _get_files, average_node_degree

In [None]:
# Embedding approaches to compare; show_table iterates over this list.
embedding_approaches = ["BootEA","MultiKE","RDGCN"]
# Vector-type identifier that show_table hands to _get_files.
vector_type = "SimAndEmb"

In [None]:
def set_errors(df, pred, val):
    """Return 1 if a single row has the requested prediction/label pair, else 0.

    Despite its name, ``df`` is one row-like object (anything supporting
    ``df["pred"]`` and ``df["val"]`` with scalar results), which makes the
    function suitable for row-wise ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    df : mapping or pandas.Series
        Row providing the scalar entries ``"pred"`` and ``"val"``.
    pred : scalar
        Value that ``df["pred"]`` has to equal.
    val : scalar
        Value that ``df["val"]`` has to equal.

    Returns
    -------
    int
        1 when both entries match, otherwise 0.
    """
    pred_matches = df["pred"] == pred
    val_matches = df["val"] == val
    return 1 if (pred_matches & val_matches) else 0

In [None]:
def calc_measures(data):
    """Compute precision, recall and F-measure per entity type.

    Every row is classified as false negative, false positive, true positive
    or true negative by comparing ``pred`` and ``val`` against 0/1. The
    counts are then summed per type, once grouped by ``left_types`` and once
    by ``right_types``, and the two results are averaged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``pred``, ``val``, ``left_types`` and
        ``right_types``. The frame is copied and never modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by type (index name ``"left_types"``) with the columns
        ``prec``, ``rec`` and ``fm``. A ratio whose denominator is zero
        is NaN.
    """
    df = data.copy()

    # Vectorised comparisons instead of four row-wise `apply` passes.
    pred_pos = df["pred"] == 1
    pred_neg = df["pred"] == 0
    val_pos = df["val"] == 1
    val_neg = df["val"] == 0
    df["fn"] = (pred_neg & val_pos).astype(int)
    df["fp"] = (pred_pos & val_neg).astype(int)
    df["tp"] = (pred_pos & val_pos).astype(int)
    df["tn"] = (pred_neg & val_neg).astype(int)

    # Aggregate only the count columns: summing every column would also
    # concatenate (or fail on) any string columns present in `data`.
    count_cols = ["fn", "fp", "tp", "tn"]
    by_left = df.groupby("left_types")[count_cols].sum()
    by_right = df.groupby("right_types")[count_cols].sum()

    # fill_value=0 keeps types that occur on only one side; a plain `+`
    # would turn all of their counts into NaN.
    summed = by_left.add(by_right, fill_value=0) / 2
    # Keep a stable index name even when the two sides list different types.
    summed = summed.rename_axis("left_types")

    summed["prec"] = summed["tp"] / (summed["tp"] + summed["fp"])
    summed["rec"] = summed["tp"] / (summed["tp"] + summed["fn"])
    summed["fm"] = 2 * (summed["prec"] * summed["rec"] / (summed["prec"] + summed["rec"]))
    return summed[["prec", "rec", "fm"]]

In [None]:
def show_table(dataset_name, color_axis=0, cache=True, scadsmb=False,
               data_dir="/home/dobraczka/Downloads/git/er-embedding-benchmark/data/"):
    """Build a styled per-type precision/recall/F-measure table for a dataset.

    For every entry of the global ``embedding_approaches`` the typed
    predictions are loaded, scored with ``calc_measures`` and joined with
    the type occurrence counts taken from the first approach. The per-approach
    results are concatenated side by side, rounded to three decimals and
    sorted by the occurrence column of the first approach.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier; used in the input paths and the cache file name.
    color_axis : int, default 0
        ``axis`` argument passed to every ``background_gradient`` call.
    cache : bool, default True
        If true and the pickle for this dataset/``color_axis`` exists in
        ``/tmp``, load it instead of recomputing.
    scadsmb : bool, default False
        Use the ScadsMB type-file locations instead of the OpenEA ones.
    data_dir : str
        Root directory of the benchmark data. The default is the location
        that was previously hard-coded.

    Returns
    -------
    pandas.io.formats.style.Styler
        The table with NaN replaced by -0.0001 and colour gradients on the
        ``fm``, ``prec`` and ``rec`` column groups.
    """
    # NOTE(review): the cache key covers neither `vector_type` nor
    # `embedding_approaches`; pass cache=False after changing either.
    pickle_path = f"/tmp/{dataset_name}_{color_axis}.pkl"
    if cache and os.path.exists(pickle_path):
        print("Load cached")
        # NOTE(review): unpickling runs arbitrary code and /tmp is typically
        # world-writable; only rely on this cache on a trusted machine.
        final = pd.read_pickle(pickle_path)
    else:
        if scadsmb:
            # NOTE(review): here `type_files` is a single path string while
            # the OpenEA branch builds a list, and the two ScadsMB paths use
            # different roots ("EA-ScaDS-Datasets/ScadsMB" vs "ScadsMB").
            # Kept as in the original -- confirm against
            # create_typed_predictions and the data layout.
            type_files = os.path.join(
                data_dir, "EA-ScaDS-Datasets/ScadsMB/typed_links/datasets", dataset_name
            )
            type_dataset = os.path.join(data_dir, "ScadsMB/typed_links/superclasses.json")
        else:
            type_files = sorted(
                glob.iglob(
                    os.path.join(
                        data_dir,
                        "OpenEA/typed_links/datasets",
                        dataset_name,
                        "721_5fold/*/typed_test",
                    )
                )
            )
            type_dataset = os.path.join(data_dir, "OpenEA/typed_links/superclasses.json")

        occ = None
        measured = []
        for approach in embedding_approaches:
            kg1_ent_id_files, kg2_ent_id_files, pred_files = _get_files(
                approach, dataset_name, data_dir, vector_type
            )
            typed = create_typed_predictions(
                kg1_ent_id_files,
                kg2_ent_id_files,
                pred_files,
                type_files,
                type_dataset,
            )
            if occ is None:
                # Occurrences come from the first approach only. fill_value=0
                # keeps types that show up on just one side.
                occ = (
                    typed["left_types"]
                    .value_counts()
                    .add(typed["right_types"].value_counts(), fill_value=0)
                    .to_frame("occurence")
                )
            # Score inside the loop: the original collected nothing in its
            # intermediate list, so the later concat always had no input.
            measured.append(calc_measures(typed).join(occ))

        mult = pd.concat(measured, axis=1, keys=list(embedding_approaches))
        # Reorder the column levels from (approach, measure) to (measure, approach).
        final = mult.sort_index(axis=1, level=1).swaplevel(axis=1)
        # Shorten type URIs to the fragment after the last "/" or "#".
        final.index = pd.Index(
            [x.split("/")[-1].split("#")[-1] for x in final.index.astype(str)],
            name="Type",
        )
        final = final.round(3)
        final = final.sort_values(by=("occurence", embedding_approaches[0]))
        pd.to_pickle(final, pickle_path)

    # -0.0001 marks missing values so that they land at the pale end of the gradient.
    styled = final.fillna(-0.0001).style
    styled = styled.background_gradient(cmap='Greens', axis=color_axis, subset=["fm"])
    styled = styled.background_gradient(cmap='Blues', axis=color_axis, subset=["prec"])
    styled = styled.background_gradient(cmap='Purples', axis=color_axis, subset=["rec"])
    return styled

# DBpedia-Wikidata 15K V1

In [None]:
show_table("D_W_15K_V1")

# DBpedia-Wikidata 15K V2

In [None]:
show_table("D_W_15K_V2")

# DBpedia-Yago 15K V1

In [None]:
show_table("D_Y_15K_V1")

# DBpedia-Yago 15K V2

In [None]:
show_table("D_Y_15K_V2")

# DBpediaEN-DBpediaDE 15K V1

In [None]:
show_table("EN_DE_15K_V1") 

# DBpediaEN-DBpediaDE 15K V2

In [None]:
show_table("EN_DE_15K_V2") 

# DBpediaEN-DBpediaFR 15K V1

In [None]:
show_table("EN_FR_15K_V1") 

# DBpediaEN-DBpediaFR 15K V2

In [None]:
show_table("EN_FR_15K_V2")

# DBpedia-Wikidata 100K V1

In [None]:
show_table("D_W_100K_V1")

# DBpedia-Wikidata 100K V2

In [None]:
show_table("D_W_100K_V2")

# DBpedia-Yago 100K V1

In [None]:
show_table("D_Y_100K_V1")

# DBpedia-Yago 100K V2

In [None]:
show_table("D_Y_100K_V2")

# DBpediaEN-DBpediaDE 100K V1

In [None]:
show_table("EN_DE_100K_V1") 

# DBpediaEN-DBpediaDE 100K V2

In [None]:
show_table("EN_DE_100K_V2") 

# DBpediaEN-DBpediaFR 100K V1

In [None]:
show_table("EN_FR_100K_V1") 

# DBpediaEN-DBpediaFR 100K V2

In [4]:
show_table("EN_FR_100K_V2")

NameError: name 'show_table' is not defined

# IMDB-TMDB

In [None]:
show_table("imdb-tmdb", scadsmb=True)

# IMDB-TVDB

In [None]:
show_table("imdb-tvdb", scadsmb=True)

# TMDB-TVDB

In [None]:
show_table("tmdb-tvdb", scadsmb=True)