If you're running this notebook locally on your own machine, you can ignore the following cell. If you want to [run this notebook remotely on Google's Colab platform](https://colab.research.google.com/github/tsafavi/codex/blob/master/Explore%20CoDEx.ipynb), uncomment and run the lines in the following cell: 

In [None]:
# Colab-only setup: uncomment every line below to clone the repository,
# unpack the English text, and install the requirements and the package.
# !git clone https://github.com/tsafavi/codex
# %cd codex
# !./extract.sh en  # unzip all KG text in English
# !pip install -r requirements.txt
# !pip install -e .

If you're running on Colab, you will need to restart the runtime after running the previous cell.

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from codex.codex import Codex

In [None]:
# Values exposed by the Codex class; later cells iterate over both.
sizes, codes = Codex.SIZES, Codex.CODES

# Data exploration
Load our data and do basic exploratory analysis.

In [None]:
# Print entity/relation counts and split sizes for every CoDEx size.
for size in sizes:
    codex = Codex(size=size)

    train = codex.split("train")
    valid = codex.split("valid")
    test = codex.split("test")
    triples = codex.triples()

    print(codex.name())
    print(
        f"  {len(codex.entities())} entities / "
        f"{len(codex.relations())} relations"
    )
    print(
        f"  {len(train)} train / "
        f"{len(valid)} validation / "
        f"{len(test)} test"
    )
    print(f"  {len(triples)} total triples")

Inspect a single entity.

In [None]:
eid = "Q51"

# Show the label of the same entity once per entry in `codes`.
for code in codes:
    codex = Codex(code=code)
    label = codex.entity_label(eid)
    print(label)

In [None]:
codex = Codex(code="en")

# Print the entity's Wikipedia URL followed by its quoted extract.
wiki_url = codex.entity_wikipedia_url(eid)
print(f"From {wiki_url}:")
extract = codex.entity_extract(eid)
print(f"  '{extract}'")

Explore entity types.

In [None]:
codex = Codex(code="en")
types = codex.entity_types(eid)

# One line per type attached to the entity.
for etype in types:
    entity_name = codex.entity_label(eid)
    type_name = codex.entity_type_label(etype)
    print(f"{entity_name} is of type {type_name}")

In [None]:
type_id = "Q5"

# Uses the `codex` instance created in the previous cell.
type_url = codex.entity_type_wikipedia_url(type_id)
print(f"From {type_url}:")
type_extract = codex.entity_type_extract(type_id)
print(f"  '{type_extract}'")

Inspect a single relation.

In [None]:
rid = "P530"

# Print "<label>: <description>" for the relation.
relation_name = codex.relation_label(rid)
relation_desc = codex.relation_description(rid)
print(relation_name + ":", relation_desc)

Explore some of the negative triples.

In [None]:
# Report how many validation/test negatives each of these sizes has and
# print five randomly drawn test negatives with human-readable labels.
for size in ("s", "m"):
    codex = Codex(size=size)
    valid_neg = codex.negative_split("valid")
    test_neg = codex.negative_split("test")

    print(
        f"{codex.name()} has "
        f"{len(valid_neg)} validation negatives and "
        f"{len(test_neg)} test negatives"
    )

    print("Samples:")
    sample = test_neg.sample(n=5)

    rows = list(map(tuple, sample.values))
    for h, r, t in rows:
        head_label = codex.entity_label(h)
        relation_label = codex.relation_label(r)
        tail_label = codex.entity_label(t)
        print(f"  ({head_label}, {relation_label}, {tail_label})")

Make sure none of the negative triples are actually positives.

In [None]:
# Pool the triples of every CoDEx size into a single lookup set.
positives = pd.concat([Codex(size=size).triples() for size in sizes])
all_triples = set(map(tuple, positives.values))

for size in ("s", "m"):
    codex = Codex(size=size)
    valid_neg = codex.negative_split("valid")
    test_neg = codex.negative_split("test")

    # No negative triple may appear among the pooled triples.
    for neg_df in (valid_neg, test_neg):
        neg_triples = set(map(tuple, neg_df.values))
        assert neg_triples.isdisjoint(all_triples)

print("Passed all checks successfully")

# Plotting
Plot various properties like degree distribution and top-k entities/relations.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import logging
# Disable the matplotlib font-manager logger.
logging.getLogger("matplotlib.font_manager").disabled = True

# Styles from https://scentellegher.github.io/visualization/2018/10/10/beautiful-bar-plots-matplotlib.html

dark_grey = "#333F4B"
keys = [
    "axes.edgecolor",
    "axes.labelcolor",
    "xtick.color",
    "ytick.color",
]

# Fonts, frame and text colour.
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": "Helvetica",
    "axes.linewidth": 0.8,
    "legend.frameon": False,
    "text.color": dark_grey,
})
# Axis edges, labels and ticks share the same dark grey.
plt.rcParams.update({key: dark_grey for key in keys})

palette = sns.color_palette("RdBu", 10)
# sns.palplot(palette)

Plot the degree distribution of CoDEx datasets.

In [None]:
def count_entities(triples):
    """Count how often each entity occurs as a head or tail in `triples`.

    Returns a DataFrame with columns "entity" and "count", ordered as
    produced by `Series.value_counts` (most frequent first).
    """
    mentions = pd.concat((triples["head"], triples["tail"]))
    degree = mentions.value_counts()
    return degree.rename_axis("entity").reset_index(name="count")

def count_relations(triples):
    """Count how many rows of `triples` mention each relation.

    Returns a DataFrame with columns "relation" and "count", ordered as
    produced by `Series.value_counts` (most frequent first).
    """
    mentions = triples["relation"].value_counts()
    return mentions.rename_axis("relation").reset_index(name="count")

In [None]:
def plot_degree_distribution(
        triples, step=5, ax=None, 
        figsize=(5,3), **kwargs):
    """Scatter-plot the cumulative degree distribution of the entities
    in a set of triples.

    For every threshold ``d`` in ``0, step, 2 * step, ...`` below the
    maximum degree, the plot shows how many entities have a degree strictly
    greater than ``d``. Both axes use a symlog scale. The average degree is
    printed as a side effect.

    Args:
        triples: table of triples, passed unchanged to `count_entities`.
        step: spacing between consecutive degree thresholds.
        ax: axes to draw on; a new figure of size `figsize` is created
            when this is None.
        figsize: size of the figure created when `ax` is None.
        **kwargs: forwarded to `ax.scatter` (e.g. `color`, `label`).

    Returns:
        The axes that were drawn on.
    """
    counts = count_entities(triples)["count"]
    print(f"Average degree: {counts.mean():.2f}")

    # An empty table has no maximum degree; plot nothing instead of failing.
    max_degree = counts.max() if len(counts) else 0
    x = np.arange(0, max_degree, step)

    # Number of entities with degree > threshold, for all thresholds at once:
    # in the sorted degrees, everything right of the insertion point is larger.
    sorted_counts = np.sort(counts.to_numpy())
    n_greater = len(sorted_counts) - np.searchsorted(
        sorted_counts, x, side="right")
    y = n_greater.astype(np.float32)

    # Compare against None so the decision never relies on the truthiness
    # of the object passed as `ax`.
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    
    ax.scatter(x, y, alpha=0.5, **kwargs)
    ax.set_xscale("symlog")
    ax.set_yscale("symlog")
    
    ax.set_xlabel("Entity degree", fontsize=14)
    ax.set_ylabel("Cumulative count", fontsize=12)
    
    ax.tick_params(
        which="both",
        labelsize=12
    )
    
    ax.spines["top"].set_color("none")
    ax.spines["right"].set_color("none")

    # Spine.set_smart_bounds only exists in older Matplotlib releases, so it
    # is applied where available and skipped elsewhere rather than crashing.
    for side in ("left", "bottom"):
        spine = ax.spines[side]
        if hasattr(spine, "set_smart_bounds"):
            spine.set_smart_bounds(True)

    return ax

In [None]:
# Draw one cumulative degree plot per CoDEx size.
for size in sizes:
    codex = Codex(size=size)
    triples = codex.triples()

    ax = plot_degree_distribution(
        triples, 
        color=palette[-1],
        step=5,
        figsize=(4, 2),
        label=codex.name(),
    )

    legend = ax.legend(
        frameon=False, 
        fontsize=14,
        handletextpad=0.001,
        labelspacing=0.2,
        loc=3
    )

    for text in legend.get_texts():
        text.set_color(dark_grey)

    # Newer Matplotlib exposes the handles as `legend_handles`; the older
    # `legendHandles` spelling is only used as a fallback for old releases.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for lh in handles:
        lh.set_alpha(0.7)

    # Start the x-axis at 3 while keeping the automatic upper limit.
    xlim = ax.get_xlim()
    ax.set_xlim(3, max(xlim))

    plt.show()

Plot the top-k entities and relations in a CoDEx dataset.

In [None]:
def plot_top_k(count_df, k=30, 
               label_col="label",
               count_col="count",
               figsize=(5,8.5),
               color="#007acc",
               ax=None,
               **kwargs):
    """Draw a horizontal lollipop chart of the `k` largest counts.

    Rows are ranked by `count_col` in descending order; the largest count
    is drawn at the top of the chart. If `count_df` has fewer than `k`
    rows, all of them are plotted.

    Args:
        count_df: DataFrame holding at least `label_col` and `count_col`.
        k: maximum number of rows to show.
        label_col: column providing the y tick labels.
        count_col: column providing the plotted counts.
        figsize: size of the figure created when `ax` is None.
        color: colour of the lines and markers.
        ax: axes to draw on; a new figure is created when this is None.
        **kwargs: forwarded to `ax.hlines`.

    Returns:
        The axes that were drawn on.
    """
    top = count_df.sort_values(count_col, ascending=False).head(k)

    # Use the number of rows actually available: sizing the y positions by
    # `k` breaks the plot when the table has fewer than `k` rows.
    n_rows = len(top)

    # Reverse so that the largest count ends up at the top of the y-axis.
    labels = top[label_col].to_numpy()[::-1]
    counts = top[count_col].to_numpy()[::-1]
    
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)

    positions = np.arange(n_rows)
        
    ax.hlines(
        y=positions,
        xmin=0, 
        xmax=counts, 
        color=color, 
        alpha=0.2, 
        linewidth=5,
        **kwargs
    )
    
    ax.plot(
        counts, positions, "o", 
        markersize=7, 
        color=color, 
        alpha=0.6)
    
    ax.spines["top"].set_color("none")
    ax.spines["right"].set_color("none")

    # Restrict the visible spines to the plotted range.
    ax.spines["left"].set_bounds(0, max(n_rows - 1, 0))
    ax.spines["bottom"].set_bounds(0, max(ax.get_xticks()))
    
    ax.set_xlabel("Count", fontsize=14)

    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    
    return ax

In [None]:
size = "l"
codex = Codex(size=size)
triples = codex.triples()

# Entity degrees, with a readable label attached to every entity ID.
count_df = count_entities(triples)
count_df["label"] = list(map(codex.entity_label, count_df["entity"]))

k = 30
ax = plot_top_k(count_df, k=k, color=palette[-1], linewidths=6)

ax.set_xscale("linear")
ax.set_xlabel("Entity degree", fontsize=14)
ax.set_title(codex.name(), fontsize=16)
ax.tick_params("x", labelsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Relation mention counts, with a readable label attached to every relation ID.
# Reuses `codex` and `triples` from the previous cell.
count_df = count_relations(triples)
count_df["label"] = list(map(codex.relation_label, count_df["relation"]))

k = 15
ax = plot_top_k(
    count_df, k=k, color=palette[-1], linewidths=6, figsize=(5, 4))

ax.set_xscale("linear")
ax.set_xlabel("Mention count", fontsize=14)
ax.set_title(codex.name(), fontsize=16)
ax.tick_params("x", labelsize=12)

plt.tight_layout()
plt.show()

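Investigate symmetry. For each relation, we measure the fraction of its (head, tail) entity pairs whose reversed (tail, head) pair also appears with that relation, and list the relations for which this fraction exceeds a threshold.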

In [None]:
def entity_pairs(triples, reverse=False):
    """Return the set of distinct (head, tail) pairs found in `triples`.

    With `reverse=True` each pair is emitted as (tail, head) instead.
    """
    first, second = ("tail", "head") if reverse else ("head", "tail")
    return set(zip(triples[first], triples[second]))

In [None]:
threshold = 0.5

for size in sizes:
    codex = Codex(size=size)
    print("----------" + codex.name() + "----------")
    relations = codex.relations()
    triples = codex.triples()
    tot = 0

    for relation in relations:
        triples_r = triples[triples["relation"] == relation]

        # Share of this relation's (head, tail) pairs whose mirrored pair
        # also occurs for the same relation.
        forward = entity_pairs(triples_r)
        reverse = entity_pairs(triples_r, reverse=True)
        symmetry = len(forward & reverse) / len(forward)
        percent = len(triples_r) / len(triples)

        if symmetry <= threshold:
            continue

        print(
            f"  {codex.relation_label(relation)} ({relation}) "
            f"| {symmetry * 100:.2f}% symmetry "
            f"| {percent * 100:.3f}% of {codex.name()}"
        )
        # Accumulate the share of the dataset covered by symmetric relations.
        tot += percent * 100

    print(f"Symmetric relations > {threshold:.1f} make up {tot:.2f}% of {codex.name()}")

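Investigate compositionality. For each CoDEx size, we load the paths stored in `analysis/paths/codex-{size}/paths.tsv`, report how many there are and their average PCA confidence, and print one randomly sampled example each of length 2 and length 3.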

In [None]:
import os

In [None]:
def rule_relations(rule):
    """Return, in order, the space-separated tokens of `rule` that start with "P"."""
    found = []
    for token in rule.split(" "):
        if token.startswith("P"):
            found.append(token)
    return found

In [None]:
for size in sizes:
    codex = Codex(size=size)
    name = codex.name()
    print("----------" + name + "----------")

    paths_file = os.path.join("analysis/paths", f"codex-{size}", "paths.tsv")
    rule_df = pd.read_csv(paths_file, sep="\t")

    # Number of paths listed for this dataset.
    print(f"{len(rule_df)} paths in {name}")

    # Mean PCA confidence over all listed paths.
    avg_conf = rule_df["PCA Confidence"].mean()
    print(f"Average confidence: {avg_conf:.3f}")

    # Positive examples weighted by path length, relative to the triple count.
    weighted_positives = (rule_df["Positive Examples"] * rule_df["Length"]).sum()
    frac_of_total = weighted_positives / len(codex.triples()) * 100
    print(f"n pos / n triples: {frac_of_total:.2f}%")

    # Show one randomly drawn rule of each length.
    print("Examples:")
    for length in (2, 3):
        df = rule_df[rule_df["Length"] == length]
        sample = df.sample(n=1)
        rule = sample["Rule"].iloc[0]
        conf = sample["PCA Confidence"].iloc[0]
        relations = rule_relations(rule)

        # Every relation except the last goes left of the arrow.
        body = "/".join(
            codex.relation_label(relation) for relation in relations[:-1])
        head = codex.relation_label(relations[-1])
        print(f"  {body} -> {head} (confidence {conf:.3f})")