In [1]:
import sys
from pathlib import Path

if str(Path.cwd().parent) not in sys.path:
    sys.path.insert(0, str(Path.cwd().parent))

import json
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import numpy as np

from models.entities import EntityType
from nlp.candidates.filter import PairFilter, TypeConfig
from nlp.candidates.npmi import NPMI
from nlp.candidates.collocation import Collocation
from nlp.relations.explicit import Explicit
from nlp.relations.implicit import ImplicitCluster
from nlp.relations.embeddings import EntityEmbeddings
from nlp.entity_filter import filter_pipeline, FilterConfig
from nlp.syntactic import nlp
from config.llm import GeminiConfig


In [2]:
# --- Run configuration -------------------------------------------------
# Identifier of the paper being processed; it is used as the filename
# prefix for every artefact read from / written to `output_dir`.
arxiv_id = "1810.04805"

# Artefact directory, relative to the notebook's working directory.
# Created on first run; a pre-existing directory is not an error.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# Configuration object handed to the explicit relation extractor later on.
gemini_config = GeminiConfig()

In [3]:
# Load the extracted paper text from the artefact directory.
full_text_path = output_dir / f"{arxiv_id}_full_text.txt"
# Decode explicitly instead of relying on the platform's default encoding,
# which differs between operating systems.
# NOTE(review): assumes the extraction step wrote the file as UTF-8 — confirm.
full_text = full_text_path.read_text(encoding="utf-8")

# The text is split on blank lines, so each item is a blank-line-delimited
# block of text (not necessarily a single grammatical sentence).
# Whitespace-only blocks are dropped.
sentences = [block.strip() for block in full_text.split("\n\n") if block.strip()]

print(f"Total sentences: {len(sentences)}")
# Guard the empty case: np.mean([]) yields nan and emits a RuntimeWarning.
avg_sentence_len = np.mean([len(s) for s in sentences]) if sentences else 0.0
print(f"Average sentence length: {avg_sentence_len:.0f} chars")


Total sentences: 26
Average sentence length: 1446 chars


In [4]:
# Load every extracted entity for this paper. JSON interchange is UTF-8,
# so decode explicitly rather than using the platform default encoding
# (which would mis-decode non-ASCII entity text on some systems).
with open(output_dir / f"{arxiv_id}_all_entities.json", encoding="utf-8") as f:
    all_entities = json.load(f)

print(f"Total entities: {len(all_entities)}")

# Tally how many entities carry each value of their 'type' field.
type_counts = {}
for e in all_entities:
    t = e['type']
    type_counts[t] = type_counts.get(t, 0) + 1

# Report in alphabetical order of type name so runs are easy to compare.
print("Entity type distribution:")
for t in sorted(type_counts.keys()):
    print(f"  {t}: {type_counts[t]}")

Total entities: 459
Entity type distribution:
  dataset: 88
  method: 76
  metric: 15
  other: 200
  task: 80


In [5]:
# Run the NLP pipeline over the whole paper once; the resulting document is
# passed to the entity filter together with the raw entity list.
doc = nlp(full_text)

# Filter with the default FilterConfig settings.
filter_config = FilterConfig()
filtered_entities = filter_pipeline(all_entities, doc, filter_config)
print(f"Filtered entities: {len(filtered_entities)}")

# Per-type tally of the entities that survived filtering.
type_counts_filtered = {}
for entity in filtered_entities:
    label = entity['type']
    type_counts_filtered[label] = type_counts_filtered.get(label, 0) + 1

print("Filtered entity type distribution:")
for label, count in sorted(type_counts_filtered.items()):
    print(f"  {label}: {count}")


Filtered entities: 79
Filtered entity type distribution:
  dataset: 50
  method: 11
  metric: 2
  task: 16


In [6]:
# Generate candidate entity pairs using the default type configuration.
type_config = TypeConfig()
pair_filter = PairFilter(type_config)
candidates = pair_filter.generate(filtered_entities)

print(f"Type-filtered candidates: {len(candidates)}")

# Each candidate is unpacked as a 2-tuple and stored as an {"e1", "e2"} record.
candidates_output = []
for first, second in candidates:
    candidates_output.append({"e1": first, "e2": second})

# Persist the typed candidates next to the other artefacts for this paper.
candidates_path = output_dir / f"{arxiv_id}_candidates_typed.json"
with candidates_path.open("w") as f:
    json.dump(candidates_output, f, indent=2)


Type-filtered candidates: 1603


In [7]:
# Score entity pairs by NPMI over the blank-line-delimited text blocks.
npmi_calc = NPMI(tau=0.0, min_cooc=1)
npmi_candidates = npmi_calc.select(filtered_entities, sentences)

# Look-up from entity text to its record; if two records share the same
# text, the later one replaces the earlier one.
entity_dict = {}
for entity in filtered_entities:
    entity_dict[entity['text']] = entity

# Narrow the NPMI candidates with the collocation filter.
colloc_filter = Collocation(min_cooc=1, max_sent_dist=5)
colloc_candidates = colloc_filter.filter(npmi_candidates, entity_dict)

print(f"NPMI candidates: {len(npmi_candidates)}")
print(f"After collocation filter: {len(colloc_candidates)}")

# min()/max() fail on an empty sequence, so only report a range when
# something survived the filter.
if colloc_candidates:
    npmi_vals = [cand.npmi for cand in colloc_candidates]
    lowest, highest = min(npmi_vals), max(npmi_vals)
    print(f"NPMI range: {lowest:.3f} - {highest:.3f}")


NPMI candidates: 590
After collocation filter: 129
NPMI range: 0.019 - 1.000


In [8]:
# Order candidates from highest to lowest NPMI (in place).
colloc_candidates.sort(key=lambda cand: -cand.npmi)

# Split into three confidence tiers by NPMI score:
#   high: >= 0.40, medium: [0.20, 0.40), low: < 0.20
high_conf, medium_conf, low_conf = [], [], []
for cand in colloc_candidates:
    if cand.npmi >= 0.40:
        high_conf.append(cand)
    elif 0.20 <= cand.npmi < 0.40:
        medium_conf.append(cand)
    elif cand.npmi < 0.20:
        low_conf.append(cand)

print(f"High confidence (≥0.40): {len(high_conf)}")
print(f"Medium confidence (0.20-0.40): {len(medium_conf)}")
print(f"Low confidence (<0.20): {len(low_conf)}")

# Keep every high-confidence pair plus the ten best medium-confidence ones.
selected = high_conf + medium_conf[:10]

top_candidates = [(cand.e1, cand.e2) for cand in selected]
print(f"Total top candidates: {len(top_candidates)}")

# Same selection, with the score and co-occurrence count attached for export.
top_ranked = []
for cand in selected:
    top_ranked.append(
        {"e1": cand.e1, "e2": cand.e2, "npmi": cand.npmi, "cooc": cand.n_cooc}
    )

ranked_path = output_dir / f"{arxiv_id}_ranked_candidates.json"
with ranked_path.open("w") as f:
    json.dump(top_ranked, f, indent=2)


High confidence (≥0.40): 50
Medium confidence (0.20-0.40): 37
Low confidence (<0.20): 42
Total top candidates: 60


In [9]:
# Re-derive the blank-line-delimited text blocks from the full text,
# discarding blocks that are empty once stripped.
sentence_strings = []
for block in full_text.split("\n\n"):
    stripped = block.strip()
    if stripped:
        sentence_strings.append(stripped)

# Convert the (e1, e2) tuples into the dict records passed to the extractor.
candidates_dicts = [dict(e1=left, e2=right) for left, right in top_candidates]

print(f"Sentences prepared: {len(sentence_strings)}")
print(f"Candidates prepared: {len(candidates_dicts)}")


Sentences prepared: 26
Candidates prepared: 60


In [10]:
# Discover explicit relations for the selected candidate pairs, using the
# extractor configured with `gemini_config`.
explicit_extractor = Explicit(gemini_config)
explicit_relations = explicit_extractor.discover(candidates_dicts, sentence_strings)

print(f"Explicit relations discovered: {len(explicit_relations)}")

# Preview the first five relations in the order the extractor returned them.
preview = explicit_relations[:5]
for relation in preview:
    print(f"  {relation.e1} --[{relation.rel_type}]--> {relation.e2}")
    print(f"    Confidence: {relation.confidence:.2f}")


Explicit relations discovered: 22
  ELMo --[evaluates]--> Open AI GPT
    Confidence: 0.40
  classification --[uses]--> sentence pairs
    Confidence: 0.30
  NER --[evaluates]--> named entity recognition
    Confidence: 0.30
  ELMo --[proposes]--> named entity recognition
    Confidence: 0.20
  NER --[uses]--> paraphrasing
    Confidence: 0.20


In [11]:
# Embed the surface text of every filtered entity. The texts are collected
# in `filtered_entities` order and passed to the encoder in that order.
embeddings_encoder = EntityEmbeddings()

entity_texts = []
for entity in filtered_entities:
    entity_texts.append(entity['text'])

embeddings = embeddings_encoder.compute(entity_texts)

print(f"Embeddings generated: {embeddings.shape}")
print(f"Embedding dimension: {embeddings_encoder.dim()}")


Embeddings generated: (79, 384)
Embedding dimension: 384


In [12]:
# NOTE(review): this set is passed to `infer` empty even though
# `explicit_relations` has already been computed at this point. If `infer`
# is meant to skip pairs that are already explicitly related, it presumably
# should be populated from `explicit_relations` — confirm the expected
# element format against ImplicitCluster.infer before changing.
explicit_pairs = set()

implicit_cluster = ImplicitCluster(k=8, tau_sim=0.45, tau_b=1)
implicit_relations_raw = implicit_cluster.infer(filtered_entities, embeddings, explicit_pairs)

print(f"Implicit relations discovered: {len(implicit_relations_raw)}")

# Histogram of confidences, bucketed by rounding to two decimals.
confidence_dist = {}
for relation in implicit_relations_raw:
    bucket = round(relation.confidence, 2)
    confidence_dist[bucket] = confidence_dist.get(bucket, 0) + 1

# Show at most the ten highest confidence buckets, highest first.
print("Confidence distribution:")
top_buckets = sorted(confidence_dist, reverse=True)[:10]
for bucket in top_buckets:
    print(f"  {bucket}: {confidence_dist[bucket]}")


Implicit relations discovered: 215
Confidence distribution:
  0.5: 6
  0.38: 20
  0.25: 39
  0.12: 150


In [13]:
# Map each entity's text to its EntityType, built from the entity's 'type'
# value. When two entities share the same text, the later one wins.
entity_types_map = {
    entity['text']: EntityType(entity['type'])
    for entity in filtered_entities
}

print(f"Entity type mapping: {len(entity_types_map)} entries")


Entity type mapping: 79 entries


In [14]:
# Apply the pair filter's relation filtering to the raw implicit relations,
# using the text -> EntityType mapping.
implicit_typed = pair_filter.filter_relations(implicit_relations_raw, entity_types_map)

print(f"Implicit relations: {len(implicit_relations_raw)} → {len(implicit_typed)}")

# Keep only relations whose confidence reaches the 0.15 cut-off.
high_quality_implicit = []
for relation in implicit_typed:
    if relation.confidence >= 0.15:
        high_quality_implicit.append(relation)
print(f"High-quality implicit (≥0.15): {len(high_quality_implicit)}")

# Preview the first ten, showing the type of each endpoint.
for relation in high_quality_implicit[:10]:
    source_type = entity_types_map[relation.e_i].value
    target_type = entity_types_map[relation.e_j].value
    print(f"  {source_type:10s} → {target_type:10s}: {relation.e_i} → {relation.e_j} ({relation.confidence:.3f})")


Implicit relations: 215 → 99
High-quality implicit (≥0.15): 29
  task       → dataset   : machine translation → CoLA The Corpus of Linguistic Acceptability (0.500)
  task       → dataset   : machine translation → natural language inference dataset (0.500)
  task       → dataset   : classification → pretraining data (0.375)
  task       → dataset   : next sentence prediction → Books Corpus (0.375)
  task       → dataset   : next sentence prediction → large text corpus (0.375)
  task       → dataset   : named entity recognition → shuffled sentence-level corpus (0.375)
  task       → dataset   : named entity recognition → monolingual corpus (0.375)
  task       → dataset   : machine translation → Stanford Sentiment Treebank (0.375)
  task       → dataset   : text generation → document-level corpus (0.375)
  task       → dataset   : text generation → shuffled sentence-level corpus (0.375)


In [15]:
# Build a directed knowledge graph: one node per filtered entity, one edge
# per explicit or type-filtered implicit relation.
G = nx.DiGraph()

for e in filtered_entities:
    G.add_node(e['text'], type=e['type'])

for rel in explicit_relations:
    G.add_edge(rel.e1, rel.e2, relation=rel.rel_type,
               confidence=rel.confidence, edge_type='explicit')

# Built once up front; the membership test below runs for every implicit
# relation and previously rebuilt this set twice per iteration.
known_entity_texts = {e['text'] for e in filtered_entities}

for rel in implicit_typed:
    # Only connect endpoints that are filtered entities.
    if rel.e_i not in known_entity_texts or rel.e_j not in known_entity_texts:
        continue

    # Adding an edge that already exists updates its attributes in place, so
    # an implicit relation on the same (source, target) would relabel an
    # explicit edge as 'implicit' and replace its confidence. Explicit
    # relations take precedence.
    existing = G.get_edge_data(rel.e_i, rel.e_j)
    if existing is not None and existing.get('edge_type') == 'explicit':
        continue

    G.add_edge(rel.e_i, rel.e_j, confidence=rel.confidence,
               edge_type='implicit')

print(f"Knowledge graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Keep only the largest weakly connected component for visualisation.
# `default=set()` keeps this working when the graph has no nodes at all
# (max() over an empty sequence would otherwise raise ValueError).
weakly_connected = list(nx.weakly_connected_components(G))
largest_cc = max(weakly_connected, key=len, default=set())
G_connected = G.subgraph(largest_cc).copy()

print(f"Connected component: {G_connected.number_of_nodes()} nodes, {G_connected.number_of_edges()} edges")


Knowledge graph: 79 nodes, 121 edges
Connected component: 56 nodes, 121 edges


In [16]:
# Flatten the explicit relations into plain JSON-friendly records.
explicit_records = []
for relation in explicit_relations:
    explicit_records.append({
        "source": relation.e1,
        "target": relation.e2,
        "type": relation.rel_type,
        "confidence": relation.confidence,
        "support": relation.n_supporting,
    })

# Same for the type-filtered implicit relations; the bridge count is stored
# alongside the bridges themselves.
implicit_records = []
for relation in implicit_typed:
    implicit_records.append({
        "source": relation.e_i,
        "target": relation.e_j,
        "confidence": relation.confidence,
        "n_bridges": len(relation.bridges),
        "bridges": relation.bridges,
    })

relations_output = {"explicit": explicit_records, "implicit": implicit_records}

# Write both relation sets into a single artefact for this paper.
relations_path = output_dir / f"{arxiv_id}_relations_final.json"
with relations_path.open("w") as f:
    json.dump(relations_output, f, indent=2)

print(f"Relations saved to: {relations_path}")
print(f"Explicit: {len(explicit_relations)}, Implicit: {len(implicit_typed)}")


Relations saved to: output/1810.04805_relations_final.json
Explicit: 22, Implicit: 99


In [18]:
# Spring layout with a fixed seed so node positions are reproducible.
pos = nx.spring_layout(G_connected, k=2.5, iterations=50, seed=42)

# Partition the edges by the 'edge_type' attribute set when the graph was built.
explicit_edges = [
    (u, v) for u, v, d in G_connected.edges(data=True)
    if d.get('edge_type') == 'explicit'
]

implicit_edges = [
    (u, v) for u, v, d in G_connected.edges(data=True)
    if d.get('edge_type') == 'implicit'
]

# Fill colour per entity type, keyed by the enum's string value.
type_to_color = {
    EntityType.TASK.value: '#FF6B6B',
    EntityType.METHOD.value: '#4ECDC4',
    EntityType.DATASET.value: '#45B7D1',
    EntityType.OBJECT.value: '#FFA07A',
    EntityType.METRIC.value: '#98D8C8',
}

# Index entities by text once instead of scanning `filtered_entities` for
# every node. `setdefault` keeps the first entity seen for a given text,
# matching what a first-match linear scan would return.
entity_by_text = {}
for e in filtered_entities:
    entity_by_text.setdefault(e['text'], e)

# One colour per node, in node iteration order; grey for nodes that have no
# matching entity or whose type has no assigned colour.
node_colors = []
for node in G_connected.nodes():
    entity = entity_by_text.get(node)
    if entity:
        node_colors.append(type_to_color.get(entity['type'], '#CCCCCC'))
    else:
        node_colors.append('#CCCCCC')

print("Layout & colors prepared")
print(f"Explicit edges: {len(explicit_edges)}, Implicit edges: {len(implicit_edges)}")


Layout & colors prepared
Explicit edges: 22, Implicit edges: 99


In [None]:
import gravis as gv

# Work on a copy so the styling attributes added below do not leak into
# G_connected.
g = G_connected.copy()

# Fill colour per entity type, keyed by the enum's string value.
type_to_color = {
    EntityType.TASK.value: '#FF6B6B',
    EntityType.METHOD.value: '#4ECDC4',
    EntityType.DATASET.value: '#45B7D1',
    EntityType.OBJECT.value: '#FFA07A',
    EntityType.METRIC.value: '#98D8C8',
}

# Index entities by text once instead of scanning `filtered_entities` for
# every node. `setdefault` keeps the first entity seen for a given text,
# matching what a first-match linear scan would return.
entity_by_text = {}
for e in filtered_entities:
    entity_by_text.setdefault(e['text'], e)

# NOTE(review): the attribute names 'title', 'group' and 'width' used below
# may not be among the names gravis reads for tooltips/sizing — verify
# against the gravis documentation; they are left unchanged here.
for node in g.nodes():
    entity = entity_by_text.get(node)
    # Nodes without a matching entity are labelled 'unknown' and drawn grey.
    entity_type = entity['type'] if entity else 'unknown'
    color = type_to_color.get(entity_type, '#CCCCCC')

    g.nodes[node].update({
        'color': color,
        'title': f"ID: {node}\nTYPE: {entity_type}",
        'size': 20,
        'group': entity_type,
    })

for u, v, data in g.edges(data=True):
    # Edges with no 'edge_type' attribute are styled as implicit.
    edge_type = data.get('edge_type', 'implicit')

    if edge_type == 'explicit':
        # Solid blue, slightly thicker, for explicit relations.
        g[u][v]['color'] = '#2E86AB'
        g[u][v]['width'] = 2.5
    else:
        # Translucent grey for implicit relations.
        g[u][v]['color'] = 'rgba(169, 169, 169, 0.4)'
        g[u][v]['width'] = 2

    g[u][v]['title'] = f"Type: {edge_type}"

# The bare `fig` expression on the last line makes the notebook display the figure.
fig = gv.vis(g)
fig
