In [1]:
import os

# Clone the GitHub repository
!git clone https://github.com/hasanmansoor96/RCS

# List the contents of the cloned repository to confirm
repo_name = "RCS"
if os.path.exists(repo_name):
    print(f"\nContents of '{repo_name}':")
    !ls -F {repo_name}
else:
    print(f"Error: Repository '{repo_name}' not found after cloning.")

Cloning into 'RCS'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 60 (delta 25), reused 26 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (60/60), 2.35 MiB | 8.82 MiB/s, done.
Resolving deltas: 100% (25/25), done.
Filtering content: 100% (5/5), 246.72 MiB | 22.96 MiB/s, done.

Contents of 'RCS':
README.md  TemporalKGs/


# Phase 3: Baseline vs Influence-Aware Temporal Prediction

"
"This notebook answers the core question:
"
"**Does using `final_influence_graph.json` improve temporal link prediction?**

"
"It runs two conditions on ICEWS normalized files:
"
"1. Baseline temporal model (`TTransE-lite`)
"
"2. Baseline + influence-aware auxiliary updates

"
"Metrics: `Hits@3`, `Hits@5`, `Hits@10`, `MRR`
"


## 1) Why Time-Based Split Matters

"
"If train/valid/test are random across all years, the model can train on future events and then be evaluated on earlier events.
"
"That leaks temporal information and can inflate ranking metrics.

"
"For the question *"predict at time `t+Î”`"*, evaluation must respect chronology:
"
"- Train on earlier time windows
"
"- Tune on later windows
"
"- Test on the latest windows

"
"This notebook still supports your current split files, but the evaluation target is truly faithful only with chronological splitting.
"


In [17]:
from __future__ import annotations

import csv
import json
import random
import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import DefaultDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


def set_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def get_or_add(mapping: dict[str, int], key: str) -> int:
    value = mapping.get(key)
    if value is None:
        value = len(mapping)
        mapping[key] = value
    return value


set_seed(7)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [18]:
# Config

SCRIPT_DIR = Path("/content/RCS/TemporalKGs")
DATA_DIR = SCRIPT_DIR / "/content/RCS/TemporalKGs/icews05-15_aug_inverse_time_year"
INFLUENCE_GRAPH_PATH = SCRIPT_DIR / "/content/RCS/TemporalKGs/final_influence_graph.json"
OUTPUT_DIR = SCRIPT_DIR / "/content/RCS/TemporalKGs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

RUN_MODE = "both"  # baseline | influence | both

EMB_DIM = 128
EPOCHS = 5
BATCH_SIZE = 2048
EVAL_BATCH_SIZE = 32
EVAL_MAX_SAMPLES = 20000  # <=0 for full split
LR = 1e-3
WEIGHT_DECAY = 1e-6
GRAD_CLIP = 5.0
LOG_EVERY_STEPS = 100

INFLUENCE_LAMBDA = 0.05
MAX_INFLUENCE_NEIGHBORS = 5


In [19]:
@dataclass
class TripleSplit:
    h: torch.Tensor
    r: torch.Tensor
    t: torch.Tensor
    time_id: torch.Tensor

    def __len__(self) -> int:
        return int(self.h.shape[0])


class TemporalTransELite(nn.Module):
    def __init__(self, num_entities: int, num_relations: int, num_times: int, emb_dim: int) -> None:
        super().__init__()
        self.entity_emb = nn.Embedding(num_entities, emb_dim)
        self.relation_emb = nn.Embedding(num_relations, emb_dim)
        self.time_emb = nn.Embedding(num_times, emb_dim)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.xavier_uniform_(self.entity_emb.weight)
        nn.init.xavier_uniform_(self.relation_emb.weight)
        nn.init.xavier_uniform_(self.time_emb.weight)

    def score_triples(
        self,
        h_idx: torch.Tensor,
        r_idx: torch.Tensor,
        t_idx: torch.Tensor,
        time_idx: torch.Tensor,
    ) -> torch.Tensor:
        h = self.entity_emb(h_idx)
        r = self.relation_emb(r_idx)
        tail = self.entity_emb(t_idx)
        time_vec = self.time_emb(time_idx)
        return -(h + r + time_vec - tail).abs().sum(dim=1)

    def score_all_tails(
        self,
        h_idx: torch.Tensor,
        r_idx: torch.Tensor,
        time_idx: torch.Tensor,
    ) -> torch.Tensor:
        query = self.entity_emb(h_idx) + self.relation_emb(r_idx) + self.time_emb(time_idx)
        return -torch.cdist(query, self.entity_emb.weight, p=1.0)


In [20]:
def encode_split(
    path: Path,
    entity_to_id: dict[str, int],
    relation_to_id: dict[str, int],
    time_to_id: dict[int, int],
) -> TripleSplit:
    h_idx: list[int] = []
    r_idx: list[int] = []
    t_idx: list[int] = []
    time_idx: list[int] = []

    with path.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="	")
        for row in reader:
            h_idx.append(get_or_add(entity_to_id, row["head"]))
            r_idx.append(get_or_add(relation_to_id, row["relation"]))
            t_idx.append(get_or_add(entity_to_id, row["tail"]))
            time_raw = int(row["time_index"])
            time_idx.append(get_or_add(time_to_id, time_raw))

    return TripleSplit(
        h=torch.tensor(h_idx, dtype=torch.long),
        r=torch.tensor(r_idx, dtype=torch.long),
        t=torch.tensor(t_idx, dtype=torch.long),
        time_id=torch.tensor(time_idx, dtype=torch.long),
    )


def load_data(data_dir: Path) -> tuple[TripleSplit, TripleSplit, TripleSplit, dict[str, int], dict[str, int], dict[int, int]]:
    train_path = data_dir / "/content/RCS/TemporalKGs/icews05-15_aug_inverse_time_year/icews_2005-2015_train_normalized.txt"
    valid_path = data_dir / "/content/RCS/TemporalKGs/icews05-15_aug_inverse_time_year/icews_2005-2015_valid_normalized.txt"
    test_path = data_dir / "/content/RCS/TemporalKGs/icews05-15_aug_inverse_time_year/icews_2005-2015_test_normalized.txt"

    for p in (train_path, valid_path, test_path):
        if not p.exists():
            raise FileNotFoundError(f"Missing data file: {p}")

    entity_to_id: dict[str, int] = {}
    relation_to_id: dict[str, int] = {}
    time_to_id: dict[int, int] = {}

    train = encode_split(train_path, entity_to_id, relation_to_id, time_to_id)
    valid = encode_split(valid_path, entity_to_id, relation_to_id, time_to_id)
    test = encode_split(test_path, entity_to_id, relation_to_id, time_to_id)

    print(
        "Loaded splits: "
        f"train={len(train):,}, valid={len(valid):,}, test={len(test):,}, "
        f"entities={len(entity_to_id):,}, relations={len(relation_to_id):,}, times={len(time_to_id):,}"
    )
    return train, valid, test, entity_to_id, relation_to_id, time_to_id


def build_true_tails(*splits: TripleSplit) -> DefaultDict[tuple[int, int, int], set[int]]:
    true_tails: DefaultDict[tuple[int, int, int], set[int]] = defaultdict(set)
    for split in splits:
        h_np = split.h.numpy()
        r_np = split.r.numpy()
        t_np = split.t.numpy()
        time_np = split.time_id.numpy()
        for h, r, tail, ti in zip(h_np, r_np, t_np, time_np, strict=True):
            true_tails[(int(h), int(r), int(ti))].add(int(tail))
    return true_tails


In [21]:
def load_influence_adjacency(
    influence_graph_path: Path,
    entity_to_id: dict[str, int],
    max_neighbors: int,
) -> dict[int, list[tuple[int, float]]]:
    if not influence_graph_path.exists():
        raise FileNotFoundError(f"Influence graph not found: {influence_graph_path}")

    with influence_graph_path.open("r", encoding="utf-8") as f:
        raw_graph = json.load(f)

    adjacency: dict[int, list[tuple[int, float]]] = {}
    total_edges = 0

    for src_name, raw_neighbors in raw_graph.items():
        src_id = entity_to_id.get(src_name)
        if src_id is None:
            continue

        neighbors_sorted = sorted(raw_neighbors.items(), key=lambda kv: kv[1], reverse=True)
        kept_neighbors: list[tuple[int, float]] = []

        for dst_name, weight in neighbors_sorted:
            dst_id = entity_to_id.get(dst_name)
            if dst_id is None or dst_id == src_id:
                continue
            w = float(weight)
            if w <= 0.0:
                continue
            kept_neighbors.append((dst_id, w))
            if max_neighbors > 0 and len(kept_neighbors) >= max_neighbors:
                break

        if kept_neighbors:
            adjacency[src_id] = kept_neighbors
            total_edges += len(kept_neighbors)

    print(f"Influence adjacency: sources={len(adjacency):,}, edges={total_edges:,}")
    return adjacency


def compute_influence_loss(
    model: TemporalTransELite,
    batch_entities_cpu: torch.Tensor,
    influence_adjacency: dict[int, list[tuple[int, float]]],
    device: torch.device,
) -> torch.Tensor:
    src_ids: list[int] = []
    dst_ids: list[int] = []
    weights: list[float] = []

    for src_id in batch_entities_cpu.tolist():
        neighbors = influence_adjacency.get(int(src_id))
        if not neighbors:
            continue
        for dst_id, w in neighbors:
            src_ids.append(int(src_id))
            dst_ids.append(int(dst_id))
            weights.append(float(w))

    if not src_ids:
        return torch.zeros((), device=device)

    src_tensor = torch.tensor(src_ids, dtype=torch.long, device=device)
    dst_tensor = torch.tensor(dst_ids, dtype=torch.long, device=device)
    w_tensor = torch.tensor(weights, dtype=torch.float32, device=device)

    diff = model.entity_emb(src_tensor) - model.entity_emb(dst_tensor)
    return (w_tensor * diff.pow(2).sum(dim=1)).mean()


In [22]:
def train_one_condition(
    model: TemporalTransELite,
    train_data: TripleSplit,
    num_entities: int,
    influence_adjacency: dict[int, list[tuple[int, float]]] | None = None,
    influence_lambda: float = 0.0,
) -> None:
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    num_train = len(train_data)
    step = 0

    for epoch in range(1, EPOCHS + 1):
        epoch_start = time.time()
        permutation = torch.randperm(num_train)
        epoch_loss = 0.0
        batches = 0

        model.train()
        for start in range(0, num_train, BATCH_SIZE):
            end = min(start + BATCH_SIZE, num_train)
            idx = permutation[start:end]

            h_cpu = train_data.h[idx]
            r_cpu = train_data.r[idx]
            t_cpu = train_data.t[idx]
            time_cpu = train_data.time_id[idx]
            neg_t_cpu = torch.randint(0, num_entities, (len(idx),), dtype=torch.long)

            h = h_cpu.to(device)
            r = r_cpu.to(device)
            t = t_cpu.to(device)
            time_idx = time_cpu.to(device)
            neg_t = neg_t_cpu.to(device)

            pos_scores = model.score_triples(h, r, t, time_idx)
            neg_scores = model.score_triples(h, r, neg_t, time_idx)
            base_loss = -F.logsigmoid(pos_scores).mean() - F.logsigmoid(-neg_scores).mean()

            influence_loss = torch.zeros((), device=device)
            if influence_adjacency and influence_lambda > 0.0:
                batch_entities = torch.unique(torch.cat([h_cpu, t_cpu], dim=0))
                influence_loss = compute_influence_loss(model, batch_entities, influence_adjacency, device)

            loss = base_loss + influence_lambda * influence_loss

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            if GRAD_CLIP > 0:
                nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            optimizer.step()

            with torch.no_grad():
                model.entity_emb.weight.data = F.normalize(model.entity_emb.weight.data, p=2, dim=1)
                model.relation_emb.weight.data = F.normalize(model.relation_emb.weight.data, p=2, dim=1)
                model.time_emb.weight.data = F.normalize(model.time_emb.weight.data, p=2, dim=1)

            step += 1
            batches += 1
            epoch_loss += float(loss.detach().cpu())

            if LOG_EVERY_STEPS > 0 and step % LOG_EVERY_STEPS == 0:
                print(f"step={step:,} epoch={epoch}/{EPOCHS} loss={float(loss):.4f}")

        print(
            f"epoch={epoch}/{EPOCHS} avg_loss={epoch_loss / max(1, batches):.4f} "
            f"time={time.time() - epoch_start:.1f}s"
        )


@torch.no_grad()
def evaluate_hits_at_k(
    model: TemporalTransELite,
    split: TripleSplit,
    true_tails: DefaultDict[tuple[int, int, int], set[int]],
) -> dict[str, float]:
    model.eval()
    n_total = len(split)
    n_eval = min(n_total, EVAL_MAX_SAMPLES) if EVAL_MAX_SAMPLES > 0 else n_total
    indices = torch.arange(n_eval, dtype=torch.long)

    hits3 = 0
    hits5 = 0
    hits10 = 0
    mrr_sum = 0.0

    for start in range(0, n_eval, EVAL_BATCH_SIZE):
        end = min(start + EVAL_BATCH_SIZE, n_eval)
        idx = indices[start:end]

        h_cpu = split.h[idx]
        r_cpu = split.r[idx]
        t_cpu = split.t[idx]
        time_cpu = split.time_id[idx]

        h = h_cpu.to(device)
        r = r_cpu.to(device)
        t = t_cpu.to(device)
        ti = time_cpu.to(device)

        scores = model.score_all_tails(h, r, ti)
        bsz = int(scores.shape[0])

        for i in range(bsz):
            key = (int(h_cpu[i]), int(r_cpu[i]), int(time_cpu[i]))
            true_tail = int(t_cpu[i])
            for other_true in true_tails[key]:
                if other_true != true_tail:
                    scores[i, other_true] = -1e9

        true_scores = scores[torch.arange(bsz, device=device), t]
        ranks = 1 + torch.sum(scores > true_scores.unsqueeze(1), dim=1)
        ranks_cpu = ranks.detach().cpu()

        hits3 += int((ranks_cpu <= 3).sum().item())
        hits5 += int((ranks_cpu <= 5).sum().item())
        hits10 += int((ranks_cpu <= 10).sum().item())
        mrr_sum += float((1.0 / ranks_cpu.float()).sum().item())

    denom = float(n_eval)
    return {
        "num_eval_samples": int(n_eval),
        "hits@3": hits3 / denom,
        "hits@5": hits5 / denom,
        "hits@10": hits10 / denom,
        "mrr": mrr_sum / denom,
    }


def run_condition(
    name: str,
    train_data: TripleSplit,
    valid_data: TripleSplit,
    test_data: TripleSplit,
    true_tails: DefaultDict[tuple[int, int, int], set[int]],
    num_entities: int,
    num_relations: int,
    num_times: int,
    influence_adjacency: dict[int, list[tuple[int, float]]] | None,
    influence_lambda: float,
) -> dict[str, object]:
    print(f"\n=== Running {name} ===")
    model = TemporalTransELite(num_entities, num_relations, num_times, EMB_DIM).to(device)

    train_one_condition(
        model=model,
        train_data=train_data,
        num_entities=num_entities,
        influence_adjacency=influence_adjacency,
        influence_lambda=influence_lambda,
    )

    print(f"Evaluating {name} on valid...")
    valid_metrics = evaluate_hits_at_k(model, valid_data, true_tails)
    print(valid_metrics)

    print(f"Evaluating {name} on test...")
    test_metrics = evaluate_hits_at_k(model, test_data, true_tails)
    print(test_metrics)

    return {
        "condition": name,
        "influence_lambda": influence_lambda,
        "valid": valid_metrics,
        "test": test_metrics,
    }


In [23]:
# Load data and run experiments

train_data, valid_data, test_data, entity_to_id, relation_to_id, time_to_id = load_data(DATA_DIR)
true_tails = build_true_tails(train_data, valid_data, test_data)

num_entities = len(entity_to_id)
num_relations = len(relation_to_id)
num_times = len(time_to_id)

baseline_result = None
influence_result = None

if RUN_MODE in {"baseline", "both"}:
    baseline_result = run_condition(
        name="baseline",
        train_data=train_data,
        valid_data=valid_data,
        test_data=test_data,
        true_tails=true_tails,
        num_entities=num_entities,
        num_relations=num_relations,
        num_times=num_times,
        influence_adjacency=None,
        influence_lambda=0.0,
    )

if RUN_MODE in {"influence", "both"}:
    influence_adjacency = load_influence_adjacency(
        influence_graph_path=INFLUENCE_GRAPH_PATH,
        entity_to_id=entity_to_id,
        max_neighbors=MAX_INFLUENCE_NEIGHBORS,
    )
    influence_result = run_condition(
        name="influence_aware",
        train_data=train_data,
        valid_data=valid_data,
        test_data=test_data,
        true_tails=true_tails,
        num_entities=num_entities,
        num_relations=num_relations,
        num_times=num_times,
        influence_adjacency=influence_adjacency,
        influence_lambda=INFLUENCE_LAMBDA,
    )


Loaded splits: train=1,106,886, valid=138,825, test=138,276, entities=10,488, relations=2,775, times=132

=== Running baseline ===
step=100 epoch=1/5 loss=17.0648
step=200 epoch=1/5 loss=15.7159
step=300 epoch=1/5 loss=13.9225
step=400 epoch=1/5 loss=11.6179
step=500 epoch=1/5 loss=8.8352
epoch=1/5 avg_loss=13.8943 time=17.6s
step=600 epoch=2/5 loss=6.1950
step=700 epoch=2/5 loss=4.2671
step=800 epoch=2/5 loss=2.9889
step=900 epoch=2/5 loss=2.4585
step=1,000 epoch=2/5 loss=2.1411
epoch=2/5 avg_loss=3.5414 time=16.4s
step=1,100 epoch=3/5 loss=1.7280
step=1,200 epoch=3/5 loss=1.5590
step=1,300 epoch=3/5 loss=1.4753
step=1,400 epoch=3/5 loss=1.3893
step=1,500 epoch=3/5 loss=1.3052
step=1,600 epoch=3/5 loss=1.2370
epoch=3/5 avg_loss=1.4485 time=16.8s
step=1,700 epoch=4/5 loss=1.1994
step=1,800 epoch=4/5 loss=1.1722
step=1,900 epoch=4/5 loss=1.1051
step=2,000 epoch=4/5 loss=1.1179
step=2,100 epoch=4/5 loss=1.0787
epoch=4/5 avg_loss=1.1542 time=17.3s
step=2,200 epoch=5/5 loss=1.0743
step=2,3

In [24]:
# Save results

results = {
    "config": {
        "run_mode": RUN_MODE,
        "emb_dim": EMB_DIM,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "eval_batch_size": EVAL_BATCH_SIZE,
        "eval_max_samples": EVAL_MAX_SAMPLES,
        "lr": LR,
        "weight_decay": WEIGHT_DECAY,
        "influence_lambda": INFLUENCE_LAMBDA,
        "max_influence_neighbors": MAX_INFLUENCE_NEIGHBORS,
        "device": str(device),
    },
    "dataset_stats": {
        "train": len(train_data),
        "valid": len(valid_data),
        "test": len(test_data),
        "entities": len(entity_to_id),
        "relations": len(relation_to_id),
        "times": len(time_to_id),
    },
    "results": {},
}

if baseline_result is not None:
    results["results"]["baseline"] = baseline_result
if influence_result is not None:
    results["results"]["influence_aware"] = influence_result

if baseline_result is not None and influence_result is not None:
    delta_test = {}
    for metric in ("hits@3", "hits@5", "hits@10", "mrr"):
        delta_test[metric] = float(influence_result["test"][metric]) - float(baseline_result["test"][metric])
    results["results"]["delta_test"] = delta_test
    print("\nTest delta (influence - baseline):", delta_test)

out_path = OUTPUT_DIR / "phase3_results.json"
with out_path.open("w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"Saved: {out_path}")



Test delta (influence - baseline): {'hits@3': 0.0058, 'hits@5': 0.005850000000000022, 'hits@10': 0.0020000000000000018, 'mrr': 0.0017051832199096728}
Saved: /content/RCS/TemporalKGs/phase3_results.json
