In [8]:
import os

# Clone the GitHub repository
!git clone https://github.com/hasanmansoor96/RCS

# List the contents of the cloned repository to confirm
repo_name = "RCS"
if os.path.exists(repo_name):
    print(f"\nContents of '{repo_name}':")
    !ls -F {repo_name}
else:
    print(f"Error: Repository '{repo_name}' not found after cloning.")

fatal: destination path 'RCS' already exists and is not an empty directory.

Contents of 'RCS':
TemporalKGs/


In [10]:
import os
import shutil

# Remove the existing RCS directory if it exists
repo_name = "RCS"
if os.path.exists(repo_name):
    shutil.rmtree(repo_name)
    print(f"Removed existing directory '{repo_name}'.")

# Clone the GitHub repository
!git clone https://github.com/hasanmansoor96/RCS

# List the contents of the cloned repository to confirm
if os.path.exists(repo_name):
    print(f"\nContents of '{repo_name}':")
    !ls -F {repo_name}
else:
    print(f"Error: Repository '{repo_name}' not found after cloning.")

Removed existing directory 'RCS'.
Cloning into 'RCS'...
remote: Enumerating objects: 34, done.
remote: Counting objects: 100% (34/34), done.
remote: Compressing objects: 100% (32/32), done.
remote: Total 34 (delta 12), reused 16 (delta 2), pack-reused 0 (from 0)
Receiving objects: 100% (34/34), 195.68 KiB | 6.31 MiB/s, done.
Resolving deltas: 100% (12/12), done.
Filtering content: 100% (5/5), 246.72 MiB | 42.34 MiB/s, done.

Contents of 'RCS':
TemporalKGs/


In [11]:
import json

json_dir = '/content/RCS/TemporalKGs/'


def load_json_matrix(filename, variable_name, directory=None, decode_hint=''):
    """Load one JSON signal matrix and report the outcome.

    Args:
        filename: Name of the JSON file inside ``directory``.
        variable_name: Name of the notebook variable the result is bound to;
            used only in the success message.
        directory: Directory prefix, including its trailing slash. Defaults to
            the notebook-level ``json_dir``.
        decode_hint: Extra text appended to the message printed when the file
            is not valid JSON.

    Returns:
        The parsed JSON object, or ``None`` when the file is missing or cannot
        be parsed (a message is printed in either case).
    """
    if directory is None:
        directory = json_dir
    try:
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with open(f'{directory}{filename}', 'r', encoding='utf-8') as f:
            matrix = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error loading '{filename}': {e}{decode_hint}")
        return None
    except FileNotFoundError:
        print(f"Error: '{filename}' not found at {directory}")
        return None
    print(f"Loaded '{filename}' into '{variable_name}'.")
    return matrix


# Each variable stays None when its file could not be loaded.
granger_matrix = load_json_matrix(
    'granger_influence_matrix_optimized.json',
    'granger_matrix',
)
embedding_similarity_matrix = load_json_matrix(
    'emb_sim_matrix.json',
    'embedding_similarity_matrix',
    decode_hint="\nThis file appears to be corrupted or malformed JSON.",
)
co_occurrence_matrix = load_json_matrix(
    'sparse_influence_matrix.json',
    'co_occurrence_matrix',
)

# Check if all files were loaded successfully. The failure message names the
# files that are unavailable without guessing at the cause (missing vs.
# malformed); the specific reason was already printed by the loader.
failed_files = [
    filename
    for filename, matrix in (
        ('granger_influence_matrix_optimized.json', granger_matrix),
        ('emb_sim_matrix.json', embedding_similarity_matrix),
        ('sparse_influence_matrix.json', co_occurrence_matrix),
    )
    if matrix is None
]
if not failed_files:
    print("All three JSON files loaded successfully.")
else:
    failed_list = ', '.join(f"'{name}'" for name in failed_files)
    print(f"Subtask partially completed: could not load {failed_list}. Cannot proceed with the full task without all three files.")

Loaded 'granger_influence_matrix_optimized.json' into 'granger_matrix'.
Loaded 'emb_sim_matrix.json' into 'embedding_similarity_matrix'.
Loaded 'sparse_influence_matrix.json' into 'co_occurrence_matrix'.
All three JSON files loaded successfully.


# Task
Generate a fused influence graph from the provided Granger causality matrix (`/content/RCS/TemporalKGs/granger_influence_matrix_optimized.json`), embedding similarity matrix (`/content/RCS/TemporalKGs/emb_sim_matrix.json`), and co-occurrence matrix (`/content/RCS/TemporalKGs/sparse_influence_matrix.json`). The process includes summarizing the embedding similarity matrix, building a master entity index, aligning and normalizing all three signals, fusing them using a weighted sum (co-occurrence: 0.4, Granger: 0.4, embedding similarity: 0.2), and then sparsifying the fused graph by retaining the top 30 neighbors and applying a minimum weight threshold of 0.05. Finally, export the resulting graph to `final_influence_graph.json` and provide a diagnostics report summarizing the process.

## Summarize Embedding Similarity Matrix

### Subtask:
Provide a summary of the loaded `embedding_similarity_matrix`, including the total number of entities and sample entries to confirm its structure and content. This confirms the successful loading and readiness of this crucial input.


**Reasoning**:
To summarize the `embedding_similarity_matrix`, I will first print the total number of unique entities (keys) to understand its size, and then display a few sample entries to confirm its structure and content as requested by the subtask.



In [12]:
print(f"Total number of unique entities in embedding_similarity_matrix: {len(embedding_similarity_matrix)}")

print("\nSample entries from embedding_similarity_matrix:")
# Show the first three entities, each with its first three similarity scores.
for entity, similarities in list(embedding_similarity_matrix.items())[:3]:
    leading_pairs = list(similarities.items())[:3]
    print(f"  Entity: '{entity}'")
    print(f"    Sample Similarities: {leading_pairs}...")

Total number of unique entities in embedding_similarity_matrix: 3413

Sample entries from embedding_similarity_matrix:
  Entity: 'Solomon Passy'
    Sample Similarities: [('Women (Austria)', 0.0), ('Tyco International', 0.0), ('Piletji Sebola', 0.0)]...
  Entity: 'Wolfgang SchÃ¼ssel'
    Sample Similarities: [('Azali Assouman', 0.189811), ('Zapatista Army of National Liberation', 0.064076), ('Piletji Sebola', 0.081682)]...
  Entity: 'Ho Mei-yueh'
    Sample Similarities: [('Provisional Irish Republican Army', 0.0), ('Carlo Azeglio Ciampi', 0.0), ('Huang Ju', 0.203046)]...


## Build Master Entity Index and Align Signals

### Subtask:
Extract all unique entities (nodes) from the loaded Granger, embedding similarity, and co-occurrence matrices to form a comprehensive master entity index. Create aligned data structures (e.g., pandas DataFrames or nested dictionaries) for each signal, ensuring all entities from the master index are present and missing values are handled (e.g., filled with 0).


**Reasoning**:
To begin, I will create a `master_entity_set` and populate it with every entity that appears in `granger_matrix`, `embedding_similarity_matrix`, or `co_occurrence_matrix` — both the top-level (source) keys and the nested (target) keys. This ensures that every entity present in any of the three input matrices is captured.



In [13]:
master_entity_set = set()

# Collect every source entity (outer key) and every target entity (inner key)
# from the Granger, embedding-similarity and co-occurrence matrices.
for signal_matrix in (granger_matrix, embedding_similarity_matrix, co_occurrence_matrix):
    for entity, targets in signal_matrix.items():
        master_entity_set.add(entity)
        master_entity_set.update(targets)

print(f"Master entity set created with {len(master_entity_set)} unique entities.")

Master entity set created with 5396 unique entities.


**Reasoning**:
Now that the `master_entity_set` is created, I will convert it into a sorted list named `master_entity_index`. Then, I will define the `align_matrix` helper function as described in the instructions to align each of the three matrices (`granger_matrix`, `embedding_similarity_matrix`, and `co_occurrence_matrix`) with the `master_entity_index`, handling missing values by setting them to 0. Finally, I will apply this function to create the aligned matrices.



In [14]:
# Fix a deterministic (alphabetical) ordering of all entities.
master_entity_index = sorted(master_entity_set)

print(f"Master entity index created with {len(master_entity_index)} sorted entities.")

def align_matrix(original_matrix, master_index):
    """Expand a sparse nested mapping into a dense one over ``master_index``.

    Args:
        original_matrix: Mapping of source entity -> {target entity: score}.
        master_index: Sequence of entity names that defines both the rows and
            the columns of the result.

    Returns:
        dict: For every source in ``master_index``, a dict holding a score for
        every target in ``master_index``; pairs absent from
        ``original_matrix`` get ``0``.
    """
    aligned = {}
    for src in master_index:
        # Sources unknown to the original matrix behave like an empty row.
        known_targets = original_matrix.get(src, {})
        aligned[src] = {dst: known_targets.get(dst, 0) for dst in master_index}
    return aligned

# Align all three signals against the shared entity index
aligned_granger_matrix = align_matrix(granger_matrix, master_entity_index)
aligned_embedding_similarity_matrix = align_matrix(embedding_similarity_matrix, master_entity_index)
aligned_co_occurrence_matrix = align_matrix(co_occurrence_matrix, master_entity_index)

# For each aligned signal, announce completion and show the first three target
# scores of its first row as a quick sanity check.
for signal_label, aligned_signal in (
    ("Granger matrix", aligned_granger_matrix),
    ("Embedding similarity matrix", aligned_embedding_similarity_matrix),
    ("Co-occurrence matrix", aligned_co_occurrence_matrix),
):
    print(f"{signal_label} aligned successfully. Sample entry:")
    if not aligned_signal:
        continue
    first_entity = next(iter(aligned_signal))
    print(f"  Entity: '{first_entity}'")
    print(f"    Sample Target Scores: {list(aligned_signal[first_entity].items())[:3]}...")


Master entity index created with 5396 sorted entities.
Granger matrix aligned successfully. Sample entry:
  Entity: 'A.B. Bardhan'
    Sample Target Scores: [('A.B. Bardhan', 0), ('A.K. Antony', 0), ('A.P. Sharma', 0)]...
Embedding similarity matrix aligned successfully. Sample entry:
  Entity: 'A.B. Bardhan'
    Sample Target Scores: [('A.B. Bardhan', 0), ('A.K. Antony', 0), ('A.P. Sharma', 0)]...
Co-occurrence matrix aligned successfully. Sample entry:
  Entity: 'A.B. Bardhan'
    Sample Target Scores: [('A.B. Bardhan', 0), ('A.K. Antony', 0), ('A.P. Sharma', 0)]...


## Normalize Signals

### Subtask:
For each of the three aligned signals (Granger, Embedding Similarity, Co-occurrence), perform row-wise (per source entity) min-max normalization. Ensure robust handling of rows with zero variance (e.g., all zeros), typically by setting their normalized scores to 0 or 1 based on context.


**Reasoning**:
As instructed, I will define a function `normalize_matrix` to perform row-wise min-max normalization on the aligned matrices, handling cases of zero variance by setting normalized scores to 0. Then, I will apply this function to `aligned_granger_matrix`, `aligned_embedding_similarity_matrix`, and `aligned_co_occurrence_matrix`, and print sample entries to verify the normalization.



In [15]:
def normalize_matrix(matrix, master_index):
    """Min-max normalize each row of a nested score mapping to ``[0, 1]``.

    Args:
        matrix: Mapping of source entity -> {target entity: score}. Rows or
            targets missing from the mapping are treated as a score of 0.0.
        master_index: Sequence of entity names defining the rows and columns
            of the result.

    Returns:
        dict: For every source in ``master_index``, a dict with a normalized
        score for every target in ``master_index``. Rows whose scores are all
        equal (including missing or empty rows) are set to 0.0 throughout.
    """
    normalized_matrix = {}
    for source_entity in master_index:
        row = matrix.get(source_entity, {})
        # Build the row exactly as it will be emitted, so that min/max cover
        # the implicit 0.0 of missing targets too. Taking min/max over only
        # the stored values would push those zeros outside [0, 1].
        scores = [row.get(target_entity, 0.0) for target_entity in master_index]
        min_val = min(scores, default=0.0)
        max_val = max(scores, default=0.0)

        if min_val == max_val:
            # Zero-variance row: no ranking information, so emit all zeros.
            normalized_matrix[source_entity] = dict.fromkeys(master_index, 0.0)
            continue

        normalized_matrix[source_entity] = {
            target_entity: (score - min_val) / (max_val - min_val)
            for target_entity, score in zip(master_index, scores)
        }
    return normalized_matrix

# Normalize each aligned signal row by row
normalized_granger_matrix = normalize_matrix(aligned_granger_matrix, master_entity_index)
normalized_embedding_similarity_matrix = normalize_matrix(aligned_embedding_similarity_matrix, master_entity_index)
normalized_co_occurrence_matrix = normalize_matrix(aligned_co_occurrence_matrix, master_entity_index)

# For each signal, print the first three scores of the first entity before and
# after normalization so the two can be compared.
signal_views = (
    ("Granger matrix", aligned_granger_matrix, normalized_granger_matrix),
    ("Embedding similarity matrix", aligned_embedding_similarity_matrix, normalized_embedding_similarity_matrix),
    ("Co-occurrence matrix", aligned_co_occurrence_matrix, normalized_co_occurrence_matrix),
)
for position, (signal_label, aligned_signal, normalized_signal) in enumerate(signal_views):
    # Every report after the first is separated by a blank line.
    lead = "\n" if position else ""
    print(f"{lead}{signal_label} normalized successfully. Sample entry (original vs normalized):")
    if not normalized_signal:
        continue
    first_entity = next(iter(master_entity_index))
    print(f"  Entity: '{first_entity}'")
    print(f"    Original Sample Target Scores: {list(aligned_signal[first_entity].items())[:3]}...")
    print(f"    Normalized Sample Target Scores: {list(normalized_signal[first_entity].items())[:3]}...")


Granger matrix normalized successfully. Sample entry (original vs normalized):
  Entity: 'A.B. Bardhan'
    Original Sample Target Scores: [('A.B. Bardhan', 0), ('A.K. Antony', 0), ('A.P. Sharma', 0)]...
    Normalized Sample Target Scores: [('A.B. Bardhan', 0.0), ('A.K. Antony', 0.0), ('A.P. Sharma', 0.0)]...

Embedding similarity matrix normalized successfully. Sample entry (original vs normalized):
  Entity: 'A.B. Bardhan'
    Original Sample Target Scores: [('A.B. Bardhan', 0), ('A.K. Antony', 0), ('A.P. Sharma', 0)]...
    Normalized Sample Target Scores: [('A.B. Bardhan', 0.0), ('A.K. Antony', 0.0), ('A.P. Sharma', 0.0)]...

Co-occurrence matrix normalized successfully. Sample entry (original vs normalized):
  Entity: 'A.B. Bardhan'
    Original Sample Target Scores: [('A.B. Bardhan', 0), ('A.K. Antony', 0), ('A.P. Sharma', 0)]...
    Normalized Sample Target Scores: [('A.B. Bardhan', 0.0), ('A.K. Antony', 0.0), ('A.P. Sharma', 0.0)]...


**Reasoning**:
The next step is to fuse the normalized matrices using the specified weighted sum. I will define a function to perform this weighted sum for each entity and its targets across the three normalized matrices and then apply it to create the `fused_influence_matrix`.



In [16]:
def fuse_matrices(granger_matrix, embedding_matrix, co_occurrence_matrix, master_index, weights):
    """Combine three score matrices into one by a per-pair weighted sum.

    Args:
        granger_matrix: Mapping of source -> {target: score}.
        embedding_matrix: Mapping of source -> {target: score}.
        co_occurrence_matrix: Mapping of source -> {target: score}.
        master_index: Sequence of entity names defining the rows and columns
            of the result.
        weights: Mapping with the keys ``'co_occurrence'``, ``'granger'`` and
            ``'embedding_similarity'``.

    Returns:
        dict: source -> {target: fused score} for every pair drawn from
        ``master_index``. A pair missing from any input contributes 0.0 for
        that input.
    """
    fused = {}
    for src in master_index:
        granger_row = granger_matrix.get(src, {})
        embedding_row = embedding_matrix.get(src, {})
        co_occurrence_row = co_occurrence_matrix.get(src, {})
        fused[src] = {
            dst: (
                weights['co_occurrence'] * co_occurrence_row.get(dst, 0.0)
                + weights['granger'] * granger_row.get(dst, 0.0)
                + weights['embedding_similarity'] * embedding_row.get(dst, 0.0)
            )
            for dst in master_index
        }
    return fused

# Fusion weights for the three signals
weights = dict(co_occurrence=0.4, granger=0.4, embedding_similarity=0.2)

# Weighted sum of the three normalized signals
fused_influence_matrix = fuse_matrices(
    granger_matrix=normalized_granger_matrix,
    embedding_matrix=normalized_embedding_similarity_matrix,
    co_occurrence_matrix=normalized_co_occurrence_matrix,
    master_index=master_entity_index,
    weights=weights,
)

print("Fused influence matrix created successfully. Sample entry:")
if fused_influence_matrix:
    # Show the first five fused scores of the first entity in the index.
    first_entity = next(iter(master_entity_index))
    sample_scores = list(fused_influence_matrix[first_entity].items())[:5]
    print(f"  Entity: '{first_entity}'")
    print(f"    Sample Fused Scores: {sample_scores}...")


Fused influence matrix created successfully. Sample entry:
  Entity: 'A.B. Bardhan'
    Sample Fused Scores: [('A.B. Bardhan', 0.0), ('A.K. Antony', 0.0), ('A.P. Sharma', 0.0), ('A.S.S. Hannan Shah', 0.0), ('ACT Party', 0.1803330732599747)]...


**Reasoning**:
The next step is to sparsify the `fused_influence_matrix` as per the task instructions. This involves retaining only the top 30 neighbors for each source entity and applying a minimum weight threshold of 0.05. I will iterate through the fused matrix, filter based on the threshold, sort by score, and then select the top N neighbors.



In [17]:
import json

# Sparsification parameters
TOP_N_NEIGHBORS = 30
MIN_WEIGHT_THRESHOLD = 0.05

sparsified_influence_graph = {}
for source_entity, target_scores in fused_influence_matrix.items():
    # Keep only edges at or above the minimum weight ...
    candidates = [
        (target, score)
        for target, score in target_scores.items()
        if score >= MIN_WEIGHT_THRESHOLD
    ]
    # ... rank them strongest-first (stable for ties) and keep the top N.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_n_targets = dict(candidates[:TOP_N_NEIGHBORS])

    # Sources left without any edge are omitted from the graph.
    if top_n_targets:
        sparsified_influence_graph[source_entity] = top_n_targets

print(f"Sparsified influence graph created. Total entities with outgoing edges: {len(sparsified_influence_graph)}")

# Print a sample entry from the sparsified graph to verify
print("\nSample entry from sparsified_influence_graph:")
if not sparsified_influence_graph:
    print("  No entries in the sparsified graph. (This might indicate all scores were below threshold)")
else:
    first_entity = next(iter(sparsified_influence_graph))
    print(f"  Entity: '{first_entity}'")
    print(f"    Sparsified Target Scores: {list(sparsified_influence_graph[first_entity].items())[:5]}...")


Sparsified influence graph created. Total entities with outgoing edges: 5395

Sample entry from sparsified_influence_graph:
  Entity: 'A.B. Bardhan'
    Sparsified Target Scores: [('Islamic Extremist Group (Pakistan)', 0.8), ('Alliance for the Future', 0.7812499700547495), ('Aide Medicale Internationale', 0.4), ('Alvaro Leyva', 0.4), ('Criminal (Barbados)', 0.4)]...


**Reasoning**:
Now that the influence graph has been sparsified, I will save the `sparsified_influence_graph` to a JSON file named `final_influence_graph.json` as specified in the task.



In [18]:
# Write the sparsified graph to disk as indented JSON.
output_file_path = 'final_influence_graph.json'
with open(output_file_path, 'w') as graph_file:
    json.dump(sparsified_influence_graph, graph_file, indent=4)

print(f"Sparsified influence graph saved to '{output_file_path}'.")

Sparsified influence graph saved to 'final_influence_graph.json'.


## Diagnostics Report

This report summarizes the process of generating the fused influence graph:

1.  **Data Loading**: Successfully loaded three JSON files:
    *   `granger_influence_matrix_optimized.json` (into `granger_matrix`)
    *   `emb_sim_matrix.json` (into `embedding_similarity_matrix`)
    *   `sparse_influence_matrix.json` (into `co_occurrence_matrix`)

2.  **Embedding Similarity Matrix Summary**: Confirmed `embedding_similarity_matrix` contained 3413 unique entities.

3.  **Master Entity Index**: A `master_entity_index` was created, containing 5396 unique entities, combining all entities from the three input matrices.

4.  **Signal Alignment**: All three matrices were successfully aligned to the `master_entity_index`, with missing values filled as 0.

5.  **Signal Normalization**: Each aligned matrix (`granger`, `embedding similarity`, `co-occurrence`) was normalized row-wise (per source entity) using min-max scaling. Rows with zero variance had all their scores set to 0.

6.  **Fusion**: The normalized matrices were fused into `fused_influence_matrix` using a weighted sum with the following weights:
    *   Co-occurrence: 0.4
    *   Granger: 0.4
    *   Embedding Similarity: 0.2

7.  **Sparsification**: The `fused_influence_matrix` was sparsified by:
    *   Retaining only the top **30** neighbors for each source entity.
    *   Applying a minimum weight threshold of **0.05**.
    *   The resulting `sparsified_influence_graph` retains at least one outgoing influence edge for **5395** of the 5396 source entities.

8.  **Output**: The final `sparsified_influence_graph` was successfully saved to `final_influence_graph.json`.

## Summary:

### Data Analysis Key Findings

*   The initial `embedding_similarity_matrix` contained 3413 unique entities.
*   A comprehensive `master_entity_index` was created, encompassing 5396 unique entities from all three input matrices (Granger, embedding similarity, and co-occurrence).
*   All three signals were successfully aligned to this master index, with missing values set to 0.
*   Each aligned matrix underwent row-wise (per source entity) min-max normalization. Rows with zero variance had all their scores set to 0.
*   The normalized matrices were fused using a weighted sum with the following coefficients: co-occurrence: 0.4, Granger: 0.4, and embedding similarity: 0.2.
*   The fused influence matrix was sparsified by retaining the top 30 neighbors for each source entity and applying a minimum weight threshold of 0.05.
*   The resulting `sparsified_influence_graph` retains at least one outgoing influence edge for 5395 of the 5396 source entities.

### Insights or Next Steps

*   The successful fusion of diverse influence signals (temporal causality, semantic similarity, and co-occurrence) into a single graph provides a more comprehensive view of entity relationships than any single signal alone.
*   Further analysis could involve evaluating the quality and utility of the fused graph for downstream tasks, such as link prediction or community detection, and experimenting with different weighting schemes or sparsification parameters to optimize performance.
