# LMM and Graphs

Let's create a toy dataset

In [2]:
!pip install torch_geometric
import sys
import torch

from torch_geometric.data import Data

# Assume a toy dataset with 3 papers (nodes), edges, and labels
data = Data(
    x=torch.rand(3, 10),  # Random node features
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long).t().contiguous(),  # Edges (transposed for PyG)
    y=torch.tensor([0, 1, 2], dtype=torch.long),  # True labels (3 classes)
    text=["Paper A abstract about machine learning", 
          "Paper B abstract about deep learning", 
          "Paper C abstract about neural networks"],  # Text data
)

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m63.7/63.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.3/1.3 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [3]:
num_classes = len(torch.unique(data.y))  # Number of unique classes

print(f"Dataset info:")
print(f"  Number of nodes: {data.x.size(0)}")
print(f"  Node feature dimension: {data.x.size(1)}")
print(f"  Number of edges: {data.edge_index.size(1)}")
print(f"  Number of classes: {num_classes}")
print(f"  True labels: {data.y.tolist()}")

Dataset info:
  Number of nodes: 3
  Node feature dimension: 10
  Number of edges: 2
  Number of classes: 3
  True labels: [0, 1, 2]


In [None]:
%pip install transformers

from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# 1. Define the Graph Neural Network (GNN)
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)  # Output num_classes
        self.dropout = torch.nn.Dropout(0.2)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return x  # Return logits (not softmax)

# 2. Define the Text Encoder (BERT-based)
class TextEncoder(torch.nn.Module):
    def __init__(self, model_name="bert-base-uncased", num_classes=3):
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Project from BERT's hidden size to number of classes
        self.classifier = torch.nn.Linear(self.model.config.hidden_size, num_classes)
        self.dropout = torch.nn.Dropout(0.1)
    
    def forward(self, texts):
        # Tokenize and encode text data
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():  # Freeze BERT parameters during training
            outputs = self.model(**inputs)
        
        # Use [CLS] token embedding
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        cls_embedding = self.dropout(cls_embedding)
        logits = self.classifier(cls_embedding)
        return logits  # Return logits (not softmax)


# 4. Training Loop with Bidirectional Pseudo-label Exchange
def train_prediction_alignment(data, gnn, text_encoder, num_iterations=5):
    optimizer_gnn = torch.optim.Adam(gnn.parameters(), lr=0.01)
    optimizer_text = torch.optim.Adam(text_encoder.parameters(), lr=0.0001)
    
    # Initialize with true labels for first iteration
    gnn_pseudo_labels = data.y.clone()
    llm_pseudo_labels = data.y.clone()
    
    for iteration in range(num_iterations):
        # 4.1 Train GNN using LLM pseudo-labels from previous iteration
        gnn.train()
        optimizer_gnn.zero_grad()
        gnn_logits = gnn(data.x, data.edge_index)
        gnn_loss = torch.nn.CrossEntropyLoss()(gnn_logits, llm_pseudo_labels)
        gnn_loss.backward()
        optimizer_gnn.step()
        
        # Generate new GNN pseudo-labels
        with torch.no_grad():
            gnn_pseudo_labels = torch.argmax(gnn_logits, dim=1)
        
        # 4.2 Train Text Encoder using GNN pseudo-labels
        text_encoder.train()
        optimizer_text.zero_grad()
        text_logits = text_encoder(data.text)
        llm_loss = torch.nn.CrossEntropyLoss()(text_logits, gnn_pseudo_labels)
        llm_loss.backward()
        optimizer_text.step()
        
        # Generate new LLM pseudo-labels for next iteration
        with torch.no_grad():
            llm_pseudo_labels = torch.argmax(text_logits, dim=1)
        
        print(f"Iteration {iteration+1}: GNN Loss = {gnn_loss.item():.4f}, LLM Loss = {llm_loss.item():.4f}")
        print(f"  GNN predictions: {gnn_pseudo_labels.tolist()}")
        print(f"  LLM predictions: {llm_pseudo_labels.tolist()}")

# Initialize models and train
input_dim = data.x.size(1)  # Node feature dimension
hidden_dim = 64

gnn = GNN(input_dim=input_dim, hidden_dim=hidden_dim, num_classes=num_classes)
text_encoder = TextEncoder(num_classes=num_classes)

print("Starting training...")
train_prediction_alignment(data, gnn, text_encoder, num_iterations=5)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Starting training...
Iteration 1: GNN Loss = 1.0962, LLM Loss = 0.8473
  GNN predictions: [0, 0, 1]
  LLM predictions: [0, 0, 0]
Iteration 2: GNN Loss = 1.1164, LLM Loss = 1.1368
  GNN predictions: [1, 1, 1]
  LLM predictions: [0, 0, 0]
Iteration 3: GNN Loss = 0.8742, LLM Loss = 0.6527
  GNN predictions: [0, 0, 0]
  LLM predictions: [0, 0, 0]
Iteration 4: GNN Loss = 0.5780, LLM Loss = 0.6721
  GNN predictions: [0, 0, 0]
  LLM predictions: [0, 0, 0]
Iteration 5: GNN Loss = 0.4746, LLM Loss = 0.6496
  GNN predictions: [0, 0, 0]
  LLM predictions: [0, 0, 0]


In [None]:
# Import libraries
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GraphConv
from torch_geometric.data import Data

# 1. Define the GNN
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNN, self).__init__()
        self.conv = GraphConv(input_dim, hidden_dim)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

# 2. Define the Text Encoder (LLM)
class TextEncoder(torch.nn.Module):
    def __init__(self, model_name="bert-base-uncased", output_dim=128):
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.fc = torch.nn.Linear(self.model.config.hidden_size, output_dim)

    def forward(self, texts):
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        return self.fc(cls_embedding)

# 3. Contrastive Learning Objective
def contrastive_loss(graph_emb, text_emb, tau=0.1):
    sim = F.cosine_similarity(graph_emb.unsqueeze(1), text_emb.unsqueeze(0), dim=2)
    labels = torch.arange(sim.size(0)).to(sim.device)
    loss = F.cross_entropy(sim / tau, labels)
    return loss

# 4. Training Loop for Latent Space Alignment
def train_latent_alignment(data, gnn, text_encoder, epochs=10):
    optimizer = torch.optim.Adam(list(gnn.parameters()) + list(text_encoder.parameters()), lr=0.001)
    for epoch in range(epochs):
        optimizer.zero_grad()

        # Encode graph and text
        graph_emb = gnn(data.x, data.edge_index)  # Graph embeddings
        text_emb = text_encoder(data.text)  # Text embeddings

        # Compute contrastive loss
        loss = contrastive_loss(graph_emb, text_emb)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}: Loss = {loss.item()}")

# 5. Example Data
# Toy data with 3 products and their relationships
data = Data(
    x=torch.rand(3, 10),  # Node features
    edge_index=torch.tensor([[0, 1], [1, 2]], dtype=torch.long).t().contiguous(),  # Edges (transposed for PyG)
    text=["Product A description", "Product B description", "Product C description"],  # Text data
)

# Initialize models and train
gnn = GNN(input_dim=10, hidden_dim=128)
text_encoder = TextEncoder()
train_latent_alignment(data, gnn, text_encoder)

Epoch 1: Loss = 1.0187177658081055
Epoch 2: Loss = 0.684087336063385
Epoch 3: Loss = 1.0262978076934814
Epoch 4: Loss = 1.098049521446228
Epoch 5: Loss = 1.0942996740341187
Epoch 6: Loss = 1.092896580696106
Epoch 7: Loss = 1.0904731750488281
Epoch 8: Loss = 1.0973970890045166
Epoch 9: Loss = 1.0980571508407593
Epoch 10: Loss = 1.0973337888717651


# GraphRAG

If using Colab you can simply run the following cells.

Otherwise, if you want to use the local backend, please:
- download neo4j desktop on [docker](https://neo4j.com/docs/graph-data-science/current/installation/installation-docker/)*
- download [lm-studio](https://lmstudio.ai/) and download the minicpm-llama3-v-2_5 and nomic-embed-text model

*run docker as:


```
docker run --rm --env NEO4J_AUTH=neo4j/defaultpass -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc -e NEO4J_apoc_export_file_enabled=true -e NEO4J_apoc_import_file_enabled=true -e NEO4J_apoc_import_file_use__neo4j__config=true -e NEO4J_PLUGINS=\[\"apoc-extended\"\] neo4j
```



In [None]:
import os
import sys

LLM_BACKEND = "ollama" # choose ["ollama" | "lm-studio"]
# LLM_BACKEND = "lm-studio"

assert LLM_BACKEND in ["ollama", "lm-studio"]

if LLM_BACKEND == "ollama":
  base_url = f"http://{os.environ.get('OLLAMA_HOST', 'localhost')}:11434/v1"
  api_key = "ollama"
  
  # Model selection - llama3.2 is MUCH faster than phi4
  llm_model = "llama3.2"  # Fast and efficient (3B parameters)
  # llm_model = "phi4"    # Slower but more capable (14B parameters)
  # llm_model = "gemma2"  # Alternative fast option
else:
  base_url = "http://localhost:1234/v1"
  api_key = "lm-studio"
  llm_model = "minicpm-llama3-v-2_5"

print(f"Using model: {llm_model}")

If Colab you need to download ollama and start the server - run the following cell

In [13]:
import subprocess
import time
import os

# Check if we're in Colab
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    print("Installing Ollama in Colab...")
    # Download and install Ollama
    !curl -fsSL https://ollama.com/install.sh | sh
    
    # Start Ollama server in the background
    print("Starting Ollama server...")
    process = subprocess.Popen(['ollama', 'serve'], 
                               stdout=subprocess.PIPE, 
                               stderr=subprocess.PIPE)
    
    # Wait for server to start
    time.sleep(5)
    print("Ollama server started!")
else:
    print("Not in Colab. Please ensure Ollama is installed and running:")
    print("- Download from: https://ollama.com/download")
    print("- Make sure 'ollama serve' is running in a terminal")


Installing Ollama in Colab...
>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Starting Ollama server...
Ollama server started!


In [14]:
# Test Ollama connection and start if needed
import requests
import subprocess
import time
import os

print("Checking Ollama status...\n")

# Check if Ollama is accessible
try:
    response = requests.get('http://localhost:11434', timeout=2)
    if response.status_code == 200:
        print("‚úì Ollama server is running and accessible!")
    else:
        print(f"‚ö† Ollama responded with status: {response.status_code}")
except requests.exceptions.ConnectionError:
    print("‚úó Ollama server is NOT accessible!")
    print("\n" + "="*70)
    print("üîß FIXING: Starting Ollama server...")
    print("="*70)
    
    try:
        # Try to start Ollama in the background
        print("\nStarting 'ollama serve' in background...")
        process = subprocess.Popen(
            ['ollama', 'serve'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            start_new_session=True
        )
        
        # Wait a moment for server to start
        print("Waiting for Ollama to start...")
        time.sleep(3)
        
        # Check again
        try:
            response = requests.get('http://localhost:11434', timeout=5)
            if response.status_code == 200:
                print("‚úì Ollama server started successfully!\n")
            else:
                print(f"‚ö† Ollama started but responded with: {response.status_code}\n")
        except:
            print("‚ö† Ollama may still be starting up...\n")
            print("If issues persist, manually run in terminal:")
            print("  ollama serve")
            
    except FileNotFoundError:
        print("\n‚úó Ollama is not installed!")
        print("\nüì• Install Ollama:")
        print("  Visit: https://ollama.com/download")
        print("  Or run the Colab setup cell above if in Colab")
    except Exception as e:
        print(f"\n‚úó Error starting Ollama: {e}")
        print("\nManually start Ollama:")
        print("  Open a new terminal and run: ollama serve")
        
except Exception as e:
    print(f"‚úó Error checking Ollama: {e}")

print("\n" + "="*70)


Checking Ollama status...

‚úì Ollama server is running and accessible!



In [None]:
%pip install ollama

import ollama
import time

print("Preparing models for LLM extraction...\n")

# First, verify Ollama is responding
try:
    print("1. Checking Ollama connectivity...")
    models_response = ollama.list()
    print("   ‚úì Ollama is accessible\n")
    
    # Check if phi4 is already available
    # Handle different response formats from ollama.list()
    model_names = []
    if isinstance(models_response, dict):
        if 'models' in models_response:
            for m in models_response['models']:
                if isinstance(m, dict) and 'name' in m:
                    model_names.append(m['name'])
                elif isinstance(m, dict) and 'model' in m:
                    model_names.append(m['model'])
    
    print(f"   Found {len(model_names)} models: {model_names}\n")
    
    # Check if the selected model is available
    model_available = any(llm_model in str(name).lower() for name in model_names)
    
    if model_available:
        print("2. Model check:")
        print(f"   ‚úì {llm_model} is already available!\n")
    else:
        print(f"2. Downloading {llm_model} model...")
        if llm_model == "llama3.2":
            print("   ‚è≥ This should be quick (~2GB download)...")
        elif llm_model == "phi4":
            print("   ‚è≥ This may take a few minutes (~8GB download)...")
        else:
            print("   ‚è≥ Downloading...")
        print("   Please be patient...\n")
        
        start = time.time()
        ollama.pull(llm_model)
        elapsed = time.time() - start
        print(f"   ‚úì {llm_model} downloaded in {elapsed:.1f} seconds\n")
    
    # Optionally pull embedding model
    print("3. Checking embedding model...")
    nomic_available = any('nomic-embed-text' in str(name).lower() for name in model_names)
    
    if nomic_available:
        print("   ‚úì nomic-embed-text is already available\n")
    else:
        print("   Downloading nomic-embed-text...")
        ollama.pull("nomic-embed-text")
        print("   ‚úì nomic-embed-text downloaded\n")
    
    print("="*70)
    print("‚úÖ All models ready!")
    print("="*70)
    
except Exception as e:
    print(f"‚úó Error: {e}\n")
    print("="*70)
    print("üîß TROUBLESHOOTING:")
    print("="*70)
    print("\n1. Is Ollama running?")
    print("   ‚Üí Run Cell 11 to start it automatically")
    print("   ‚Üí OR manually run in terminal: ollama serve\n")
    print("2. Is Ollama installed?")
    print("   ‚Üí Download from: https://ollama.com/download\n")
    print("3. Is the model stuck downloading?")
    print("   ‚Üí Check terminal where 'ollama serve' is running")
    print("   ‚Üí Try: ollama pull phi4 (in a separate terminal)")
    print("="*70)

Preparing models for LLM extraction...

1. Checking Ollama connectivity...
   ‚úì Ollama is accessible

   Found 0 models: []

2. Downloading phi4 model...
   ‚è≥ This may take a few minutes (phi4 is ~8GB)...
   Please be patient...

   ‚úì phi4 downloaded in 0.4 seconds

3. Checking embedding model...
   Downloading nomic-embed-text...
   ‚úì nomic-embed-text downloaded

‚úÖ All models ready!


## Warm Up the Model

**Important:** The first time a model loads, it needs to be loaded into memory. This cell pre-loads it:

- **llama3.2** (current): ~10-20 seconds  
- **phi4**: ~30-60 seconds


In [None]:
# Warm up the model - loads it into memory
import ollama
import time

print(f"Loading {llm_model} model into memory...")
if llm_model == "llama3.2":
    print("‚è≥ This should be quick (10-20 seconds)...\n")
elif llm_model == "phi4":
    print("‚è≥ First load may take 30-60 seconds, please be patient...\n")
else:
    print("‚è≥ Loading model, please be patient...\n")

start_time = time.time()

try:
    # Use ollama.generate which is more reliable than chat for testing
    response = ollama.generate(
        model=llm_model,
        prompt='Hello',
        options={'num_predict': 5}  # Only generate 5 tokens to be fast
    )
    
    elapsed = time.time() - start_time
    print(f"‚úì Model loaded successfully in {elapsed:.1f} seconds!")
    print(f"Model response: {response['response'][:50]}...\n")
    
    print("="*70)
    print(f"‚úÖ {llm_model} is ready to use!")
    print("="*70)
    
except Exception as e:
    elapsed = time.time() - start_time
    print(f"‚úó Model failed to load after {elapsed:.1f} seconds")
    print(f"Error: {e}\n")
    
    print("="*70)
    print("üîß TROUBLESHOOTING:")
    print("="*70)
    print("\n1. Check if model is downloaded:")
    print("   ‚Üí Run: ollama list (in terminal)")
    print(f"   ‚Üí If {llm_model} is missing, run: ollama pull {llm_model}\n")
    print("2. Model might be corrupted:")
    print(f"   ‚Üí Run: ollama rm {llm_model}")
    print(f"   ‚Üí Then: ollama pull {llm_model}\n")
    print("3. Try the model directly:")
    print(f"   ‚Üí In terminal, run: ollama run {llm_model}")
    print("   ‚Üí Type 'hello' and see if it responds")
    print("="*70)


Loading phi4 model into memory...
‚è≥ First load may take 30-60 seconds, please be patient...

‚úì Model loaded successfully in 57.4 seconds!
Model response: Hello! How can I...

‚úÖ phi4 is ready to use!


---

## üìã Steps to Use Option A (LLM Extraction)

Since you want to use LLM extraction, follow these steps in order:

1. **Cell 8**: Model selection ‚úì (now using llama3.2 - much faster!)
2. **Cell 11**: Check/Start Ollama ‚úì  
3. **Cell 12**: Download llama3.2 model (if not already downloaded)
4. **Cell 14**: üëà **START HERE** - Warm up the model (~10-20 seconds)
5. **Cell 20**: Test LLM connection  
6. **Cell 26**: Run Option A extraction (1-3 minutes with llama3.2)

**Current model:** llama3.2 (3B parameters, fast and efficient)

**Want more accuracy?** Change Cell 8 to use phi4, but expect longer wait times (2-5 min for extraction)

---


In [21]:
# Setup Neo4j for Colab or local environment
import subprocess
import time

# Check if we're in Colab
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    print("Setting up Neo4j in Colab...")
    # Install Neo4j driver
    %pip install neo4j
    
    # Note: For full Neo4j in Colab, you'd typically use a cloud instance
    # or a temporary docker container. For simplicity, we'll show instructions
    print("\n‚ö†Ô∏è  Neo4j Setup Required:")
    print("For Colab, you have two options:")
    print("1. Use Neo4j AuraDB (free cloud instance): https://neo4j.com/cloud/aura/")
    print("2. Use a temporary Neo4j sandbox: https://sandbox.neo4j.com/")
    print("\nThen update NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD in the next cell")
else:
    print("Local environment detected.")
    print("\nüìã Neo4j Setup Instructions:")
    print("\nOption 1: Using Docker (Recommended):")
    print("Run this command in your terminal:")
    print('docker run --rm --env NEO4J_AUTH=neo4j/defaultpass -p 7474:7474 -p 7687:7687 --name neo4j neo4j:latest')
    print("\nOption 2: Download Neo4j Desktop:")
    print("https://neo4j.com/download/")
    print("\nOption 3: Use Neo4j AuraDB (Cloud, free tier):")
    print("https://neo4j.com/cloud/aura/")


Setting up Neo4j in Colab...

‚ö†Ô∏è  Neo4j Setup Required:
For Colab, you have two options:
1. Use Neo4j AuraDB (free cloud instance): https://neo4j.com/cloud/aura/
2. Use a temporary Neo4j sandbox: https://sandbox.neo4j.com/

Then update NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD in the next cell


In [22]:
# Test Neo4j connection
import socket
import os

def test_neo4j_connection(host='localhost', port=7687):
    """Test if Neo4j is accessible on the given host and port"""
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(2)
        result = sock.connect_ex((host, port))
        sock.close()
        return result == 0
    except Exception as e:
        return False

# Check if we're in Colab
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

host = os.environ.get("NEO4J_HOST", "localhost")

if test_neo4j_connection(host):
    print(f"‚úì Neo4j is accessible on {host}:7687")
    print("You can proceed to the next cell!")
else:
    print(f"‚úó Cannot connect to Neo4j on {host}:7687")
    print("\n" + "="*70)
    print("‚ö†Ô∏è  IMPORTANT: Neo4j is NOT running - you must set it up first!")
    print("="*70)
    
    if IN_COLAB:
        print("\nüåê COLAB USERS: You cannot run Neo4j locally in Colab.")
        print("\nYou MUST use a cloud Neo4j instance. Choose ONE option:\n")
        print("Option 1: Neo4j AuraDB (FREE, Recommended)")
        print("  1. Go to: https://neo4j.com/cloud/aura/")
        print("  2. Sign up for free account")
        print("  3. Create a new FREE instance")
        print("  4. Save the credentials they give you")
        print("  5. In the next cell, update:")
        print("     NEO4J_URI = 'neo4j+s://xxxxx.databases.neo4j.io'")
        print("     NEO4J_USER = 'neo4j'")
        print("     NEO4J_PASSWORD = 'your-password-from-aura'")
        print("\nOption 2: Neo4j Sandbox (Temporary)")
        print("  1. Go to: https://sandbox.neo4j.com/")
        print("  2. Create a blank sandbox")
        print("  3. Get connection details")
        print("  4. Update connection info in next cell")
    else:
        print("\nüñ•Ô∏è  LOCAL USERS: Start Neo4j with Docker\n")
        print("Run this in your terminal:")
        print("‚îÄ" * 70)
        print("docker run --rm --env NEO4J_AUTH=neo4j/defaultpass \\")
        print("  -p 7474:7474 -p 7687:7687 --name neo4j neo4j:latest")
        print("‚îÄ" * 70)
        print("\nOR download Neo4j Desktop: https://neo4j.com/download/")
    
    print("\n‚ö†Ô∏è  DO NOT run the next cell until Neo4j is set up and this cell shows ‚úì")
    print("="*70)


‚úó Cannot connect to Neo4j on localhost:7687

‚ö†Ô∏è  IMPORTANT: Neo4j is NOT running - you must set it up first!

üåê COLAB USERS: You cannot run Neo4j locally in Colab.

You MUST use a cloud Neo4j instance. Choose ONE option:

Option 1: Neo4j AuraDB (FREE, Recommended)
  1. Go to: https://neo4j.com/cloud/aura/
  2. Sign up for free account
  3. Create a new FREE instance
  4. Save the credentials they give you
  5. In the next cell, update:
     NEO4J_URI = 'neo4j+s://xxxxx.databases.neo4j.io'
     NEO4J_USER = 'neo4j'
     NEO4J_PASSWORD = 'your-password-from-aura'

Option 2: Neo4j Sandbox (Temporary)
  1. Go to: https://sandbox.neo4j.com/
  2. Create a blank sandbox
  3. Get connection details
  4. Update connection info in next cell

‚ö†Ô∏è  DO NOT run the next cell until Neo4j is set up and this cell shows ‚úì


In [18]:
ollama.list()

ListResponse(models=[Model(model='nomic-embed-text:latest', modified_at=datetime.datetime(2025, 11, 18, 17, 51, 28, 820938, tzinfo=TzInfo(UTC)), digest='0a109f422b47e3a30ba2b10eca18548e944e8a23073ee3f3e947efcf3c45e59f', size=274302450, details=ModelDetails(parent_model='', format='gguf', family='nomic-bert', families=['nomic-bert'], parameter_size='137M', quantization_level='F16')), Model(model='phi4:latest', modified_at=datetime.datetime(2025, 11, 18, 17, 51, 28, 594939, tzinfo=TzInfo(UTC)), digest='ac896e5b8b34a1f4efa7b14d7520725140d5512484457fab45d2a4ea14c69dba', size=9053116391, details=ModelDetails(parent_model='', format='gguf', family='phi3', families=['phi3'], parameter_size='14.7B', quantization_level='Q4_K_M'))])

# GraphRAG Section

> **üìù Note:** The GraphRAG section requires a running Neo4j database. 
>
> **For Colab users:** You'll need to set up a free Neo4j AuraDB instance (instructions below).
>
> **Want to skip this section?** The first two sections (Prediction Alignment & Latent Space Alignment) work without Neo4j and demonstrate core LLM+Graph concepts!


# Neo4j

## ‚öôÔ∏è Configure Neo4j Connection

**IMPORTANT:** Update these settings based on your Neo4j setup:

### For Local Docker (default settings):
- Keep the defaults below (no changes needed)

### For Neo4j AuraDB (Colab users):
- Update `NEO4J_URI` to your AuraDB URI (e.g., `neo4j+s://xxxxx.databases.neo4j.io`)
- Update `NEO4J_PASSWORD` to your AuraDB password
- Keep `NEO4J_USER` as `neo4j`

### For Neo4j Sandbox:
- Update all three variables with your sandbox credentials


## ‚ö° Quick Test: Check LLM Connection

Before running the full graph extraction (which can be slow), let's test if the LLM is responding:


In [23]:
# Quick test of LLM connection
import requests
from langchain_openai import ChatOpenAI

print("Testing LLM connection...")
print(f"Backend: {LLM_BACKEND}")
print(f"Model: {llm_model}")
print(f"Base URL: {base_url}\n")

# First check if Ollama server is accessible
try:
    print("1. Checking if Ollama server is accessible...")
    response = requests.get('http://localhost:11434', timeout=5)
    if response.status_code == 200:
        print("   ‚úì Ollama server is running\n")
    else:
        print(f"   ‚ö† Ollama server responded with status: {response.status_code}\n")
except requests.exceptions.ConnectionError:
    print("   ‚úó Ollama server is NOT running!")
    print("   ‚Üí Start Ollama: Open terminal and run 'ollama serve'\n")
except Exception as e:
    print(f"   ‚úó Error: {e}\n")

# Then test if the model responds via LangChain
try:
    print("2. Testing if model responds via LangChain...")
    print("   (This uses the same interface as the graph extraction)\n")
    
    test_llm = ChatOpenAI(
        temperature=0,
        model_name=llm_model,
        base_url=base_url,
        api_key=api_key,
        timeout=60,  # 60 second timeout for first load
        max_retries=1
    )
    
    print("   Sending test query...")
    response = test_llm.invoke("Say 'hello' in one word")
    print(f"   ‚úì Model is responding!")
    print(f"   Response: {response.content}\n")
    print("=" * 70)
    print("‚úÖ LLM is working - you can proceed with Option A (LLM extraction)")
    print("=" * 70)
    print("\nNote: The graph extraction in Cell 22 may still take 2-5 minutes")
    print("because it needs to analyze longer text and make multiple LLM calls.")
    
except Exception as e:
    print(f"   ‚úó Model test failed: {e}\n")
    print("=" * 70)
    print("‚ö†Ô∏è  LLM is NOT working properly")
    print("=" * 70)
    print("\nüîß Try these fixes:")
    print("1. Did you run Cell 14 to warm up the model?")
    print("   ‚Üí Go back and run Cell 14 first")
    print("2. Check if phi4 is properly installed:")
    print("   ‚Üí Open terminal and run: ollama list")
    print("   ‚Üí If phi4 is missing: ollama pull phi4")
    print("3. Try using ollama directly:")
    print("   ‚Üí Open terminal and run: ollama run phi4")
    print("   ‚Üí Type 'hello' and see if it responds")
    print("4. Consider using a faster model:")
    print("   ‚Üí In Cell 8, change: llm_model = 'llama3.2'")
    print("   ‚Üí Then: ollama pull llama3.2")
    print("=" * 70)


Testing LLM connection...
Backend: ollama
Model: phi4
Base URL: http://localhost:11434/v1

1. Checking if Ollama server is accessible...
   ‚úì Ollama server is running

2. Testing if model responds via LangChain...
   (This uses the same interface as the graph extraction)

   Sending test query...
   ‚úì Model is responding!
   Response: Hi!

‚úÖ LLM is working - you can proceed with Option A (LLM extraction)

Note: The graph extraction in Cell 22 may still take 2-5 minutes
because it needs to analyze longer text and make multiple LLM calls.


## Option A: LLM-Based Graph Extraction (Slow)

**‚ö†Ô∏è Warning:** This can take 2-5 minutes with phi4. Only run if the LLM test above succeeded.

If this cell runs too long, **interrupt it** and use Option B (manual) below instead.


In [24]:
%pip install neo4j langchain-neo4j langchain-experimental langchain-openai langchain-core

import os
from neo4j import GraphDatabase
from langchain_neo4j import Neo4jGraph

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# üîß CONFIGURE YOUR NEO4J CONNECTION HERE
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# For LOCAL Docker (default):
host = os.environ.get("NEO4J_HOST", "localhost")
NEO4J_URI = "neo4j+s://edea191c.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "YMlZaZmpSVwzkjKdok57WI7PoNO-4YvVVfsLNvH3Ud4"

# For COLAB/AuraDB: Uncomment and update these lines:
# NEO4J_URI = "neo4j+s://xxxxx.databases.neo4j.io"  # Your AuraDB URI
# NEO4J_USER = "neo4j"
# NEO4J_PASSWORD = "your-auradb-password"  # Your AuraDB password

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

print(f"Attempting to connect to: {NEO4J_URI}")
print(f"User: {NEO4J_USER}")

try:
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    driver.verify_connectivity()
    print(f"\n‚úì Connected to Neo4j at {NEO4J_URI}")
    
    graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)
    print("‚úì Neo4j Graph initialized successfully\n")
except Exception as e:
    print(f"\n‚úó Failed to connect to Neo4j")
    print(f"Error: {e}\n")
    print("=" * 70)
    print("üîß TROUBLESHOOTING STEPS:")
    print("=" * 70)
    print("\n1. Did you run Cell 14 to check if Neo4j is accessible?")
    print("   ‚Üí Go back and run Cell 14 first!\n")
    print("2. Are you in Colab?")
    print("   ‚Üí You MUST use Neo4j AuraDB (free): https://neo4j.com/cloud/aura/")
    print("   ‚Üí Then update the connection settings in this cell (lines 11-14)\n")
    print("3. Are you running locally?")
    print("   ‚Üí Start Neo4j with Docker in a terminal:")
    print("   docker run --rm --env NEO4J_AUTH=neo4j/defaultpass \\")
    print("     -p 7474:7474 -p 7687:7687 --name neo4j neo4j:latest\n")
    print("4. Using AuraDB but still failing?")
    print("   ‚Üí Check your URI format: neo4j+s://xxxxx.databases.neo4j.io")
    print("   ‚Üí Check your password is correct")
    print("=" * 70)
    raise

# ---- Step 2: Create knowledge graph from text ----
import os
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0,
                 model_name=llm_model,
                 base_url=base_url,
                 api_key=api_key)

llm_transformer = LLMGraphTransformer(llm=llm)

from langchain_core.documents import Document

text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""
documents = [Document(page_content=text)]

print("Converting text to graph documents...")
print("‚è≥ This may take 30-90 seconds as the LLM extracts entities and relationships...")
print("   The LLM is analyzing the text to identify nodes (entities) and edges (relationships)")
print("   Please be patient...\n")

import time
start_time = time.time()

try:
    graph_documents = llm_transformer.convert_to_graph_documents(documents)
    elapsed = time.time() - start_time
    print(f"‚úì Conversion completed in {elapsed:.1f} seconds\n")
    
    if graph_documents and len(graph_documents) > 0:
        print(f"üìä Extracted {len(graph_documents[0].nodes)} Nodes:")
        for i, node in enumerate(graph_documents[0].nodes[:5], 1):  # Show first 5
            print(f"  {i}. {node}")
        if len(graph_documents[0].nodes) > 5:
            print(f"  ... and {len(graph_documents[0].nodes) - 5} more\n")
        else:
            print()
        
        print(f"üîó Extracted {len(graph_documents[0].relationships)} Relationships:")
        for i, rel in enumerate(graph_documents[0].relationships[:5], 1):  # Show first 5
            print(f"  {i}. {rel}")
        if len(graph_documents[0].relationships) > 5:
            print(f"  ... and {len(graph_documents[0].relationships) - 5} more\n")
        else:
            print()
    else:
        print("‚ö†Ô∏è  No graph documents were created")
        
except Exception as e:
    elapsed = time.time() - start_time
    print(f"‚úó Conversion failed after {elapsed:.1f} seconds")
    print(f"Error: {e}\n")
    print("This might be due to:")
    print("1. Ollama/phi4 not responding properly")
    print("2. Model timeout or memory issues")
    print("3. LLM unable to parse the text")
    raise

# Add graph to neo4j
print("\nAdding graph documents to Neo4j...")
try:
    graph.add_graph_documents(graph_documents)
    print("‚úì Graph documents added successfully")
    print("\nüëâ Now skip to 'Step 3: Perform GraphRAG Queries' below")
except Exception as e:
    print(f"‚úó Failed to add graph documents: {e}")
    raise

Attempting to connect to: neo4j+s://edea191c.databases.neo4j.io
User: neo4j

‚úì Connected to Neo4j at neo4j+s://edea191c.databases.neo4j.io
‚úì Neo4j Graph initialized successfully

Converting text to graph documents...
‚è≥ This may take 30-90 seconds as the LLM extracts entities and relationships...
   The LLM is analyzing the text to identify nodes (entities) and edges (relationships)
   Please be patient...



KeyboardInterrupt: 

## Option B: Manual Graph Creation (Fast - Recommended) ‚ö°

**‚úÖ USE THIS if:**
- LLM test failed above
- Option A is too slow
- You want instant results

**No LLM needed!** This creates the graph directly - works even if Ollama isn't running!


In [11]:
# Manual graph creation (bypasses slow LLM extraction)
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import GraphDocument
from langchain_community.graphs.graph_document import Node, Relationship

print("Creating graph manually...")

# Create nodes
marie = Node(id="Marie Curie", type="Person")
pierre = Node(id="Pierre Curie", type="Person")
nobel = Node(id="Nobel Prize", type="Award")
university = Node(id="University of Paris", type="Organization")
radioactivity = Node(id="Radioactivity", type="ResearchField")

# Create relationships
relationships = [
    Relationship(source=marie, target=pierre, type="MARRIED_TO"),
    Relationship(source=marie, target=nobel, type="WON"),
    Relationship(source=pierre, target=nobel, type="WON"),
    Relationship(source=marie, target=university, type="PROFESSOR_AT"),
    Relationship(source=marie, target=radioactivity, type="RESEARCHED"),
]

# Create graph document
text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""

graph_documents = [GraphDocument(
    nodes=[marie, pierre, nobel, university, radioactivity],
    relationships=relationships,
    source=Document(page_content=text)
)]

print(f"‚úì Created graph with {len(graph_documents[0].nodes)} nodes and {len(graph_documents[0].relationships)} relationships\n")

print("üìä Nodes:")
for i, node in enumerate(graph_documents[0].nodes, 1):
    print(f"  {i}. {node.type}: {node.id}")

print(f"\nüîó Relationships:")
for i, rel in enumerate(graph_documents[0].relationships, 1):
    print(f"  {i}. {rel.source.id} --[{rel.type}]--> {rel.target.id}")

print("\n‚úì Graph created successfully! Now adding to Neo4j...")

# Add graph to neo4j
try:
    graph.add_graph_documents(graph_documents)
    print("‚úì Graph documents added to Neo4j successfully!\n")
except Exception as e:
    print(f"‚úó Failed to add graph documents: {e}")
    raise


ImportError: cannot import name 'GraphDocument' from 'langchain_experimental.graph_transformers' (/usr/local/lib/python3.12/dist-packages/langchain_experimental/graph_transformers/__init__.py)

## Step 3: Perform GraphRAG Queries

Now that we have data in Neo4j, let's query it using natural language!


In [None]:
# Setup GraphRAG query chain
from langchain_openai import ChatOpenAI
from langchain_neo4j import GraphCypherQAChain
from langchain_core.prompts import PromptTemplate

print("Setting up GraphRAG query chain...")

# Create LLM for queries
llm = ChatOpenAI(
    temperature=0,
    model_name=llm_model,
    base_url=base_url,
    api_key=api_key
)

def escape(s):
    return s.replace("{","").replace("}","")

# Get the current schema
schema_info = graph.schema

CYPHER_GENERATION_TEMPLATE = f"""You are a Neo4j expert. Generate a Cypher query to answer the given question.

Database Schema: {escape(schema_info)}

Rules:
1. Always use explicit `MATCH` for relationships.
2. Never use `WHERE` for relationship matching.
3. Use `RETURN DISTINCT` when appropriate.

Example Queries:
1. Question: "Who won the Nobel Prize?"
   Cypher: MATCH (p:Person)-[:WON]->(:Award) RETURN p.id AS winner

Question: {{query}}
Return only the Cypher query without any explanation or additional text.
Cypher:"""

chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    cypher_prompt=PromptTemplate(
        input_variables=["query"],
        template=CYPHER_GENERATION_TEMPLATE
    ),
    allow_dangerous_requests=True
)

print("‚úì GraphRAG chain ready!\n")
print("Current database schema:")
print(schema_info)


In [None]:
# Test GraphRAG queries
print("Testing GraphRAG queries...\n")
print("="*70)

# Query 1
question1 = "Who was married to Marie Curie?"
print(f"\nüìù Question: {question1}")
try:
    response = chain.invoke(question1)
    print(f"‚úì Answer: {response['result']}")
except Exception as e:
    print(f"‚úó Query failed: {e}")

print("\n" + "="*70)

# Query 2
question2 = "What did Marie Curie research?"
print(f"\nüìù Question: {question2}")
try:
    response = chain.invoke(question2)
    print(f"‚úì Answer: {response['result']}")
except Exception as e:
    print(f"‚úó Query failed: {e}")

print("\n" + "="*70)

# Query 3
question3 = "Where did Marie Curie work?"
print(f"\nüìù Question: {question3}")
try:
    response = chain.invoke(question3)
    print(f"‚úì Answer: {response['result']}")
except Exception as e:
    print(f"‚úó Query failed: {e}")

print("\n" + "="*70)

# Close the driver
print("\n\nCleaning up...")
try:
    driver.close()
    print("‚úì Neo4j driver closed")
except:
    pass

print("\n‚úÖ GraphRAG demo complete!")
