# Building Knowledge Graph for Maize Toy Data

This notebook demonstrates how to build a Neo4j knowledge graph from the `maize.csv` toy data, a table of subject–predicate–object triples.

## Setup and Imports

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
import warnings
warnings.filterwarnings("ignore")

class Neo4jConnection:
    """Small wrapper around a Neo4j driver that targets a single database.

    The instance can be used directly (call ``close()`` when finished) or as
    a context manager, in which case the driver is closed on exit::

        with Neo4jConnection(uri, username, password) as conn:
            conn.query("RETURN 1 as test")
    """

    def __init__(self, uri: str, username: str, password: str,
                 database: str = "neo4j"):
        """Create the driver for ``uri`` using ``(username, password)`` auth.

        Args:
            uri: Connection URI handed to ``GraphDatabase.driver``.
            username: First element of the auth tuple.
            password: Second element of the auth tuple.
            database: Database name passed to every session opened by
                ``query``.
        """
        self.driver = GraphDatabase.driver(uri, auth=(username, password))
        self.database = database

    def __enter__(self):
        """Return the connection itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> bool:
        """Close the driver; exceptions from the ``with`` body propagate."""
        self.close()
        return False

    def close(self) -> None:
        """Close the underlying driver if there is one."""
        # getattr guards the case where __init__ raised before `driver` was
        # assigned, so cleanup code can always call close() safely.
        driver = getattr(self, "driver", None)
        if driver:
            driver.close()

    def query(self, cypher: str, params=None) -> list:
        """Execute a Cypher query and return results.

        Args:
            cypher: Query text passed to ``session.run``.
            params: Optional mapping of query parameters; ``None`` (or any
                falsy value) is replaced by an empty dict.

        Returns:
            A list with ``record.data()`` for each record of the result.
        """
        # The session is opened per call and closed when the block exits;
        # records are materialised inside the block, before that happens.
        with self.driver.session(database=self.database) as session:
            result = session.run(cypher, params or {})
            return [record.data() for record in result]

## Load and Examine the Data

In [None]:
# Read the toy dataset from its relative path into a DataFrame
df = pd.read_csv(filepath_or_buffer='../toydata/maize.csv')

# Report the (rows, columns) shape, then preview the top of the table
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
# Kept as the last expression so the notebook renders the preview
df.head()

In [None]:
# List the distinct values found in each of the three triple columns
for heading, column in (
    ("Unique subjects:", 'subject'),
    ("\nUnique predicates:", 'predicate'),
    ("\nUnique objects:", 'object'),
):
    print(heading)
    print(df[column].unique())

## Connect to Neo4j

In [None]:
# Load connection settings from the .env file one directory up.
# override=True lets values from the file replace variables that are already
# set in the process environment.
load_dotenv('../.env', override=True)

NEO4J_URI = os.getenv('NEO4J_URI', 'bolt://localhost:7687')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME', 'neo4j')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')

# NEO4J_PASSWORD is the only setting without a default. Warn instead of
# raising: the value would otherwise be handed to the driver as None with no
# hint about the cause, but a server may also be configured without auth.
if not NEO4J_PASSWORD:
    print("Warning: NEO4J_PASSWORD is not set; check ../.env if the connection fails.")

print(f"Connecting to Neo4j at: {NEO4J_URI}")

# Initialize Neo4j connection
kg = Neo4jConnection(
    uri=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

# Test connection with a trivial query. Failures are reported rather than
# raised so the notebook keeps running; only the query itself sits inside
# the try body.
try:
    result = kg.query("RETURN 1 as test")
except Exception as e:
    print(f"Connection failed: {e}")
else:
    print("Connected successfully!")

## Build the Knowledge Graph

In [None]:
# Run the build script inside this notebook's namespace.
# NOTE(review): the script presumably relies on names defined in earlier
# cells (e.g. `kg`) - confirm against build_maize_kg.py.
_build_script_path = '../build_maize_kg.py'

# Read through a context manager so the file handle is closed, and decode as
# UTF-8 (the default encoding of Python source files) instead of the locale.
with open(_build_script_path, encoding='utf-8') as _build_script_file:
    _build_script_source = _build_script_file.read()

# NOTE(review): exec runs whatever the file contains; this is only acceptable
# because the script is a trusted file from the same repository.
# Compiling with the real path makes tracebacks point at the script and line.
exec(compile(_build_script_source, _build_script_path, 'exec'))

## Query the Knowledge Graph

In [None]:
# Ask the database how many nodes exist, then read the count from the first
# returned row.
result = kg.query(
    "MATCH (n) RETURN count(n) as total_nodes"
)
print(f"Total nodes in graph: {result[0]['total_nodes']}")

In [None]:
# Gene -> Trait pairs connected by a REGULATES relationship
cypher = """
MATCH (g:Gene)-[:REGULATES]->(t:Trait)
RETURN g.name as gene, t.name as trait
"""
result = kg.query(cypher)

print("Genes and their regulated traits:")
# Format every row first, then emit one line per pair
gene_trait_lines = [f"  {row['gene']} → {row['trait']}" for row in result]
for line in gene_trait_lines:
    print(line)

In [None]:
# Genotype -> Trait pairs connected by a HAS_TRAIT relationship
cypher = """
MATCH (gt:Genotype)-[:HAS_TRAIT]->(t:Trait)
RETURN gt.name as genotype, t.name as trait
"""
result = kg.query(cypher)

print("Genotypes and their traits:")
# Format every row first, then emit one line per pair
genotype_trait_lines = [
    f"  {row['genotype']} → {row['trait']}" for row in result
]
for line in genotype_trait_lines:
    print(line)

In [None]:
# Traits that are both regulated by a gene and present in a genotype:
# (Gene)-[:REGULATES]->(Trait)<-[:HAS_TRAIT]-(Genotype)
cypher = """
MATCH (g:Gene)-[:REGULATES]->(t:Trait)<-[:HAS_TRAIT]-(gt:Genotype)
RETURN g.name as gene, t.name as trait, gt.name as genotype
"""
result = kg.query(cypher)

print("Gene-Trait-Genotype connections:")
connection_lines = [
    f"  {row['gene']} regulates {row['trait']} found in {row['genotype']}"
    for row in result
]
for line in connection_lines:
    print(line)

In [None]:
# Per trial: where it was conducted, which genotypes were tested in it and
# which traits it measured (both collected as de-duplicated lists).
cypher = """
MATCH (trial:Trial)-[:CONDUCTED_IN]->(loc:Location)
MATCH (gt:Genotype)-[:TESTED_IN]->(trial)
MATCH (trial)-[:MEASURED]->(trait:Trait)
RETURN trial.name as trial, loc.name as location, 
       collect(DISTINCT gt.name) as genotypes_tested,
       collect(DISTINCT trait.name) as traits_measured
"""
result = kg.query(cypher)

print("Trial information:")
for row in result:
    # Build the four-line summary for this trial, then print it in order
    trial_summary = (
        f"  Trial: {row['trial']}",
        f"    Location: {row['location']}",
        f"    Genotypes tested: {', '.join(row['genotypes_tested'])}",
        f"    Traits measured: {', '.join(row['traits_measured'])}",
    )
    for line in trial_summary:
        print(line)

## Advanced Queries

In [None]:
# Two-hop chains: a gene REGULATES a trait that is ASSOCIATED_WITH a QTL
cypher = """
MATCH path = (g:Gene)-[:REGULATES]->(t:Trait)-[:ASSOCIATED_WITH]->(q:QTL)
RETURN g.name as gene, t.name as trait, q.name as qtl
"""
result = kg.query(cypher)

print("Gene → Trait → QTL pathways:")
pathway_lines = [
    f"  {row['gene']} → {row['trait']} → {row['qtl']}" for row in result
]
for line in pathway_lines:
    print(line)

In [None]:
# QTL -> Chromosome pairs connected by a LOCATED_ON relationship
cypher = """
MATCH (q:QTL)-[:LOCATED_ON]->(c:Chromosome)
RETURN q.name as qtl, c.name as chromosome
"""
result = kg.query(cypher)

print("QTL chromosomal locations:")
location_lines = [
    f"  {row['qtl']} is located on {row['chromosome']}" for row in result
]
for line in location_lines:
    print(line)

In [None]:
# Location -> Weather pairs connected by a HAS_WEATHER relationship
cypher = """
MATCH (loc:Location)-[:HAS_WEATHER]->(w:Weather)
RETURN loc.name as location, w.name as weather
"""
result = kg.query(cypher)

print("Environmental conditions:")
weather_lines = [
    f"  {row['location']} has {row['weather']} conditions" for row in result
]
for line in weather_lines:
    print(line)