# Temporal Node Embedding External Approach

#### Import the required libraries
First of all we have to install and import the libraries that we need for the implementation of the Temporal Node Embedding.

- neo4j: The Neo4j Python driver is used to connect to the Neo4j database.
- graphdatascience: The graph datascience client is a Python client for working with the Neo4j Graph Data Science Library which is used for the in-memory graph projection and the FastRP algorithm for the embedding.
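- numpy: NumPy is used to compute the sine/cosine time features and to hold the resulting embedding vectors as arrays.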

In [None]:
%pip install neo4j
%pip install graphdatascience
%pip install numpy

In [None]:
import numpy as np
import random
import datetime
from neo4j import GraphDatabase
import graphdatascience

### Configure Driver and Client

We have to configure the driver and the client for the connection to the Neo4j database. The driver is used to execute Cypher queries and the client is used to execute the Graph Data Science Library algorithms.

- Endpoint: Bolt URL of the Neo4j database
- Username: Username
- Password: Password
- database: Database where you imported the trips

In [None]:
import os

# Connection settings. Each value can be overridden through an environment
# variable so that credentials do not have to be edited into (and committed
# with) the notebook; the fallbacks are the values this notebook has always used.
endpoint = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "#Bachelorarbeit")
database = os.environ.get("NEO4J_DATABASE", "neo4j")

# Graph Data Science client: used for the projected graph handle and FastRP.
gds = graphdatascience.GraphDataScience(endpoint=endpoint, auth=(username, password))
gds.set_database(database)

# Plain Neo4j driver: used for the Cypher queries that read the timestamps,
# write the time embeddings and create the vector index.
db_driver = GraphDatabase.driver(endpoint, auth=(username, password))

### Function for computing Time Embeddings from Timestamps for Station

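Station nodes have no timestamps of their own, so `random_timestamp_for_station` generates a reproducible pseudo-random timestamp in 2017 for each station (seeded with the station id and a `start`/`end` suffix). `timestamp_to_embedding` turns a timestamp into an 11-dimensional vector: sine/cosine pairs for hour, weekday, day of month and month, a weekend flag, the day of the year and the scaled Unix timestamp. The vectors are then written batch-wise to the `Station` nodes (`startTimeEmbedding`, `endTimeEmbedding`) and to the `Trip` nodes (`startTimeEmbedding`, computed from `validFrom`).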

In [None]:
def random_timestamp_for_station(station_id, suffix="start"):
    """Return a reproducible pseudo-random timestamp in 2017 for a station.

    The generator is seeded with ``"{station_id}_{suffix}"``, so the same
    station and suffix always yield the same timestamp, while the "start"
    and "end" suffixes produce independent values.

    Args:
        station_id: Identifier of the station; only its string form is used.
        suffix: Distinguishes the start timestamp from the end timestamp.

    Returns:
        A naive ``datetime.datetime`` in the year 2017.
    """
    # A private generator seeded with the same string produces the same values
    # as random.seed(...) on the module, but does not reset the global random
    # state that other code in the notebook may rely on.
    rng = random.Random(f"{station_id}_{suffix}")

    year = 2017
    month = rng.randint(1, 12)
    day = rng.randint(1, 28)  # capped at 28 so the date is valid in every month
    hour = rng.randint(0, 23)
    minute = rng.randint(0, 59)
    second = rng.randint(0, 59)

    return datetime.datetime(year, month, day, hour, minute, second)

In [None]:
def timestamp_to_embedding(timestamp):
    """Encode a timestamp as an 11-dimensional time feature vector.

    Layout of the returned vector:
        0-1  hour of day (sin, cos)
        2-3  weekday (sin, cos)
        4-5  day of month (sin, cos)
        6-7  month (sin, cos)
        8    weekend flag (1 for Saturday/Sunday, else 0)
        9    day of the year
        10   Unix timestamp divided by 1e9

    Args:
        timestamp: Either a value with a ``to_native()`` method (Neo4j
            temporal types) or a ``datetime.datetime``.

    Returns:
        ``numpy.ndarray`` with the 11 features listed above.
    """
    try:
        d = timestamp.to_native()  # Neo4j temporal types -> native datetime
    except AttributeError:
        d = timestamp  # already a datetime (used for the station timestamps)

    unix_timestamp = int(d.timestamp())
    # NOTE(review): fromtimestamp() yields local time; for timezone-aware
    # inputs the cyclic features therefore depend on the machine's timezone,
    # while day_of_year below is taken from the original value — confirm.
    dt = datetime.datetime.fromtimestamp(unix_timestamp)

    def cyclic(value, period):
        """Return the (sin, cos) pair that places ``value`` on a circle."""
        angle = 2 * np.pi * value / period
        return np.sin(angle), np.cos(angle)

    hour_sin, hour_cos = cyclic(dt.hour, 24)
    weekday_sin, weekday_cos = cyclic(dt.weekday(), 7)
    # Day of month runs from 1 to 31; a period of 7 would map e.g. the 1st and
    # the 8th onto the same point, so the full month length is used.
    day_sin, day_cos = cyclic(dt.day, 31)
    month_sin, month_cos = cyclic(dt.month, 12)

    # weekday(): Monday is 0, Saturday is 5, Sunday is 6.
    is_weekend = 1 if dt.weekday() >= 5 else 0

    day_of_year = d.timetuple().tm_yday
    unix_scaled = unix_timestamp / 1e9

    return np.array([
        hour_sin, hour_cos,
        weekday_sin, weekday_cos,
        day_sin, day_cos,
        month_sin, month_cos,
        is_weekend,
        day_of_year,
        unix_scaled,
    ])

In [None]:
def write_station_embeddings_batchwise(driver, batch_size=500):
    """Compute and store start/end time embeddings for the Station nodes.

    Streams the internal ids of all ``Station`` nodes, derives a seeded
    pseudo-random start and end timestamp per station, converts both into
    embedding vectors and passes them to ``_send_station_embedding_batch``
    in groups of ``batch_size``.

    Args:
        driver: Neo4j driver used to read the ids and to write the batches.
        batch_size: Number of stations collected before a batch is written.
    """
    with driver.session(database=database) as session:
        stations = session.run("""
            MATCH (s:Station)
            RETURN id(s) AS station_id
        """)

        pending = []
        count = 0
        for row in stations:
            sid = row["station_id"]

            try:
                start_ts = random_timestamp_for_station(sid, "start")
                end_ts = random_timestamp_for_station(sid, "end")

                start_vector = timestamp_to_embedding(start_ts)
                end_vector = timestamp_to_embedding(end_ts)

                entry = {
                    "station_id": sid,
                    "start": start_vector.tolist(),
                    "end": end_vector.tolist(),
                }
                pending.append(entry)
            except Exception as e:
                # A failing station is reported and left out; the run goes on.
                print(f"Skipping station {sid}: {e}")

            if len(pending) < batch_size:
                continue

            # The batch is full: write it, report progress, start a new one.
            _send_station_embedding_batch(driver, pending)
            count += len(pending)
            print(f"{count} stations processed.")
            pending = []

        # Write whatever remains after the last full batch.
        if pending:
            _send_station_embedding_batch(driver, pending)

def _send_station_embedding_batch(driver, batch):
    """Write one batch of station embeddings to the database.

    The rows are applied through ``apoc.periodic.iterate``, which sets
    ``startTimeEmbedding`` and ``endTimeEmbedding`` on the ``Station`` node
    whose internal id matches ``station_id``.

    Args:
        driver: Neo4j driver used to open the write session.
        batch: List of dicts with the keys ``station_id``, ``start`` and
            ``end`` (the latter two being lists of floats).
    """
    query = """
    CALL apoc.periodic.iterate(
      'UNWIND $batch AS row RETURN row',
      '
      MATCH (s:Station) WHERE id(s) = row.station_id
      SET s.startTimeEmbedding = row.start,
          s.endTimeEmbedding = row.end
      ',
      {batchSize: 100, parallel: true, params: {batch: $batch}}
    )
    """
    with driver.session(database=database) as session:
        # Fetch the summary row so the call has completed (and any error has
        # been raised) before the session is closed.
        summary = session.run(query, batch=batch).single()

    # apoc.periodic.iterate reports failures in its result row instead of
    # raising; surface them rather than dropping them silently.
    if summary is not None and summary["failedOperations"]:
        print(
            f"{summary['failedOperations']} station updates failed: "
            f"{summary['errorMessages']}"
        )


# Run the station pass: stores startTimeEmbedding / endTimeEmbedding on the
# Station nodes, using the default batch size of 500.
write_station_embeddings_batchwise(db_driver)

In [None]:
def write_embeddings_batchwise(driver, batch_size=500):
    """Compute and store a time embedding for every Trip with a ``validFrom``.

    Streams the internal id and ``validFrom`` value of all ``Trip`` nodes
    that have one, converts each timestamp into an embedding vector and
    passes the vectors to ``_send_embedding_batch`` in groups of
    ``batch_size``. After each full batch the running number of written
    nodes is printed.

    Args:
        driver: Neo4j driver used to read the trips and to write the batches.
        batch_size: Number of trips collected before a batch is written.
    """
    with driver.session(database=database) as session:
        result = session.run("""
            MATCH (t:Trip)
            WHERE t.validFrom IS NOT NULL
            RETURN id(t) AS node_id, t.validFrom AS ts
        """)

        batch = []
        nodes_sent = 0
        for record in result:

            try:
                emb = timestamp_to_embedding(record["ts"])
                batch.append({"node_id": record["node_id"], "embedding": emb.tolist()})
            except Exception as e:
                # A failing trip is reported and skipped; the run continues.
                print(f"Skipping node {record['node_id']}: {e}")

            # Once the batch is full, write it.
            if len(batch) >= batch_size:
                _send_embedding_batch(driver, batch)
                # Count what was actually sent; a fixed increment of 500 is
                # only correct for the default batch size.
                nodes_sent += len(batch)
                print(nodes_sent)
                batch = []

        # Write the remainder after the last full batch.
        if batch:
            _send_embedding_batch(driver, batch)

def _send_embedding_batch(driver, batch):
    """Write one batch of trip time embeddings to the database.

    The rows are applied through ``apoc.periodic.iterate``, which sets
    ``startTimeEmbedding`` on the ``Trip`` node whose internal id matches
    ``node_id``.

    Args:
        driver: Neo4j driver used to open the write session.
        batch: List of dicts with the keys ``node_id`` and ``embedding``
            (a list of floats).
    """
    query = """
    CALL apoc.periodic.iterate(
      'UNWIND $batch AS row RETURN row',
      '
      MATCH (t:Trip) WHERE id(t) = row.node_id
      SET t.startTimeEmbedding = row.embedding
      ',
      {batchSize: 100, parallel: true, params: {batch: $batch}}
    )
    """
    with driver.session(database=database) as session:
        # Fetch the summary row so the call has completed (and any error has
        # been raised) before the session is closed.
        summary = session.run(query, batch=batch).single()

    # apoc.periodic.iterate reports failures in its result row instead of
    # raising; surface them rather than dropping them silently.
    if summary is not None and summary["failedOperations"]:
        print(
            f"{summary['failedOperations']} trip updates failed: "
            f"{summary['errorMessages']}"
        )



In [None]:
# Run the trip pass: stores startTimeEmbedding on the Trip nodes that have a
# validFrom value, using the default batch size of 500.
write_embeddings_batchwise(db_driver)

In [None]:
# Cypher projection that builds the in-memory graph 'externalGraph':
# Trip nodes connected to Station nodes via HAS_START / HAS_END, with all
# relationship types projected as undirected. On both endpoints the stored
# startTimeEmbedding is exposed as the node property 'externalTimeEmbedding'.
# NOTE(review): stations also carry endTimeEmbedding, which is not projected
# here — confirm that this is intended.
projection_query  = """
MATCH (source)-[r:HAS_START|HAS_END]->(target)
WHERE source:Trip AND target:Station
WITH gds.graph.project(
  'externalGraph',
  source,
  target,
  {
    sourceNodeProperties: source {
      externalTimeEmbedding: source.startTimeEmbedding
    },
    targetNodeProperties: target {
     externalTimeEmbedding: target.startTimeEmbedding
    }},
  {undirectedRelationshipTypes: ['*']}
) AS g
RETURN g.graphName AS graph, g.nodeCount AS nodes, g.relationshipCount AS rels
"""


In [None]:
with db_driver.session(database=database) as session:
    # Consume the result so the projection has finished (and any server-side
    # error, e.g. an already existing graph name, is raised here) before the
    # session is closed.
    session.run(projection_query).consume()

In [None]:
# Obtain a client-side handle for the projected graph 'externalGraph'.
G = gds.graph.get("externalGraph")

In [None]:
# Ask GDS for an estimate of the FastRP write run before executing it.
# The configuration matches the actual write call in the next cell:
# 128-dimensional embeddings, fixed random seed, and the projected
# 'externalTimeEmbedding' vectors as feature properties.
gds.fastRP.write.estimate(
    G,
    writeProperty="externalEmbedding",
    randomSeed = 42,
    embeddingDimension= 128,
    nodeSelfInfluence = 1.0,
    propertyRatio = 0.5,
    featureProperties = ['externalTimeEmbedding'],
    iterationWeights = [1.0]
)

In [None]:
#18min
gds.fastRP.write(
    G,
    writeProperty="externalEmbedding",
    randomSeed = 42,
    embeddingDimension= 128,
    nodeSelfInfluence = 1.0,
    propertyRatio = 0.5,
    featureProperties = ['externalTimeEmbedding'],
    iterationWeights = [1.0]
)

In [None]:
# Drop the projected graph 'externalGraph'; it is not used by the remaining cells.
G.drop()

In [None]:
def create_vector_index(index_name, label, property_name, vector_dimension, similarity="cosine"):
    """Create a vector index on ``label.property_name`` if it does not exist.

    Uses the module-level ``db_driver`` and ``database``.

    Args:
        index_name: Name of the index to create.
        label: Node label the index applies to.
        property_name: Node property that holds the embedding vector.
        vector_dimension: Dimension of the indexed vectors; an int, or a
            value convertible with ``int()`` such as ``'128'``.
        similarity: Value for ``vector.similarity_function``.

    Raises:
        ValueError: If ``vector_dimension`` cannot be converted to an int.
    """
    # The value is interpolated into the Cypher text, so make sure it really
    # is an integer before building the statement.
    dimension = int(vector_dimension)

    # All arguments are interpolated directly into the statement; they must
    # come from trusted code, never from user input.
    query = f"""
    CREATE VECTOR INDEX {index_name} IF NOT EXISTS
    FOR (n:{label})
    ON (n.{property_name})
    OPTIONS {{
    indexConfig: {{
        `vector.dimensions`: {dimension},
        `vector.similarity_function`: '{similarity}'
        }}
    }}
    """
    with db_driver.session(database=database) as session:
        # Consume the result so errors are raised before the session closes.
        session.run(query).consume()


# Index the 128-dimensional FastRP embeddings written to the Trip nodes.
create_vector_index('externalIndex', 'Trip', 'externalEmbedding', 128)

In [None]:
# Close the GDS client and the driver created in the configuration cell.
gds.close()
db_driver.close()