<a href="https://colab.research.google.com/github/leerazo/neo4j_demos/blob/main/neo4j_vertex_paysim_demo_setup_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip install --quiet --upgrade graphdatascience==1.0.0

In [27]:
!pip install --quiet google-cloud-storage

In [28]:
!pip install --quiet google.cloud.aiplatform

In [29]:
!pip install python-dotenv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
import pandas as pd
from graphdatascience import GraphDataScience

In [2]:
from dotenv import load_dotenv
import os

In [3]:
dotenv_file = "/content/20220914-vertex-demos-credentials-8cdacd95.env"
load_dotenv(dotenv_file)

DB_URL = os.getenv("NEO4J_URI")
DB_USER = os.getenv("NEO4J_USERNAME")
DB_PASS = os.getenv("NEO4J_PASSWORD")
DB_INSTANCE = os.getenv("AURA_INSTANCENAME")
DB_NAME = "neo4j"

#DB_URL = "neo4j+s://8cdacd95.databases.neo4j.io"
#DB_USER = "neo4j"
#DB_PASS = "gcVz3ay2wJ-8AnmClLK8eEZxUBuquKaWJggRknAyeC0"

print("DB_URL:", DB_URL)
print("DB_USER:", DB_USER)
print("DB_PASS:", DB_PASS)
print("DB_NAME:", DB_NAME)

DB_URL: neo4j+s://8cdacd95.databases.neo4j.io
DB_USER: neo4j
DB_PASS: gcVz3ay2wJ-8AnmClLK8eEZxUBuquKaWJggRknAyeC0
DB_NAME: neo4j


In [4]:
# If you are connecting the client to an AuraDS instance, you can get the recommended non-default configuration settings of the Python Driver applied automatically. To achieve this, set the constructor argument aura_ds=True
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS), aura_ds=True)

In [5]:
gds.set_database(DB_NAME)

In [6]:
graph_name = "client_graph"

projection_list = gds.graph.list()
if len(projection_list) > 0:
  if graph_name in projection_list["graphName"][0]:
    print("Projection \"" + graph_name + "\" already exists")
    display(projection_list)
else:
  print('Creating graph projection.')
  G, results = gds.graph.project.cypher(
    "client_graph",
    "MATCH (c:Client) RETURN id(c) as id, c.num_transactions as num_transactions, c.total_transaction_amnt as total_transaction_amnt, c.is_fraudster as is_fraudster",
    'MATCH (c:Client)-[:PERFORMED]->(t:Transaction)-[:TO]->(c2:Client) return id(c) as source, id(c2) as target, sum(t.amount) as amount, "TRANSACTED_WITH" as type ',
)

display(results)

Creating graph projection.


nodeQuery            MATCH (c:Client) RETURN id(c) as id, c.num_tra...
relationshipQuery    MATCH (c:Client)-[:PERFORMED]->(t:Transaction)...
graphName                                                 client_graph
nodeCount                                                        11270
relationshipCount                                                26035
projectMillis                                                      398
Name: 0, dtype: object

In [7]:
print("Generating embeddings using FastRP")
results = gds.fastRP.mutate(
    G,
    relationshipWeightProperty="amount",
    iterationWeights=[0.0, 1.00, 1.00, 0.80, 0.60],
    featureProperties=["num_transactions", "total_transaction_amnt"],
    propertyRatio=0.25,
    nodeSelfInfluence=0.15,
    embeddingDimension=16,
    randomSeed=1,
    mutateProperty="embedding",
)

display(results)

Generating embeddings using FastRP


nodePropertiesWritten                                                11270
mutateMillis                                                             0
nodeCount                                                            11270
preProcessingMillis                                                      0
computeMillis                                                           14
configuration            {'nodeSelfInfluence': 0.15, 'relationshipWeigh...
Name: 0, dtype: object

In [8]:
# Dump it out into a dataframe
node_properties = gds.graph.streamNodeProperties(
    G, ["embedding", "num_transactions", "total_transaction_amnt", "is_fraudster"]
)

node_properties.head()  

Unnamed: 0,nodeId,nodeProperty,propertyValue
0,0,embedding,"[0.0, -3.641206660631724e-07, 0.0, 0.0, -3.641..."
1,0,num_transactions,4
2,0,total_transaction_amnt,118919.119986
3,0,is_fraudster,1
4,3,embedding,"[0.0, 0.0, 0.0, -0.07500000298023224, 0.0, 0.0..."


In [9]:
# Pivot the dataframe to better represent our classification problem
x = node_properties.pivot(
    index="nodeId", columns="nodeProperty", values="propertyValue"
)
x = x.reset_index()
x.columns.name = None
x.head()

Unnamed: 0,nodeId,embedding,is_fraudster,num_transactions,total_transaction_amnt
0,0,"[0.0, -3.641206660631724e-07, 0.0, 0.0, -3.641...",1,4,118919.119986
1,3,"[0.0, 0.0, 0.0, -0.07500000298023224, 0.0, 0.0...",-9223372036854775808,0,0.0
2,5,"[-4.998395475297457e-09, 5.79870196304455e-09,...",1,80,7484459.618642
3,8,"[0.0, 0.0, 0.0, 0.06708204001188278, 0.0, 0.06...",-9223372036854775808,0,0.0
4,10,"[0.02352503128349781, -0.023524967953562737, 2...",1,227,37580636.156865


In [10]:
# is_fraudster will have a value of 0 or 1 if populated. 
# If the value is -9223372036854775808 then it's unlabeled, so we're going to drop it.
x = x.loc[x["is_fraudster"] != -9223372036854775808]
x.head()

Unnamed: 0,nodeId,embedding,is_fraudster,num_transactions,total_transaction_amnt
0,0,"[0.0, -3.641206660631724e-07, 0.0, 0.0, -3.641...",1,4,118919.119986
2,5,"[-4.998395475297457e-09, 5.79870196304455e-09,...",1,80,7484459.618642
4,10,"[0.02352503128349781, -0.023524967953562737, 2...",1,227,37580636.156865
6,15,"[-0.005754098296165466, -4.570129021885805e-05...",0,106,4864282.016093
7,18,"[0.0, 0.0, -0.06708204001188278, 0.0, 0.0, -0....",0,0,0.0


In [11]:
# NOTE TO SELF: THIS IS ACTUALLY A REALLY IMPORTANT CELL TO DIG DEEPER
# Note that the embedding row is an array. To make this dataset more consumable, we should flatten that out into multiple individual features: embedding_0, embedding_1, ... embedding_n.
FEATURES_FILENAME = "features.csv"

embeddings = pd.DataFrame(x["embedding"].values.tolist()).add_prefix("embedding_")
merged = x.drop(columns=["embedding"]).merge(
    embeddings, left_index=True, right_index=True
)
features_df = merged.drop(
    columns=["is_fraudster", "num_transactions", "total_transaction_amnt"]
)
train_df = merged.drop(columns=["nodeId"])

features_df.to_csv(FEATURES_FILENAME, index=False)

In [12]:
TRAINING_FILENAME = "train.csv"

pd.concat([train_df for i in range(10)]).to_csv(TRAINING_FILENAME, index=False)


In [13]:
# GCP related variables!
PROJECT_ID = "neo4jbusinessdev"
STORAGE_BUCKET = "leerazo-vertex-demo"

# You can leave these defaults
REGION = "us-central1"
STORAGE_PATH = "paysim"
EMBEDDING_DIMENSION = 16
FEATURESTORE_ID = "paysim-baked-earlier"
ENTITY_NAME = "payer" 

In [14]:
os.environ["GCLOUD_PROJECT"] = PROJECT_ID
print("Setting GCP project to:", os.environ["GCLOUD_PROJECT"])

Setting GCP project to: neo4jbusinessdev


In [15]:
# Authenticate to GCP
try:
    from google.colab import auth as google_auth

    google_auth.authenticate_user()
except:
    pass

In [16]:
from google.cloud import storage

client = storage.Client()

In [17]:
bucket_list = []
for bucket in client.list_buckets():
  bucket_list.append(bucket.name)

if STORAGE_BUCKET in bucket_list:
  print('Bucket', STORAGE_BUCKET, 'already exists.')
else:
  print('Creating new storage bucket:', STORAGE_BUCKET)
  try: 
    client.create_bucket(STORAGE_BUCKET)
  except Exception as excpt:
    print(excpt)


Bucket leerazo-vertex-demo already exists.


In [18]:
# Upload our files to that bucket
demo_bucket = client.bucket(STORAGE_BUCKET)
print("FEATURES_FILENAME:", FEATURES_FILENAME)
print("TRAINING_FILENAME:", TRAINING_FILENAME)
print("STORAGE_PATH:", STORAGE_PATH)
print("demo_bucket:", demo_bucket)
for filename in [FEATURES_FILENAME, TRAINING_FILENAME]:
    upload_path = os.path.join(STORAGE_PATH, filename)
    blob = demo_bucket.blob(upload_path)
    blob.upload_from_filename(filename)

FEATURES_FILENAME: features.csv
TRAINING_FILENAME: train.csv
STORAGE_PATH: paysim
demo_bucket: <Bucket: leerazo-vertex-demo>


In [19]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION)

dataset = aiplatform.TabularDataset.create(
    display_name="paysim-baked-earlier",
    gcs_source=os.path.join("gs://", STORAGE_BUCKET, STORAGE_PATH, TRAINING_FILENAME),
)
dataset.wait()

print(f'\tDataset: "{dataset.display_name}"')
print(f'\tname: "{dataset.resource_name}"')

Creating TabularDataset


INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset


Create TabularDataset backing LRO: projects/803648085855/locations/us-central1/datasets/9216583652064886784/operations/2296796571137736704


INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/803648085855/locations/us-central1/datasets/9216583652064886784/operations/2296796571137736704


TabularDataset created. Resource name: projects/803648085855/locations/us-central1/datasets/9216583652064886784


INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/803648085855/locations/us-central1/datasets/9216583652064886784


To use this TabularDataset in another session:


INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:


ds = aiplatform.TabularDataset('projects/803648085855/locations/us-central1/datasets/9216583652064886784')


INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/803648085855/locations/us-central1/datasets/9216583652064886784')


	Dataset: "paysim-baked-earlier"
	name: "projects/803648085855/locations/us-central1/datasets/9216583652064886784"


In [20]:
embedding_column_names = ["embedding_{}".format(i) for i in range(EMBEDDING_DIMENSION)]
other_column_names = ["num_transactions", "total_transaction_amnt"]
all_columns = other_column_names + embedding_column_names
column_specs = {column: "numeric" for column in all_columns}

job = aiplatform.AutoMLTabularTrainingJob(
    display_name="train-paysim-automl-1-baked-earlier",
    optimization_prediction_type="classification",
    column_specs=column_specs,
)

In [None]:
model_display_name = "paysim-prediction-model-baked-earlier"

model = job.run(
    dataset=dataset,
    target_column="is_fraudster",
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    model_display_name=model_display_name,
    disable_early_stopping=False,
    budget_milli_node_hours=1000,
)

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1991170752803504128?project=803648085855


INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1991170752803504128?project=803648085855


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/803648085855/locations/us-central1/trainingPipelines/1991170752803504128 current state:
PipelineState.PIPELINE_STATE_RUNNING


In [None]:
# TEMPORARY
from google.cloud import aiplatform

# GCP related variables!
PROJECT_ID = "neo4jbusinessdev"
STORAGE_BUCKET = "leerazo-vertex-demo"

# You can leave these defaults
REGION = "us-central1"
STORAGE_PATH = "paysim"
EMBEDDING_DIMENSION = 16
FEATURESTORE_ID = "paysim-baked-earlier"
ENTITY_NAME = "payer" 

#model_display_name = "paysim-prediction-model-baked-earlier"
model_display_name = "paysim-prediction-model"

os.environ["GCLOUD_PROJECT"] = PROJECT_ID
print("Setting GCP project to:", os.environ["GCLOUD_PROJECT"])

# Authenticate to GCP
try:
    from google.colab import auth as google_auth

    google_auth.authenticate_user()
except:
    pass

aiplatform.init(project=PROJECT_ID, location=REGION)

model = aiplatform.Model('/projects/my-project/locations/us-central1/models/' + model_display_name)
print(model)

NameError: ignored