In [None]:
# Run the repository's setup.sh in a shell before anything else is imported.
# NOTE(review): presumably this installs/prepares the notebook's dependencies — confirm in setup.sh.
!bash setup.sh

import sagemaker
from sagemaker_graph_fraud_detection import config, container_build

# Role taken from the solution config; passed as `role=` to both the processing
# job and the training estimator further down.
role = config.role
# SageMaker session created with default arguments; handed to the MXNet estimator
# via `sagemaker_session=`.
sess = sagemaker.Session()

In [None]:
raw_data_location = 's3://sagemaker-solutions-us-west-2/German-risk/data'

session_prefix = 'dgl-german-risk'
input_data = 's3://{}/{}/{}'.format(config.solution_bucket, session_prefix, config.s3_data_prefix)

!aws s3 cp --recursive $raw_data_location $input_data

# Set S3 locations to store processed data for training and post-training results and artifacts respectively
train_data = 's3://{}/{}/{}'.format(config.solution_bucket, session_prefix, config.s3_processing_output)
train_output = 's3://{}/{}/{}'.format(config.solution_bucket, session_prefix, config.s3_train_output)

In [None]:
# Print the preprocessing container's Dockerfile with syntax highlighting
# (pygmentize is the Pygments command-line tool).
!pygmentize data-preprocessing/container/Dockerfile

In [None]:
# Copy the container settings into plain notebook variables: the shell line below
# interpolates them by name ($ecr_repository, $region, $account_id).
region = config.region_name
account_id = config.aws_account
ecr_repository = config.ecr_repository

# Build the preprocessing image. With the "local" setting the build_and_push.sh script
# is run from the data-preprocessing directory; otherwise the build is delegated to
# container_build.build with the configured project name.
# NOTE(review): the exit status of the shell command is not checked, so a failed local
# build does not stop the notebook here — it would only surface when the image is used.
if config.container_build_project == "local":
    !cd  data-preprocessing && bash container/build_and_push.sh $ecr_repository $region $account_id
else:
    container_build.build(config.container_build_project)
# Reference to the image's `latest` tag in ECR URI form; used as the processing job's image.
ecr_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account_id, region, ecr_repository)

In [None]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# Processing job definition: run a script with python3 inside the custom image,
# on a single ml.m4.xlarge instance.
script_processor = ScriptProcessor(
    command=['python3'],
    image_uri=ecr_repository_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
)

# Raw data in S3 -> input directory inside the container.
processing_inputs = [
    ProcessingInput(source=input_data, destination='/opt/ml/processing/input'),
]
# Output directory inside the container -> processed-data location in S3.
processing_outputs = [
    ProcessingOutput(destination=train_data, source='/opt/ml/processing/output'),
]
# Command-line arguments handed to the preprocessing script.
preprocessing_arguments = [
    '--transactions', 'german_transaction.csv',
    '--identity', 'german_identity.csv',
    '--cat-cols', 'Purpose',
]

script_processor.run(
    code='data-preprocessing/graph_data_preprocessor_german_risk.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=preprocessing_arguments,
)

In [None]:
from os import path
from sagemaker.s3 import S3Downloader
# List the objects found under the processed-data location and show them one per line.
processed_files = S3Downloader.list(train_data)
print("===== Processed Files =====", '\n'.join(processed_files), sep='\n')

# optionally download processed data
# S3Downloader.download(train_data, train_data.split("/")[-1])


In [None]:
# Base names of the processed files whose S3 URI contains "relation" (the edge lists),
# joined into one comma-separated string.
edge_files = [uri.split("/")[-1] for uri in processed_files if "relation" in uri]
edges = ",".join(edge_files)

# Hyperparameters for the training job. Note that 'edges' is the literal pattern
# 'relation*'; the `edges` string built above is only used for the printout below.
params = {
    'nodes': 'features.csv',
    'edges': 'relation*',
    'labels': 'tags.csv',
    'model': 'rgcn',
    'num-gpus': 1,
    'batch-size': 10000,
    'embedding-size': 64,
    'n-neighbors': 1000,
    'n-layers': 2,
    'n-epochs': 10,
    'optimizer': 'adam',
    'lr': 1e-2,
}

# One edge-list file name per line.
edgelist_listing = '\n'.join(edges.split(","))
print("Graph will be constructed using the following edgelists:\n{}".format(edgelist_listing))

In [None]:
from sagemaker.mxnet import MXNet
from time import strftime, gmtime

# MXNet training job definition: one ml.p3.2xlarge instance, MXNet 1.6.0 on Python 3.
# Model output and the uploaded training code both go to `train_output`.
# NOTE(review): `train_instance_count` / `train_instance_type` are the SageMaker Python
# SDK v1 parameter names (v2 renamed them to `instance_count` / `instance_type`) —
# confirm which SDK version this notebook is pinned to before changing them.
# NOTE(review): `source_dir` points at 'sagemaker_graph_german_risk/...', while the package
# imported at the top of the notebook is `sagemaker_graph_fraud_detection` — confirm the
# directory exists under that name.
estimator = MXNet(
    entry_point='train_dgl_mxnet_entry_point.py',
    source_dir='sagemaker_graph_german_risk/dgl_german_risk',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    framework_version="1.6.0",
    py_version='py3',
    hyperparameters=params,
    output_path=train_output,
    code_location=train_output,
    sagemaker_session=sess,
)

# Job name = solution prefix + current UTC timestamp.
launch_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
training_job_name = "{}-{}".format(config.solution_prefix, launch_stamp)

# The processed data is supplied to the job as the 'train' channel.
training_channels = {'train': train_data}
estimator.fit(training_channels, job_name=training_job_name)