## Setup 

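Run this notebook to create the required Google Cloud resources (a Cloud Storage staging bucket and an Artifact Registry repository) and to build the Dataflow Flex Template.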

In [None]:
from datetime import datetime

In [None]:
ARTIFACT_REPO = "dataflow-templates" 
REGION = "us-central1" 
PROJECT = !(gcloud config get-value core/project)
PROJECT = PROJECT[0]
STAGING_BUCKET = f"{PROJECT}-dataflow-templates"

%env REGION={REGION}
%env ARTIFACT_REPO={ARTIFACT_REPO}
%env PROJECT={PROJECT}
%env STAGING_BUCKET={STAGING_BUCKET}

Create the Cloud Storage staging bucket that will hold the template spec file.

In [None]:
!gsutil mb -l {REGION} gs://{STAGING_BUCKET}

Create the Artifact Registry Docker repository that will store the template's container image.

In [None]:
!gcloud artifacts repositories create {ARTIFACT_REPO} \
 --repository-format=docker \
 --location={REGION}

Build the Flex Template

In [None]:
# A single timestamp tags both artifacts produced by this build, so the
# template spec and the container image share the same version suffix.
TAG = datetime.now().strftime("%Y%m%d-%H%M%S")

# Container image location inside the Artifact Registry repository.
IMAGE_URI = (
    f"{REGION}-docker.pkg.dev/{PROJECT}/{ARTIFACT_REPO}/pdf-vector-search-pipeline:{TAG}"
)

# Template spec file location inside the staging bucket.
TEMPLATE_FILE = f"gs://{STAGING_BUCKET}/pdf-to-vertex-{TAG}.json"

In [None]:
# Build the Flex Template: the template spec is written to TEMPLATE_FILE and
# the container image is pushed to IMAGE_URI. The Python sources are taken
# from the current directory (`--py-path "."`), with `main.py` as the pipeline
# entry point and `requirements.txt` as its dependency list. `metadata.json`
# supplies the template metadata. All three files are referenced by relative
# path, so they are presumably expected next to this notebook (confirm that
# the kernel's working directory is the pipeline source directory).
!gcloud dataflow flex-template build {TEMPLATE_FILE} \
    --image-gcr-path {IMAGE_URI} \
    --sdk-language "PYTHON" \
    --flex-template-base-image "PYTHON3" \
    --py-path "." \
    --metadata-file "metadata.json" \
    --env "FLEX_TEMPLATE_PYTHON_PY_FILE=main.py" \
    --env "FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE=requirements.txt"

In [None]:
# Echo the Cloud Storage path of the template spec file passed to the build
# command, so it can be copied from the cell output.
print(TEMPLATE_FILE)