# .github/workflows/ai-api-calls.yml

# This GitHub Actions workflow tests an AIGC pipeline. It covers the following steps:
# * Call the OpenAI API to generate a speaking script from a prompt request in a JSON input
# * Configure Workload Identity Federation to let GitHub Actions access GCP via a temporary OAuth token
# * Generate speaking audio via the Google Text-to-Speech API
# * Call the Studio.d-id API to generate a talk (mp3) speaking the above script
# * Get the URL of the saved mp3 file
# * Call the Video AI API to let the above person speak the above script

name: AIGC Pipeline

# Events that start this workflow
on:
  # Pushes to, and pull requests targeting, the main branch only
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

  # Manual runs started from the Actions tab
  workflow_dispatch:

# Workflow-level environment variables, visible to every step's shell
env:
  # Directory, relative to the repository root, that holds the request JSON files
  INPUT_PATH: 'aigc-pipeline'
  # Request body posted to the OpenAI chat completions endpoint
  PROMPT_GPT_JSON: 'chatgpt_pmt2.json'
  # File the generated speaking script is written to
  SPEAKING_SCRIPT_FILE: 'chatgpt-speak-script.txt'
  # Cloud Storage destination the script and audio files are copied to
  OUTPUT_GCS: 'gs://chatgpt-output'
  # Request body posted to the Google Text-to-Speech endpoint
  PROMPT_VOICE_JSON: 'gcp-voice.json'
  # Not referenced by any active step in this file
  SPEAKING_AUDIO_RAW_FILE: 'gcp-voice-raw.txt'
  # File the base64-decoded audio is written to
  SPEAKING_AUDIO_MP3_FILE: 'gcp-voice-output.mp3'
  # Request bodies used only by the commented-out D-ID steps
  PROMPT_D_ID_VOICE_JSON: 'd-id-voice.json'
  PROMPT_D_ID_CLIP_JSON: 'd-id-clip.json'

jobs:
  # Single job: generate the script, synthesize the audio, upload both to Cloud Storage
  chatgpt:
    runs-on: ubuntu-latest

    # "id-token: write" is added so the job can authenticate to Google Cloud
    # through Workload Identity Federation; repository contents stay read-only.
    permissions:
      contents: read
      id-token: write

    steps:
      # Fetch the repository so the request JSON files are available on the runner
      - name: Checkout repository
        uses: actions/checkout@v3

      # Print the prompt text that will be sent to ChatGPT
      - name: Show Prompt
        run: |
          # The prompt file lives under INPUT_PATH (the same path the API call posts).
          # Reading it directly with jq also makes a missing file fail the step
          # instead of silently printing nothing.
          jq '.messages[0].content' "${INPUT_PATH}/${PROMPT_GPT_JSON}"

      # Call ChatGPT API to generate a speaking script from a prompt.
      # Writes the raw response to resp-gpt.json and the extracted script to
      # $SPEAKING_SCRIPT_FILE; fails the job when no script text came back.
      - name: Call chatgpt API
        id: chatgpt
        env:
          # Passed through the environment so the key is not templated into the script text
          OPENAI_API_KEY: "${{ secrets.OPENAI_API_KEY }}"
        run: |
          curl --location --request POST 'https://api.openai.com/v1/chat/completions' \
            --header "Authorization: Bearer ${OPENAI_API_KEY}" \
            --header 'Content-Type: application/json' \
            -d "@${INPUT_PATH}/${PROMPT_GPT_JSON}" \
            --fail --silent --show-error --max-time 90 > resp-gpt.json

          # "// empty" makes a missing or null reply produce no output at all, so the
          # size check below catches it instead of saving the literal text "null".
          jq -r '.choices[0].message.content // empty' resp-gpt.json > "$SPEAKING_SCRIPT_FILE"

          if (( $(stat -c%s "$SPEAKING_SCRIPT_FILE") < 2 )); then
            echo "No Response from ChatGPT, please re-run the job!"
            exit 1
          fi

      # Insert the generated speaking script into the Text-to-Speech request body
      - name: Update gcp voice input json
        run: |
          voice_request="${INPUT_PATH}/${PROMPT_VOICE_JSON}"

          # Set .input.text to the script; write to a temp file first because jq
          # cannot edit its input file in place.
          jq --arg newkey "$(cat "$SPEAKING_SCRIPT_FILE")" '.input.text = $newkey' "$voice_request" > tmp-file.json
          mv tmp-file.json "$voice_request"

          echo 'Done json update'
          jq . "$voice_request"

      # Authenticate through Workload Identity Federation and generate an access token.
      - name: 'Authenticate to Google Cloud'
        id: 'auth'
        uses: 'google-github-actions/auth@v1'
        with:
          workload_identity_provider: "${{ secrets.GH_GCP_WORKLOAD_IDENTITY_PROVIDER }}"
          service_account: "${{ secrets.GH_GCP_SERVICE_ACCOUNT }}"
          # Ask the action to produce an access token
          token_format: 'access_token'
          # Optional; the default is '3600s' (1 hour)
          access_token_lifetime: '300s'

      # Generate audio. Guide: https://cloud.google.com/text-to-speech/docs/create-audio-text-command-line
      # For demo only: this can process around 1-2 minutes of audio. To generate long audio from text, see
      # https://cloud.google.com/text-to-speech/docs/create-audio-text-command-line-long-audio-synthesis
      # Writes the raw response to resp-full-gcp-audio.json and the decoded audio to
      # $SPEAKING_AUDIO_MP3_FILE; fails the job when no usable audio came back.
      - name: Call GOOGLE text-to-speech API
        id: text_to_speech
        env:
          # Access token generated by the 'auth' step (token_format: access_token).
          # Using it directly means this step does not depend on gcloud being
          # available before the 'Set up Cloud SDK' step has run.
          GCP_ACCESS_TOKEN: "${{ steps.auth.outputs.access_token }}"
        run: |
          curl -X POST \
            -H "Authorization: Bearer ${GCP_ACCESS_TOKEN}" \
            -H "Content-Type: application/json; charset=utf-8" \
            -d "@${INPUT_PATH}/${PROMPT_VOICE_JSON}" \
            "https://texttospeech.googleapis.com/v1/text:synthesize" \
            --fail --silent --show-error --max-time 90 > resp-full-gcp-audio.json

          # The API returns the audio base64-encoded in .audioContent
          jq -r '.audioContent' resp-full-gcp-audio.json | base64 -d > "$SPEAKING_AUDIO_MP3_FILE"

          # Check file size of the audio output
          if (( $(stat -c%s "$SPEAKING_AUDIO_MP3_FILE") < 5 )); then
            echo "No audio from Google Text-to-Speech, please re-run the job!"
            exit 1
          fi

      # Install the Cloud SDK (gcloud, gsutil); `setup-gcloud` automatically
      # picks up authentication from the `auth` step.
      - uses: 'google-github-actions/setup-gcloud@v1'
        name: 'Set up Cloud SDK'

      # Copy the speaking script and the audio file to the Cloud Storage destination
      - name: Upload file
        run: |
          for artifact in "$SPEAKING_SCRIPT_FILE" "$SPEAKING_AUDIO_MP3_FILE"; do
            gsutil cp "$artifact" "$OUTPUT_GCS"
          done
      # gcloud/gsutil commands here run authenticated as the impersonated service account.
      # Print the uploaded script back from the bucket and show where the audio file is.
      - name: 'Check File from the Storage'
        id: 'gcloud'
        run: |-
          gsutil cat "${OUTPUT_GCS}/${SPEAKING_SCRIPT_FILE}"
          echo "audio file: ${OUTPUT_GCS}/${SPEAKING_AUDIO_MP3_FILE}"


      # Call Image AI API to generate a picture of a person from a prompt
      # Unfortunately, the official statement from Midjourney still is that they currently have no plans for an API in the near future.
      
      # Call Studio.d-id API to generate a talk speaking the above script
      # I paid the Lite plan on https://studio.d-id.com which can generate audio and animation, but not allowed to generate video clips, which is on Premium plan

      # # Tested, uncomment when got new subscription
      # - name: Update D-ID voice input json
      #   run: |
      #     # cat ${INPUT_PATH}/${PROMPT_D_ID_VOICE_JSON} | jq '.input.text'
      #     export SPEAKING_SCRIPT=$(cat $SPEAKING_SCRIPT_FILE)
      #     jq --arg newkey "${SPEAKING_SCRIPT}" '.script.input = $newkey' ${INPUT_PATH}/${PROMPT_D_ID_VOICE_JSON} > tmp-file.json
      #     mv tmp-file.json ${INPUT_PATH}/${PROMPT_D_ID_VOICE_JSON}

      #     echo 'Done json update'
      #     cat ${INPUT_PATH}/${PROMPT_D_ID_VOICE_JSON} | jq .

      # # Tested, uncomment when got new subscription
      # - name: Generate D_ID voice talk 
      #   run: |
      #     curl --request POST \
      #       --url https://api.d-id.com/talks \
      #       --header 'accept: application/json' \
      #       --header "authorization: Bearer ${{ secrets.D_ID_API_KEY }}" \
      #       --header 'content-type: application/json' \
      #       --data @${INPUT_PATH}/${PROMPT_D_ID_VOICE_JSON} > tmp-d-id-resp.json

      #     D_ID_TALK_STATUS=$(cat tmp-d-id-resp.json | jq -r '.status')
      #     if [[ "$D_ID_TALK_STATUS" != "created" ]]; then
      #       echo "API status error from D_ID Talk Task: $D_ID_TALK_STATUS"
      #       cat tmp-d-id-resp.json
      #       exit 1
      #     else
      #       D_ID_TALK_ID=$(cat tmp-d-id-resp.json | jq -r '.id')
      #       echo "Talk created, task id: $D_ID_TALK_ID"
      #     fi

      # # Tested, uncomment when got new subscription
      # - name: List D_ID voice talk 
      #   run: |
      #     curl --request GET \
      #     --url https://api.d-id.com/talks \
      #     --header 'accept: application/json' \
      #     --header "authorization: Bearer ${{ secrets.D_ID_API_KEY }}" > tmp-d-id-resp.json
      #     cat tmp-d-id-resp.json | jq '.talks[].audio_url'

      # # Tested, uncomment when got new subscription
      # - name: Update D-ID video input json
      #   run: |
      #     # cat ${INPUT_PATH}/${PROMPT_D_ID_CLIP_JSON} | jq '.input.text'
      #     export SPEAKING_SCRIPT=$(cat $SPEAKING_SCRIPT_FILE)
      #     jq --arg newkey "${SPEAKING_SCRIPT}" '.script.input = $newkey' ${INPUT_PATH}/${PROMPT_D_ID_CLIP_JSON} > tmp-file.json
      #     mv tmp-file.json ${INPUT_PATH}/${PROMPT_D_ID_CLIP_JSON}

      #     echo 'Done json update'
      #     cat ${INPUT_PATH}/${PROMPT_D_ID_CLIP_JSON} | jq .

      # # Tested via API document console, uncomment when got new subscription
      # # Call Video AI API to let the above person speak out that script from a prompt
      # I paid the Lite plan on https://studio.d-id.com which can generate audio and animation, but not allowed to generate video clips, which is on Premium plan
      # - name: Generate video
      #   run: |
      #     curl --request POST \
      #       --url https://api.d-id.com/clips \
      #       --header 'accept: application/json' \
      #       --header "authorization: Bearer ${{ secrets.D_ID_API_KEY }}" \
      #       --header 'content-type: application/json' \
      #       --data @${INPUT_PATH}/${PROMPT_D_ID_CLIP_JSON}
     
      # Print out the link and password for user convenience
      # Need to verify the key path of '.clips[].clip_url'
      # - name: Print link of the video clips
      #   run: |
      #     curl --request GET \
      #       --url https://api.d-id.com/clips \
      #       --header 'accept: application/json' \
      #       --header "authorization: Bearer ${{ secrets.D_ID_API_KEY }}" > tmp-d-id-resp.json
      #     cat tmp-d-id-resp.json | jq '.clips[].clip_url'