Skip to content

Scrape and Index Workflow Runs #1385

Scrape and Index Workflow Runs

Scrape and Index Workflow Runs #1385

Workflow file for this run

# Scrapes workflow runs with `go run . workflow runs` and posts the results to
# an OpenSearch `_bulk` endpoint (see the scrape-and-index job).
name: Scrape and Index Workflow Runs
on:
  # Runs at minute 30 of every hour. Scheduled runs take their matrix entries
  # from .github/scheduled.json; the setup-matrix job adds until=now and
  # since=3 hours earlier, formatted as YYYY-MM-DDTHH.
  schedule:
    - cron: '30 * * * *'
  # Manual run: the inputs below become the single matrix entry.
  workflow_dispatch:
    inputs:
      # Used as the name of the uploaded artifact.
      name:
        type: string
        required: true
      # Passed to the scraper as --repository.
      repository:
        type: string
        required: true
        default: 'cilium/cilium'
      # Passed to the scraper as --branch.
      branch:
        type: string
        required: true
        default: 'main'
      # Passed to the scraper as --until.
      until:
        type: string
        required: true
      # Passed to the scraper as --since.
      since:
        type: string
        required: true
      # Passed to the scraper as --events.
      events:
        type: string
        required: false
        default: 'push'
# Workflow-wide environment: the index name handed to the scraper via --index.
env:
  TARGET_INDEX: 'runs-oss'
# Every `run:` step in this workflow executes under bash.
defaults:
  run:
    shell: bash
jobs:
  # Builds the matrix consumed by scrape-and-index and exposes it as the
  # `matrix` job output. Manual runs yield a single entry made from the
  # dispatch inputs; scheduled runs read entries from .github/scheduled.json
  # and stamp a since/until window onto each of them.
  setup-matrix:
    runs-on: ubuntu-22.04
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - if: ${{ github.event_name == 'workflow_dispatch' }}
        name: Prepare workflow_dispatch inputs
        env:
          INPUTS_JSON: ${{ toJSON(inputs) }}
        run: |
          # Quoted so the JSON is neither word-split nor glob-expanded.
          printf '%s\n' "$INPUTS_JSON" | jq -c '{ "include": [.] }' > /tmp/matrix.json
      - if: ${{ github.event_name == 'schedule' }}
        name: Clone scheduled inputs
        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            .github/scheduled.json
      - if: ${{ github.event_name == 'schedule' }}
        name: Prepare scheduled inputs
        run: |
          # Window: from three hours ago until now, at hour resolution.
          window_end=$(date '+%Y-%m-%dT%H')
          window_start=$(date -d '-3 hours' '+%Y-%m-%dT%H')
          # Hand the values to jq as variables instead of splicing them into
          # the filter text.
          jq -c \
            --arg window_end "$window_end" \
            --arg window_start "$window_start" \
            '.include |= map(. + {"until": $window_end, "since": $window_start})' \
            ./.github/scheduled.json > /tmp/matrix.json
      - if: ${{ github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' }}
        name: Reject unsupported events
        run: |
          # No matrix can be built for any other trigger.
          echo "::error::Unsupported event: $GITHUB_EVENT_NAME"
          exit 1
      - id: set-matrix
        name: Publish matrix
        run: |
          cat /tmp/matrix.json
          # jq -c above wrote the matrix on one line, which is what the
          # key=value output format needs.
          echo "matrix=$(cat /tmp/matrix.json)" >> "$GITHUB_OUTPUT"
  # Runs the scraper once per matrix entry, splits the output into chunks,
  # posts each chunk to the OpenSearch _bulk endpoint, uploads the chunks and
  # responses as an artifact, and finally fails if any response reports errors.
  scrape-and-index:
    needs: setup-matrix
    strategy:
      # GitHub API doesn't like concurrent requests.
      # Additionally, since our token is limited in the
      # number of requests we can make, just do one
      # repository at a time. If it takes longer, that's ok
      # since it reduces the chance we run into a limit.
      max-parallel: 1
      # Scrape jobs are independent, therefore one job's failure
      # shouldn't impact another's.
      fail-fast: false
      matrix: ${{ fromJSON(needs.setup-matrix.outputs.matrix) }}
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: Echo Matrix
        env:
          # Passed through the environment: matrix values can come from
          # workflow_dispatch inputs and must not be pasted into the script.
          MATRIX_JSON: ${{ toJSON(matrix) }}
        run: |
          printf '%s\n' "$MATRIX_JSON"
      - name: Scrape Workflow Runs
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Matrix values reach the shell as data, never as script text.
          SCRAPE_REPOSITORY: ${{ matrix.repository }}
          SCRAPE_BRANCH: ${{ matrix.branch }}
          SCRAPE_UNTIL: ${{ matrix.until }}
          SCRAPE_SINCE: ${{ matrix.since }}
          SCRAPE_EVENTS: ${{ matrix.events }}
        run: |
          # Quoting keeps an empty value (e.g. events) as its own argument
          # rather than letting the flag swallow the one that follows it.
          go run . workflow runs \
            --repository "$SCRAPE_REPOSITORY" \
            --branch "$SCRAPE_BRANCH" \
            --until "$SCRAPE_UNTIL" \
            --since "$SCRAPE_SINCE" \
            --events "$SCRAPE_EVENTS" \
            --index "$TARGET_INDEX" \
            --verbose > results.json
      - name: Bail if no results were found
        id: bail
        run: |
          ls -lah results.json
          # An empty results file means there is nothing to index; the
          # remaining steps are skipped via the `bail` output.
          if ! [ -s results.json ]; then
            echo "No results, exiting early"
            echo "bail=bail" >> "$GITHUB_OUTPUT"
          fi
      - name: Create splits
        if: ${{ steps.bail.outputs.bail != 'bail' }}
        run: |
          # Cut the results into 200-line chunks, one bulk request each.
          mkdir splits
          cd splits
          split -l 200 ../results.json
          ls -lah
          cd ..
      - name: Perform bulk requests
        if: ${{ steps.bail.outputs.bail != 'bail' }}
        working-directory: ./splits
        env:
          OPENSEARCH_USER: ${{ secrets.OPENSEARCH_USER }}
          OPENSEARCH_PASS: ${{ secrets.OPENSEARCH_PASS }}
          OPENSEARCH_URL: ${{ vars.OPENSEARCH_URL }}
        run: |
          # No `set -x` / `curl --verbose` here: the verbose trace prints the
          # request headers, including the Basic Authorization header, whose
          # base64 form is not covered by secret masking. The HTTP status is
          # printed per chunk instead.
          for fn in ./*; do
            echo "Posting $fn"
            curl --request POST \
              --data-binary "@$fn" \
              --output "$fn-response.json" \
              -H "Content-Type: application/json" \
              --silent --show-error \
              --write-out 'HTTP status: %{http_code}\n' \
              --user "$OPENSEARCH_USER:$OPENSEARCH_PASS" \
              "$OPENSEARCH_URL/_bulk"
          done
      - name: Upload results
        if: ${{ steps.bail.outputs.bail != 'bail' }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.name }}
          path: "splits/**"
      - name: Fail if any items failed to index
        if: ${{ steps.bail.outputs.bail != 'bail' }}
        working-directory: ./splits
        run: |
          # Check every response on its own: with --exit-status jq derives its
          # exit code from the last output only, so piping all responses
          # through a single jq would hide errors in all but the final file.
          status=0
          for resp in ./*-response.json; do
            if ! jq --exit-status '(.errors | not) and (.error | not)' "$resp" > /dev/null; then
              echo "::error::Bulk request failed or reported errors: $resp"
              status=1
            fi
          done
          exit "$status"