-
Notifications
You must be signed in to change notification settings - Fork 0
133 lines (127 loc) · 4.65 KB
/
generate_cat_entry.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
---
# This workflow runs on a merge/commit to the main branch.
# The goal is to extract metadata for any *new* datasets/files contained in the diff,
# and add this to the catalog.
# The first steps are:
# - set up git config
# - install git annex, python, datalad, metalad
# - TODO: install whichever extensions are needed for metadata extraction (this would be
#   specific to the type of data contained in this overall repository+catalog)
# Then we find the diff and:
# - check if new datasets were added
# - check for updates to existing datasets
# Then extract metadata for new/updated datasets
# Then translate extracted metadata to catalog schema
# Then checkout catalog from gh-pages branch, and add translated metadata to catalog
# Then push updates back to gh-pages branch
name: generate_cat_entry

on:
  push:
    branches:
      - main

jobs:
  # Inspect the push's datalad diff and decide whether the build job needs to run.
  check:
    runs-on: ubuntu-latest
    outputs:
      # "true" when the diff added a new subdataset; gates the build job below
      run_job: ${{ steps.check_files.outputs.run_job }}
      # path of the newly added subdataset (only set when run_job == "true")
      subds: ${{ steps.check_files.outputs.subds }}
    steps:
      - name: Set up environment
        run: |
          git config --global user.email "test@github.land"
          git config --global user.name "GitHub Almighty"
      # - name: Install git annex
      #   shell: bash
      #   run: |
      #     bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh)
      #     sudo apt-get update -qq
      #     sudo apt-get install eatmydata
      #     sudo eatmydata apt-get install git-annex-standalone
      - name: Set up Python 3.9
        # setup-python@v1 is deprecated; v4 is the maintained release line.
        uses: actions/setup-python@v4
        with:
          # quoted: an unquoted version is parsed as a float (e.g. 3.10 -> 3.1)
          python-version: "3.9"
      - name: Install datalad
        run: |
          python -m pip install --upgrade pip
          pip install --upgrade datalad
      - name: Checkout main
        uses: actions/checkout@v3
        with:
          ref: main
          # full history is needed so datalad can diff event.before..event.after
          fetch-depth: 0
      - name: check datalad diff
        id: check_files
        run: |
          echo "========== check datalad diff =========="
          datalad diff --from ${{ github.event.before }} --to ${{ github.event.after }}
          datalad_diffs=$(datalad diff --from ${{ github.event.before }} --to ${{ github.event.after }})
          # Default to not running the build job; the "::set-output" workflow
          # command is deprecated and disabled, so write to $GITHUB_OUTPUT instead
          # (later writes of the same key override earlier ones).
          echo "run_job=false" >> "$GITHUB_OUTPUT"
          prefix="added: "
          suffix="(dataset)"
          while read -r line; do
            echo "changed item: $line"
            if [[ "$line" == *"$prefix"* && "$line" == *"$suffix"* ]]; then
              # strip "added: " prefix and "(dataset)" suffix to get the subdataset path
              subds=${line#"$prefix"}
              subds=${subds%"$suffix"}
              echo "subds=$subds" >> "$GITHUB_OUTPUT"
              echo "run_job=true" >> "$GITHUB_OUTPUT"
              break
              # This currently assumes that there was a single update in the form of
              # an added subdataset. TODO: run multiple jobs or loop in case of multiple updates
            fi
          done <<< "$datalad_diffs"

  # Extract + translate metadata for the new subdataset and push it to the catalog branch.
  build:
    needs: check
    if: needs.check.outputs.run_job == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Report target subdataset
        shell: bash
        run: |
          echo "Processing new subdataset: ${{ needs.check.outputs.subds }}"
      - name: Set up environment
        run: |
          git config --global user.email "test@github.land"
          git config --global user.name "GitHub Almighty"
      - name: Install git annex
        shell: bash
        run: |
          bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh)
          sudo apt-get update -qq
          sudo apt-get install eatmydata
          sudo eatmydata apt-get install git-annex-standalone
      - name: Set up Python 3.9
        # setup-python@v1 is deprecated; v4 is the maintained release line.
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Checkout main
        uses: actions/checkout@v3
        with:
          ref: main
          fetch-depth: 0
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --upgrade -r requirements.txt
      - name: Checkout datalad-catalog
        uses: actions/checkout@v3
        with:
          repository: datalad/datalad-catalog
          path: datalad-catalog
      - name: Install datalad-catalog
        run: |
          pip install -e datalad-catalog/
      - name: Checkout catalog branch
        uses: actions/checkout@v3
        with:
          ref: catalog
          path: catalogbranch
      - name: Run workflow for extraction+translation+catalog-update
        run: |
          # Get subdataset without data
          datalad get ${{ needs.check.outputs.subds }} --no-data
          # Run workflow-update on new subdataset
          datalad catalog workflow-update -d . -s ${{ needs.check.outputs.subds }} -c catalogbranch
          # Commit and push to catalog branch
          cd catalogbranch
          git add --all
          git commit -m "adds new subdataset to catalog"
          git push origin catalog